From e98e5ef446ef919c5b1a4b146a1e6d4e32381e11 Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Tue, 30 May 2023 11:51:00 -0700
Subject: [PATCH 01/40] 8309130: x86_64 AVX512 intrinsics for Arrays.sort
 methods (int, long, float and double arrays)

---
 make/modules/java.base/Lib.gmk                |  21 +
 src/hotspot/cpu/x86/stubGenerator_x86_64.cpp  |  26 +
 src/hotspot/share/classfile/vmIntrinsics.hpp  |  10 +
 src/hotspot/share/opto/c2compiler.cpp         |   4 +
 src/hotspot/share/opto/library_call.cpp       |  59 ++
 src/hotspot/share/opto/library_call.hpp       |   2 +-
 src/hotspot/share/opto/runtime.cpp            |  19 +
 src/hotspot/share/opto/runtime.hpp            |   1 +
 src/hotspot/share/runtime/stubRoutines.cpp    |  17 +
 src/hotspot/share/runtime/stubRoutines.hpp    |   5 +
 src/hotspot/share/runtime/vmStructs.cpp       |   4 +
 .../libavx512_x86_64/avx512-32bit-qsort.hpp   | 601 +++++++++++++
 .../libavx512_x86_64/avx512-64bit-common.h    | 588 ++++++++++++
 .../libavx512_x86_64/avx512-64bit-qsort.hpp   | 834 ++++++++++++++++++
 .../libavx512_x86_64/avx512-common-qsort.h    | 521 +++++++++++
 .../libavx512_x86_64/avxsort_linux_x86.cpp    |  54 ++
 .../share/classes/java/util/Arrays.java       |  36 +-
 .../openjdk/bench/java/util/ArraysSort.java   | 114 +++
 18 files changed, 2907 insertions(+), 9 deletions(-)
 create mode 100644 src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp
 create mode 100644 src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h
 create mode 100644 src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp
 create mode 100644 src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h
 create mode 100644 src/java.base/linux/native/libavx512_x86_64/avxsort_linux_x86.cpp
 create mode 100644 test/micro/org/openjdk/bench/java/util/ArraysSort.java

diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk
index d6ca293291470..5ec5d03d59c07 100644
--- a/make/modules/java.base/Lib.gmk
+++ b/make/modules/java.base/Lib.gmk
@@ -230,3 +230,24 @@ ifeq ($(ENABLE_FALLBACK_LINKER), true)
 
   TARGETS += $(BUILD_LIBFALLBACKLINKER)
 endif
+
+################################################################################
+# libavx512_x86_64: built only for linux/x86_64 targets when INCLUDE_COMPILER2 is true.
+ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2), true+true+true)
+  $(eval $(call SetupJdkLibrary, BUILD_LIBAVX512_X86_64, \
+      NAME := avx512_x86_64, \
+      OPTIMIZATION := HIGH, \
+      CFLAGS := $(CFLAGS_JDKLIB) -mavx512f -mavx512dq, \
+      CXXFLAGS := $(CXXFLAGS_JDKLIB) -mavx512f -mavx512dq, \
+      LDFLAGS := $(LDFLAGS_JDKLIB) \
+          $(call SET_SHARED_LIBRARY_ORIGIN), \
+      LDFLAGS_linux := -Wl$(COMMA)--no-as-needed, \
+      LDFLAGS_windows := -defaultlib:msvcrt, \
+      LIBS := $(LIBCXX), \
+      LIBS_linux := -lc -lm -ldl -lstdc++, \
+  ))
+
+  TARGETS += $(BUILD_LIBAVX512_X86_64)
+endif
+
+################################################################################
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
index 6cd1765151492..f1a8d4928488b 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -4126,6 +4126,32 @@ void StubGenerator::generate_compiler_stubs() {
       = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
   }
 
+  // Get avx512 sort stub routine addresses. The library is located and loaded only when
+  // UseAVX > 2 and the CPU reports AVX512DQ, so other configurations skip the load entirely.
+  void *libavx512_x86_64 = nullptr;
+  char ebuf_avx512[1024];
+  char dll_name_avx512[JVM_MAXPATHLEN];
+  if (UseAVX > 2 && VM_Version::supports_avx512dq() &&
+      os::dll_locate_lib(dll_name_avx512, sizeof(dll_name_avx512), Arguments::get_dll_dir(), "avx512_x86_64")) {
+    libavx512_x86_64 = os::dll_load(dll_name_avx512, ebuf_avx512, sizeof ebuf_avx512);
+  }
+  if (libavx512_x86_64 != nullptr) {
+    log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "avx512_x86_64" JNI_LIB_SUFFIX, p2i(libavx512_x86_64));
+
+    // ebuf_avx512 is reused below as scratch space for the symbol name given to dll_lookup.
+    snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_int");
+    StubRoutines::_arraysort_int = (address)os::dll_lookup(libavx512_x86_64, ebuf_avx512);
+
+    snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_long");
+    StubRoutines::_arraysort_long = (address)os::dll_lookup(libavx512_x86_64, ebuf_avx512);
+
+    snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_float");
+    StubRoutines::_arraysort_float = (address)os::dll_lookup(libavx512_x86_64, ebuf_avx512);
+
+    snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_double");
+    StubRoutines::_arraysort_double = (address)os::dll_lookup(libavx512_x86_64, ebuf_avx512);
+  }
+
   // Get svml stub routine addresses
   void *libjsvml = nullptr;
   char ebuf[1024];
diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp
index 86d5cc9ce5f8c..de02d4bad0092 100644
--- a/src/hotspot/share/classfile/vmIntrinsics.hpp
+++ b/src/hotspot/share/classfile/vmIntrinsics.hpp
@@ -341,6 +341,16 @@ class methodHandle;
    do_name(     copyOf_name,                                     "copyOf")                                              \
    do_signature(copyOf_signature,             "([Ljava/lang/Object;ILjava/lang/Class;)[Ljava/lang/Object;")             \
                                                                                                                         \
+  do_intrinsic(_arraySortI,                 java_util_Arrays,     arraySort_name, arraySortI_signature,          F_S)   \
+   do_name(     arraySort_name,                                   "arraySort")                                          \
+   do_signature(arraySortI_signature,                             "([III)V")                                            \
+  do_intrinsic(_arraySortL,                 java_util_Arrays,     arraySort_name, arraySortL_signature,          F_S)   \
+   do_signature(arraySortL_signature,                             "([JII)V")                                            \
+  do_intrinsic(_arraySortF,                 java_util_Arrays,     arraySort_name, arraySortF_signature,          F_S)   \
+   do_signature(arraySortF_signature,                             "([FII)V")                                            \
+  do_intrinsic(_arraySortD,                 java_util_Arrays,     arraySort_name, arraySortD_signature,          F_S)   \
+   do_signature(arraySortD_signature,                             "([DII)V")                                            \
+                                                                                                                        \
   do_intrinsic(_copyOfRange,              java_util_Arrays,       copyOfRange_name, copyOfRange_signature,       F_S)   \
    do_name(     copyOfRange_name,                                "copyOfRange")                                         \
    do_signature(copyOfRange_signature,        "([Ljava/lang/Object;IILjava/lang/Class;)[Ljava/lang/Object;")            \
diff --git a/src/hotspot/share/opto/c2compiler.cpp b/src/hotspot/share/opto/c2compiler.cpp
index e26c992d55827..c904c49d22046 100644
--- a/src/hotspot/share/opto/c2compiler.cpp
+++ b/src/hotspot/share/opto/c2compiler.cpp
@@ -575,6 +575,10 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method) {
   case vmIntrinsics::_min_strict:
   case vmIntrinsics::_max_strict:
   case vmIntrinsics::_arraycopy:
+  case vmIntrinsics::_arraySortI:
+  case vmIntrinsics::_arraySortL:
+  case vmIntrinsics::_arraySortF:
+  case vmIntrinsics::_arraySortD:
   case vmIntrinsics::_indexOfL:
   case vmIntrinsics::_indexOfU:
   case vmIntrinsics::_indexOfUL:
diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp
index f2e095a4d1740..21eb6b4483064 100644
--- a/src/hotspot/share/opto/library_call.cpp
+++ b/src/hotspot/share/opto/library_call.cpp
@@ -292,6 +292,11 @@ bool LibraryCallKit::try_to_inline(int predicate) {
 
   case vmIntrinsics::_arraycopy:                return inline_arraycopy();
 
+  case vmIntrinsics::_arraySortI:
+  case vmIntrinsics::_arraySortL:
+  case vmIntrinsics::_arraySortF:
+  case vmIntrinsics::_arraySortD:               return inline_arraysort(intrinsic_id());
+
   case vmIntrinsics::_compareToL:               return inline_string_compareTo(StrIntrinsicNode::LL);
   case vmIntrinsics::_compareToU:               return inline_string_compareTo(StrIntrinsicNode::UU);
   case vmIntrinsics::_compareToLU:              return inline_string_compareTo(StrIntrinsicNode::LU);
@@ -5192,6 +5197,60 @@ void LibraryCallKit::create_new_uncommon_trap(CallStaticJavaNode* uncommon_trap_
   uncommon_trap_call->set_req(0, top()); // not used anymore, kill it
 }
 
+//------------------------------inline_arraysort-----------------------
+// Intrinsic for java.util.Arrays.arraySort(int[]/long[]/float[]/double[], int, int):
+// replaces the call with a leaf call to the native sort stub for the element type.
+// Returns false (no intrinsic emitted) when the id is unexpected or no stub is available.
+bool LibraryCallKit::inline_arraysort(vmIntrinsics::ID id) {
+  const char* stubName = "arraysort_stub";
+  BasicType bt = T_ILLEGAL;
+
+  switch(id) {
+    case vmIntrinsics::_arraySortI:
+      bt = T_INT;
+      break;
+    case vmIntrinsics::_arraySortL:
+      bt = T_LONG;
+      break;
+    case vmIntrinsics::_arraySortF:
+      bt = T_FLOAT;
+      break;
+    case vmIntrinsics::_arraySortD:
+      bt = T_DOUBLE;
+      break;
+    default:
+      return false;  // not an arraySort id: never pass an unset element type to the stub lookup
+  }
+
+  address stubAddr = StubRoutines::select_arraysort_function(bt);
+  if (stubAddr == nullptr) return false;
+
+  Node* array           = argument(0);
+  Node* fromIndex       = argument(1);
+  Node* toIndex         = argument(2);
+
+  array = must_be_not_null(array, true);
+
+  const TypeAryPtr* array_type = array->Value(&_gvn)->isa_aryptr();
+  assert(array_type != nullptr &&  array_type->elem() != Type::BOTTOM, "args are strange");
+
+  // No bounds checks on fromIndex/toIndex are emitted here.
+  // NOTE(review): presumably the Java caller validates the range before calling - confirm.
+  Node* array_fromIndex  = array;
+  if (fromIndex != nullptr || toIndex != nullptr) {
+    assert(fromIndex != nullptr && toIndex != nullptr, "");
+    array_fromIndex = array_element_address(array, fromIndex, bt);  // address of array[fromIndex]
+  }
+
+  // Call the stub.
+  make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_sort_Type(),
+                    stubAddr, stubName, TypePtr::BOTTOM,
+                    array_fromIndex, fromIndex, toIndex);
+
+  return true;
+}
+
+
 //------------------------------inline_arraycopy-----------------------
 // public static native void java.lang.System.arraycopy(Object src,  int  srcPos,
 //                                                      Object dest, int destPos,
diff --git a/src/hotspot/share/opto/library_call.hpp b/src/hotspot/share/opto/library_call.hpp
index 46dd51bf654a9..52725e87080f1 100644
--- a/src/hotspot/share/opto/library_call.hpp
+++ b/src/hotspot/share/opto/library_call.hpp
@@ -279,7 +279,7 @@ class LibraryCallKit : public GraphKit {
   JVMState* arraycopy_restore_alloc_state(AllocateArrayNode* alloc, int& saved_reexecute_sp);
   void arraycopy_move_allocation_here(AllocateArrayNode* alloc, Node* dest, JVMState* saved_jvms_before_guards, int saved_reexecute_sp,
                                       uint new_idx);
-
+  bool inline_arraysort(vmIntrinsics::ID id);
   typedef enum { LS_get_add, LS_get_set, LS_cmp_swap, LS_cmp_swap_weak, LS_cmp_exchange } LoadStoreKind;
   bool inline_unsafe_load_store(BasicType type,  LoadStoreKind kind, AccessKind access_kind);
   bool inline_unsafe_fence(vmIntrinsics::ID id);
diff --git a/src/hotspot/share/opto/runtime.cpp b/src/hotspot/share/opto/runtime.cpp
index 6cc044962c2f8..cd556c2d85cd9 100644
--- a/src/hotspot/share/opto/runtime.cpp
+++ b/src/hotspot/share/opto/runtime.cpp
@@ -857,6 +857,25 @@ const TypeFunc* OptoRuntime::array_fill_Type() {
   return TypeFunc::make(domain, range);
 }
 
+// Call signature shared by the array sort stubs: (element address, int fromIndex, int toIndex) -> void.
+const TypeFunc* OptoRuntime::array_sort_Type() {
+  // Argument tuple (domain).
+  const int nargs = 3;
+  const Type** domain_fields = TypeTuple::fields(nargs);
+  int idx = TypeFunc::Parms;
+  domain_fields[idx++] = TypePtr::NOTNULL;  // array(fromIndex)
+  domain_fields[idx++] = TypeInt::INT;      // fromIndex
+  domain_fields[idx++] = TypeInt::INT;      // toIndex
+  assert(idx == TypeFunc::Parms + nargs, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + nargs, domain_fields);
+
+  // Result tuple (range): nothing is returned.
+  const Type** range_fields = TypeTuple::fields(1);
+  range_fields[TypeFunc::Parms + 0] = nullptr;  // void
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, range_fields);
+  return TypeFunc::make(domain, range);
+}
+
 // for aescrypt encrypt/decrypt operations, just three pointers returning void (length is constant)
 const TypeFunc* OptoRuntime::aescrypt_block_Type() {
   // create input type (domain)
diff --git a/src/hotspot/share/opto/runtime.hpp b/src/hotspot/share/opto/runtime.hpp
index cd13c14148d71..e4d5f749d3efa 100644
--- a/src/hotspot/share/opto/runtime.hpp
+++ b/src/hotspot/share/opto/runtime.hpp
@@ -268,6 +268,7 @@ class OptoRuntime : public AllStatic {
 
   static const TypeFunc* array_fill_Type();
 
+  static const TypeFunc* array_sort_Type();
   static const TypeFunc* aescrypt_block_Type();
   static const TypeFunc* cipherBlockChaining_aescrypt_Type();
   static const TypeFunc* electronicCodeBook_aescrypt_Type();
diff --git a/src/hotspot/share/runtime/stubRoutines.cpp b/src/hotspot/share/runtime/stubRoutines.cpp
index 7a6974088ba43..e5b39646f52f0 100644
--- a/src/hotspot/share/runtime/stubRoutines.cpp
+++ b/src/hotspot/share/runtime/stubRoutines.cpp
@@ -175,6 +175,11 @@ address StubRoutines::_hf2f = nullptr;
 address StubRoutines::_vector_f_math[VectorSupport::NUM_VEC_SIZES][VectorSupport::NUM_SVML_OP] = {{nullptr}, {nullptr}};
 address StubRoutines::_vector_d_math[VectorSupport::NUM_VEC_SIZES][VectorSupport::NUM_SVML_OP] = {{nullptr}, {nullptr}};
 
+address StubRoutines::_arraysort_int        = nullptr;  // sort stubs: stay nullptr unless the
+address StubRoutines::_arraysort_long       = nullptr;  // x86_64 stub generator resolves them
+address StubRoutines::_arraysort_float      = nullptr;  // from the avx512_x86_64 library
+address StubRoutines::_arraysort_double     = nullptr;
+
 address StubRoutines::_cont_thaw          = nullptr;
 address StubRoutines::_cont_returnBarrier = nullptr;
 address StubRoutines::_cont_returnBarrierExc = nullptr;
@@ -647,3 +652,15 @@ UnsafeCopyMemoryMark::~UnsafeCopyMemoryMark() {
     }
   }
 }
+
+// Sort stub recorded for element type t (may be nullptr); only int/long/float/double are valid.
+address StubRoutines::select_arraysort_function(BasicType t) {
+  switch (t) {
+    case T_DOUBLE: return _arraysort_double;
+    case T_FLOAT:  return _arraysort_float;
+    case T_LONG:   return _arraysort_long;
+    case T_INT:    return _arraysort_int;
+    default:
+      ShouldNotReachHere(); return nullptr;
+  }
+}
diff --git a/src/hotspot/share/runtime/stubRoutines.hpp b/src/hotspot/share/runtime/stubRoutines.hpp
index 5ce9176f08a2f..0e54f43e93646 100644
--- a/src/hotspot/share/runtime/stubRoutines.hpp
+++ b/src/hotspot/share/runtime/stubRoutines.hpp
@@ -153,6 +153,10 @@ class StubRoutines: AllStatic {
   static BufferBlob* _compiler_stubs_code;                 // code buffer for C2 intrinsics
   static BufferBlob* _final_stubs_code;                    // code buffer for all other routines
 
+  static address _arraysort_int;     // native array sort routines, one per primitive element
+  static address _arraysort_long;    // type; read through select_arraysort_function(BasicType)
+  static address _arraysort_float;
+  static address _arraysort_double;
   // Leaf routines which implement arraycopy and their addresses
   // arraycopy operands aligned on element type boundary
   static address _jbyte_arraycopy;
@@ -372,6 +376,7 @@ class StubRoutines: AllStatic {
   static UnsafeArrayCopyStub UnsafeArrayCopy_stub()         { return CAST_TO_FN_PTR(UnsafeArrayCopyStub,  _unsafe_arraycopy); }
 
   static address generic_arraycopy()   { return _generic_arraycopy; }
+  static address select_arraysort_function(BasicType t);
 
   static address jbyte_fill()          { return _jbyte_fill; }
   static address jshort_fill()         { return _jshort_fill; }
diff --git a/src/hotspot/share/runtime/vmStructs.cpp b/src/hotspot/share/runtime/vmStructs.cpp
index 37241534b2b7e..0b252d1c53760 100644
--- a/src/hotspot/share/runtime/vmStructs.cpp
+++ b/src/hotspot/share/runtime/vmStructs.cpp
@@ -588,6 +588,10 @@
      static_field(StubRoutines,                _checkcast_arraycopy_uninit,                   address)                               \
      static_field(StubRoutines,                _unsafe_arraycopy,                             address)                               \
      static_field(StubRoutines,                _generic_arraycopy,                            address)                               \
+     static_field(StubRoutines,                _arraysort_int,                                address)                               \
+     static_field(StubRoutines,                _arraysort_long,                               address)                               \
+     static_field(StubRoutines,                _arraysort_float,                              address)                               \
+     static_field(StubRoutines,                _arraysort_double,                             address)                               \
                                                                                                                                      \
   /*****************/                                                                                                                \
   /* SharedRuntime */                                                                                                                \
diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp
new file mode 100644
index 0000000000000..05efac20cbdb2
--- /dev/null
+++ b/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp
@@ -0,0 +1,601 @@
+/*
+ * Copyright (c) 2023 Intel Corporation. All rights reserved.
+ * Intel x86-simd-sort source code.
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+#ifndef AVX512_QSORT_32BIT
+#define AVX512_QSORT_32BIT
+
+#include "avx512-common-qsort.h"
+
+/*
+ * Constants used in sorting 16 elements in a ZMM register: lane index lists, written in
+ * _mm512_set_epi32 argument order (highest lane first). Based on the Bitonic sorting network
+ * (see https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg)
+ */
+#define NETWORK_32BIT_1 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
+#define NETWORK_32BIT_2 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
+#define NETWORK_32BIT_3 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+#define NETWORK_32BIT_4 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2
+#define NETWORK_32BIT_5 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+#define NETWORK_32BIT_6 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4
+#define NETWORK_32BIT_7 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+
+template <>  // zmm_vector traits for int32_t: signed 32-bit lanes in a 512-bit register
+struct zmm_vector<int32_t> {
+    using type_t = int32_t;
+    using zmm_t = __m512i;
+    using ymm_t = __m256i;
+    using opmask_t = __mmask16;  // one mask bit per lane
+    static const uint8_t numlanes = 16;
+
+    static type_t type_max() { return X86_SIMD_SORT_MAX_INT32; }
+    static type_t type_min() { return X86_SIMD_SORT_MIN_INT32; }
+    static zmm_t zmm_max() { return _mm512_set1_epi32(type_max()); }  // every lane = type_max()
+
+    static opmask_t knot_opmask(opmask_t x) { return _mm512_knot(x); }
+    static opmask_t ge(zmm_t x, zmm_t y) {
+        return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_NLT);  // signed "not less than", i.e. x >= y
+    }
+    template <int scale>
+    static ymm_t i64gather(__m512i index, void const *base) {
+        return _mm512_i64gather_epi32(index, base, scale);  // 64-bit indices, 32-bit results (ymm)
+    }
+    static zmm_t merge(ymm_t y1, ymm_t y2) {
+        zmm_t z1 = _mm512_castsi256_si512(y1);  // y1 becomes the low 256 bits
+        return _mm512_inserti32x8(z1, y2, 1);   // y2 is inserted as the high 256 bits
+    }
+    static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); }
+    static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
+        return _mm512_mask_compressstoreu_epi32(mem, mask, x);
+    }
+    static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
+        return _mm512_mask_loadu_epi32(x, mask, mem);  // lanes with a clear mask bit keep x
+    }
+    static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
+        return _mm512_mask_mov_epi32(x, mask, y);
+    }
+    static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
+        return _mm512_mask_storeu_epi32(mem, mask, x);
+    }
+    static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epi32(x, y); }
+    static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epi32(x, y); }
+    static zmm_t permutexvar(__m512i idx, zmm_t zmm) {
+        return _mm512_permutexvar_epi32(idx, zmm);
+    }
+    static type_t reducemax(zmm_t v) { return _mm512_reduce_max_epi32(v); }
+    static type_t reducemin(zmm_t v) { return _mm512_reduce_min_epi32(v); }
+    static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); }
+    template <uint8_t mask>
+    static zmm_t shuffle(zmm_t zmm) {
+        return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask);
+    }
+    static void storeu(void *mem, zmm_t x) {
+        return _mm512_storeu_si512(mem, x);
+    }
+
+    static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epi32(x, y); }  // 256-bit overloads
+    static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epi32(x, y); }
+};
+template <>  // zmm_vector traits for uint32_t: unsigned 32-bit lanes in a 512-bit register
+struct zmm_vector<uint32_t> {
+    using type_t = uint32_t;
+    using zmm_t = __m512i;
+    using ymm_t = __m256i;
+    using opmask_t = __mmask16;  // one mask bit per lane
+    static const uint8_t numlanes = 16;
+
+    static type_t type_max() { return X86_SIMD_SORT_MAX_UINT32; }
+    static type_t type_min() { return 0; }
+    static zmm_t zmm_max() {
+        return _mm512_set1_epi32(type_max());
+    }  // TODO: this should broadcast bits as is? (the unsigned max is passed to an int parameter)
+
+    template <int scale>
+    static ymm_t i64gather(__m512i index, void const *base) {
+        return _mm512_i64gather_epi32(index, base, scale);  // 64-bit indices, 32-bit results (ymm)
+    }
+    static zmm_t merge(ymm_t y1, ymm_t y2) {
+        zmm_t z1 = _mm512_castsi256_si512(y1);  // y1 becomes the low 256 bits
+        return _mm512_inserti32x8(z1, y2, 1);   // y2 is inserted as the high 256 bits
+    }
+    static opmask_t knot_opmask(opmask_t x) { return _mm512_knot(x); }
+    static opmask_t ge(zmm_t x, zmm_t y) {
+        return _mm512_cmp_epu32_mask(x, y, _MM_CMPINT_NLT);  // unsigned "not less than", i.e. x >= y
+    }
+    static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); }
+    static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epu32(x, y); }
+    static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
+        return _mm512_mask_compressstoreu_epi32(mem, mask, x);
+    }
+    static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
+        return _mm512_mask_loadu_epi32(x, mask, mem);  // lanes with a clear mask bit keep x
+    }
+    static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
+        return _mm512_mask_mov_epi32(x, mask, y);
+    }
+    static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
+        return _mm512_mask_storeu_epi32(mem, mask, x);
+    }
+    static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epu32(x, y); }
+    static zmm_t permutexvar(__m512i idx, zmm_t zmm) {
+        return _mm512_permutexvar_epi32(idx, zmm);
+    }
+    static type_t reducemax(zmm_t v) { return _mm512_reduce_max_epu32(v); }
+    static type_t reducemin(zmm_t v) { return _mm512_reduce_min_epu32(v); }
+    static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); }
+    template <uint8_t mask>
+    static zmm_t shuffle(zmm_t zmm) {
+        return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask);
+    }
+    static void storeu(void *mem, zmm_t x) {
+        return _mm512_storeu_si512(mem, x);
+    }
+
+    static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epu32(x, y); }  // 256-bit overloads
+    static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epu32(x, y); }
+};
+template <>  // zmm_vector traits for float: single-precision lanes in a 512-bit register
+struct zmm_vector<float> {
+    using type_t = float;
+    using zmm_t = __m512;
+    using ymm_t = __m256;
+    using opmask_t = __mmask16;  // one mask bit per lane
+    static const uint8_t numlanes = 16;
+
+    static type_t type_max() { return X86_SIMD_SORT_INFINITYF; }
+    static type_t type_min() { return -X86_SIMD_SORT_INFINITYF; }
+    static zmm_t zmm_max() { return _mm512_set1_ps(type_max()); }  // every lane = type_max()
+
+    static opmask_t knot_opmask(opmask_t x) { return _mm512_knot(x); }
+    static opmask_t ge(zmm_t x, zmm_t y) {
+        return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ);  // ordered compare: false if either lane is NaN
+    }
+    template <int scale>
+    static ymm_t i64gather(__m512i index, void const *base) {
+        return _mm512_i64gather_ps(index, base, scale);  // 64-bit indices, float results (ymm)
+    }
+    static zmm_t merge(ymm_t y1, ymm_t y2) {
+        zmm_t z1 = _mm512_castsi512_ps(
+            _mm512_castsi256_si512(_mm256_castps_si256(y1)));  // y1 becomes the low 256 bits
+        return _mm512_insertf32x8(z1, y2, 1);  // y2 is inserted as the high 256 bits
+    }
+    static zmm_t loadu(void const *mem) { return _mm512_loadu_ps(mem); }
+    static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_ps(x, y); }
+    static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
+        return _mm512_mask_compressstoreu_ps(mem, mask, x);
+    }
+    static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
+        return _mm512_mask_loadu_ps(x, mask, mem);  // lanes with a clear mask bit keep x
+    }
+    static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
+        return _mm512_mask_mov_ps(x, mask, y);
+    }
+    static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
+        return _mm512_mask_storeu_ps(mem, mask, x);
+    }
+    static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_ps(x, y); }
+    static zmm_t permutexvar(__m512i idx, zmm_t zmm) {
+        return _mm512_permutexvar_ps(idx, zmm);
+    }
+    static type_t reducemax(zmm_t v) { return _mm512_reduce_max_ps(v); }
+    static type_t reducemin(zmm_t v) { return _mm512_reduce_min_ps(v); }
+    static zmm_t set1(type_t v) { return _mm512_set1_ps(v); }
+    template <uint8_t mask>
+    static zmm_t shuffle(zmm_t zmm) {
+        return _mm512_shuffle_ps(zmm, zmm, (_MM_PERM_ENUM)mask);  // same register as both sources
+    }
+    static void storeu(void *mem, zmm_t x) { return _mm512_storeu_ps(mem, x); }
+
+    static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_ps(x, y); }  // 256-bit overloads
+    static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_ps(x, y); }
+};
+
+/*
+ * Makes no assumption about the order of the lanes in zmm. Runs the sorting network of
+ * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg as ten cmp_merge stages.
+ */
+template <typename vtype, typename zmm_t = typename vtype::zmm_t>
+X86_SIMD_SORT_INLINE zmm_t sort_zmm_32bit(zmm_t zmm) {
+    zmm = cmp_merge<vtype>(
+        zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), 0xAAAA);  // mask bits: odd lanes
+    zmm = cmp_merge<vtype>(
+        zmm, vtype::template shuffle<SHUFFLE_MASK(0, 1, 2, 3)>(zmm), 0xCCCC);  // mask bits: lanes 2,3 of each 4
+    zmm = cmp_merge<vtype>(
+        zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), 0xAAAA);
+    zmm = cmp_merge<vtype>(
+        zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_3), zmm),
+        0xF0F0);  // mask bits: upper 4 lanes of each 8
+    zmm = cmp_merge<vtype>(
+        zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm), 0xCCCC);
+    zmm = cmp_merge<vtype>(
+        zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), 0xAAAA);
+    zmm = cmp_merge<vtype>(
+        zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm),
+        0xFF00);  // mask bits: lanes 8-15
+    zmm = cmp_merge<vtype>(
+        zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_6), zmm),
+        0xF0F0);
+    zmm = cmp_merge<vtype>(
+        zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm), 0xCCCC);
+    zmm = cmp_merge<vtype>(
+        zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), 0xAAAA);
+    return zmm;
+}
+
+// Assumes zmm is bitonic and performs a recursive half cleaner; returns the resulting register
+template <typename vtype, typename zmm_t = typename vtype::zmm_t>
+X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm) {
+    // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc ..
+    zmm = cmp_merge<vtype>(
+        zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_7), zmm),
+        0xFF00);
+    // 2) half_cleaner[8]: compare 1-5, 2-6, 3-7 etc ..
+    zmm = cmp_merge<vtype>(
+        zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_6), zmm),
+        0xF0F0);
+    // 3) half_cleaner[4]
+    zmm = cmp_merge<vtype>(
+        zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm), 0xCCCC);
+    // 4) half_cleaner[2]
+    zmm = cmp_merge<vtype>(
+        zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), 0xAAAA);
+    return zmm;
+}
+
+// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner; both are updated in place
+template <typename vtype, typename zmm_t = typename vtype::zmm_t>
+X86_SIMD_SORT_INLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1,
+                                                      zmm_t *zmm2) {
+    // 1) First step of a merging network: coex of zmm1 and zmm2 reversed
+    *zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), *zmm2);  // reverse the 16 lanes
+    zmm_t zmm3 = vtype::min(*zmm1, *zmm2);  // lane-wise minima
+    zmm_t zmm4 = vtype::max(*zmm1, *zmm2);  // lane-wise maxima
+    // 2) Recursive half cleaner for each
+    *zmm1 = bitonic_merge_zmm_32bit<vtype>(zmm3);
+    *zmm2 = bitonic_merge_zmm_32bit<vtype>(zmm4);
+}
+
+// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive
+// half cleaner; zmm points to four registers that are overwritten with the result
+template <typename vtype, typename zmm_t = typename vtype::zmm_t>
+X86_SIMD_SORT_INLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm) {
+    zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[2]);  // lanes reversed
+    zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[3]);  // lanes reversed
+    zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r);
+    zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r);
+    zmm_t zmm_t3 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5),
+                                      vtype::max(zmm[1], zmm2r));  // maxima, reversed again
+    zmm_t zmm_t4 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5),
+                                      vtype::max(zmm[0], zmm3r));
+    zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2);
+    zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2);
+    zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4);
+    zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4);
+    zmm[0] = bitonic_merge_zmm_32bit<vtype>(zmm0);  // final per-register merge, written back in place
+    zmm[1] = bitonic_merge_zmm_32bit<vtype>(zmm1);
+    zmm[2] = bitonic_merge_zmm_32bit<vtype>(zmm2);
+    zmm[3] = bitonic_merge_zmm_32bit<vtype>(zmm3);
+}
+
+template <typename vtype, typename zmm_t = typename vtype::zmm_t>  // zmm: 8 registers, merged in place
+X86_SIMD_SORT_INLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm) {  // presumably zmm[0..3] and zmm[4..7] are each sorted - confirm
+    zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[4]);  // lanes reversed
+    zmm_t zmm5r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[5]);
+    zmm_t zmm6r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[6]);
+    zmm_t zmm7r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[7]);
+    zmm_t zmm_t1 = vtype::min(zmm[0], zmm7r);  // minima of register i paired with reversed register 7-i
+    zmm_t zmm_t2 = vtype::min(zmm[1], zmm6r);
+    zmm_t zmm_t3 = vtype::min(zmm[2], zmm5r);
+    zmm_t zmm_t4 = vtype::min(zmm[3], zmm4r);
+    zmm_t zmm_t5 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5),
+                                      vtype::max(zmm[3], zmm4r));  // maxima, reversed again
+    zmm_t zmm_t6 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5),
+                                      vtype::max(zmm[2], zmm5r));
+    zmm_t zmm_t7 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5),
+                                      vtype::max(zmm[1], zmm6r));
+    zmm_t zmm_t8 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5),
+                                      vtype::max(zmm[0], zmm7r));
+    COEX<vtype>(zmm_t1, zmm_t3);  // register pairs two apart
+    COEX<vtype>(zmm_t2, zmm_t4);
+    COEX<vtype>(zmm_t5, zmm_t7);
+    COEX<vtype>(zmm_t6, zmm_t8);
+    COEX<vtype>(zmm_t1, zmm_t2);  // then neighbouring register pairs
+    COEX<vtype>(zmm_t3, zmm_t4);
+    COEX<vtype>(zmm_t5, zmm_t6);
+    COEX<vtype>(zmm_t7, zmm_t8);
+    zmm[0] = bitonic_merge_zmm_32bit<vtype>(zmm_t1);  // final per-register merge, written back in place
+    zmm[1] = bitonic_merge_zmm_32bit<vtype>(zmm_t2);
+    zmm[2] = bitonic_merge_zmm_32bit<vtype>(zmm_t3);
+    zmm[3] = bitonic_merge_zmm_32bit<vtype>(zmm_t4);
+    zmm[4] = bitonic_merge_zmm_32bit<vtype>(zmm_t5);
+    zmm[5] = bitonic_merge_zmm_32bit<vtype>(zmm_t6);
+    zmm[6] = bitonic_merge_zmm_32bit<vtype>(zmm_t7);
+    zmm[7] = bitonic_merge_zmm_32bit<vtype>(zmm_t8);
+}
+
+/*
+ * Sorts arr[0..N-1] using a single register. The mask has its low N bits set,
+ * so only those elements are read and written back; the remaining lanes are
+ * taken from vtype::zmm_max() during the load.
+ */
+template <typename vtype, typename type_t>
+X86_SIMD_SORT_INLINE void sort_16_32bit(type_t *arr, int32_t N) {
+    using zmm_t = typename vtype::zmm_t;
+    using opmask_t = typename vtype::opmask_t;
+    const opmask_t active_lanes = (0x0001 << N) - 0x0001;
+    zmm_t lanes = vtype::mask_loadu(vtype::zmm_max(), active_lanes, arr);
+    lanes = sort_zmm_32bit<vtype>(lanes);
+    vtype::mask_storeu(arr, active_lanes, lanes);
+}
+
+/*
+ * Sorts arr[0..N-1] using two registers. Inputs of at most 16 elements are
+ * forwarded to sort_16_32bit. Otherwise the first 16 elements are loaded in
+ * full and the remaining N - 16 through a mask (other lanes come from
+ * vtype::zmm_max()); both registers are sorted and then merged.
+ */
+template <typename vtype, typename type_t>
+X86_SIMD_SORT_INLINE void sort_32_32bit(type_t *arr, int32_t N) {
+    using zmm_t = typename vtype::zmm_t;
+    using opmask_t = typename vtype::opmask_t;
+    if (N > 16) {
+        // Low (N - 16) bits set: selects the valid part of the second half.
+        const opmask_t tail_lanes = (0x0001 << (N - 16)) - 0x0001;
+        zmm_t head = vtype::loadu(arr);
+        zmm_t tail = vtype::mask_loadu(vtype::zmm_max(), tail_lanes, arr + 16);
+        head = sort_zmm_32bit<vtype>(head);
+        tail = sort_zmm_32bit<vtype>(tail);
+        bitonic_merge_two_zmm_32bit<vtype>(&head, &tail);
+        vtype::storeu(arr, head);
+        vtype::mask_storeu(arr + 16, tail_lanes, tail);
+    } else {
+        sort_16_32bit<vtype>(arr, N);
+    }
+}
+
+/*
+ * Sorts arr[0..N-1] using four registers. Inputs of at most 32 elements are
+ * forwarded to sort_32_32bit. Otherwise the first 32 elements are loaded in
+ * full and the remaining N - 32 through two 16-bit masks; lanes outside the
+ * masks are taken from vtype::zmm_max() and are not written back.
+ */
+template <typename vtype, typename type_t>
+X86_SIMD_SORT_INLINE void sort_64_32bit(type_t *arr, int32_t N) {
+    if (N <= 32) {
+        sort_32_32bit<vtype>(arr, N);
+        return;
+    }
+    using zmm_t = typename vtype::zmm_t;
+    using opmask_t = typename vtype::opmask_t;
+    // Bit i of tail_bits is set when element arr[32 + i] is part of the input.
+    const uint64_t tail_bits = (0x1ull << (N - 32)) - 0x1ull;
+    const opmask_t mask_third = static_cast<opmask_t>(tail_bits & 0xFFFF);
+    const opmask_t mask_fourth =
+        static_cast<opmask_t>((tail_bits >> 16) & 0xFFFF);
+    zmm_t zmm[4];
+    zmm[0] = vtype::loadu(arr);
+    zmm[1] = vtype::loadu(arr + 16);
+    zmm[2] = vtype::mask_loadu(vtype::zmm_max(), mask_third, arr + 32);
+    zmm[3] = vtype::mask_loadu(vtype::zmm_max(), mask_fourth, arr + 48);
+    for (int i = 0; i < 4; ++i) {
+        zmm[i] = sort_zmm_32bit<vtype>(zmm[i]);
+    }
+    // Merge pairs of registers first, then all four.
+    bitonic_merge_two_zmm_32bit<vtype>(&zmm[0], &zmm[1]);
+    bitonic_merge_two_zmm_32bit<vtype>(&zmm[2], &zmm[3]);
+    bitonic_merge_four_zmm_32bit<vtype>(zmm);
+    vtype::storeu(arr, zmm[0]);
+    vtype::storeu(arr + 16, zmm[1]);
+    vtype::mask_storeu(arr + 32, mask_third, zmm[2]);
+    vtype::mask_storeu(arr + 48, mask_fourth, zmm[3]);
+}
+
+/*
+ * Sorts arr[0..N-1] using eight registers. Inputs of at most 64 elements are
+ * forwarded to sort_64_32bit. Otherwise the first 64 elements are loaded in
+ * full and the remaining N - 64 through four 16-bit masks; lanes outside the
+ * masks are taken from vtype::zmm_max() and are not written back.
+ */
+template <typename vtype, typename type_t>
+X86_SIMD_SORT_INLINE void sort_128_32bit(type_t *arr, int32_t N) {
+    if (N <= 64) {
+        sort_64_32bit<vtype>(arr, N);
+        return;
+    }
+    using zmm_t = typename vtype::zmm_t;
+    using opmask_t = typename vtype::opmask_t;
+    zmm_t zmm[8];
+    // First four registers: always fully populated.
+    for (int i = 0; i < 4; ++i) {
+        zmm[i] = vtype::loadu(arr + 16 * i);
+        zmm[i] = sort_zmm_32bit<vtype>(zmm[i]);
+    }
+    // Masks for the last four registers. N == 128 keeps them all-ones and
+    // skips the shift, which would otherwise be a shift by 64 bits.
+    opmask_t tail_mask[4] = {0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF};
+    if (N != 128) {
+        const uint64_t tail_bits = (0x1ull << (N - 64)) - 0x1ull;
+        for (int i = 0; i < 4; ++i) {
+            tail_mask[i] =
+                static_cast<opmask_t>((tail_bits >> (16 * i)) & 0xFFFF);
+        }
+    }
+    for (int i = 0; i < 4; ++i) {
+        zmm[4 + i] = vtype::mask_loadu(vtype::zmm_max(), tail_mask[i],
+                                       arr + 64 + 16 * i);
+        zmm[4 + i] = sort_zmm_32bit<vtype>(zmm[4 + i]);
+    }
+    // Merge in widening steps: pairs, then groups of four, then all eight.
+    for (int i = 0; i < 8; i += 2) {
+        bitonic_merge_two_zmm_32bit<vtype>(&zmm[i], &zmm[i + 1]);
+    }
+    bitonic_merge_four_zmm_32bit<vtype>(zmm);
+    bitonic_merge_four_zmm_32bit<vtype>(zmm + 4);
+    bitonic_merge_eight_zmm_32bit<vtype>(zmm);
+    for (int i = 0; i < 4; ++i) {
+        vtype::storeu(arr + 16 * i, zmm[i]);
+    }
+    for (int i = 0; i < 4; ++i) {
+        vtype::mask_storeu(arr + 64 + 16 * i, tail_mask[i], zmm[4 + i]);
+    }
+}
+
+/*
+ * Picks a pivot for the inclusive range [left, right]: gathers the 16
+ * elements at left + step, left + 2 * step, ..., left + 16 * step (with
+ * step = (right - left) / 16), sorts them in one register and returns the
+ * element at index 8 of the sorted samples.
+ */
+template <typename vtype, typename type_t>
+X86_SIMD_SORT_INLINE type_t get_pivot_32bit(type_t *arr, const int64_t left,
+                                            const int64_t right) {
+    using zmm_t = typename vtype::zmm_t;
+    using ymm_t = typename vtype::ymm_t;
+    const int64_t step = (right - left) / 16;
+    // Each gather takes eight 64-bit indices, so the 16 samples need two.
+    const __m512i near_idx = _mm512_set_epi64(
+        left + step, left + 2 * step, left + 3 * step, left + 4 * step,
+        left + 5 * step, left + 6 * step, left + 7 * step, left + 8 * step);
+    const __m512i far_idx = _mm512_set_epi64(
+        left + 9 * step, left + 10 * step, left + 11 * step, left + 12 * step,
+        left + 13 * step, left + 14 * step, left + 15 * step, left + 16 * step);
+    ymm_t near_vals = vtype::template i64gather<sizeof(type_t)>(near_idx, arr);
+    ymm_t far_vals = vtype::template i64gather<sizeof(type_t)>(far_idx, arr);
+    zmm_t samples = vtype::merge(near_vals, far_vals);
+    zmm_t ordered = sort_zmm_32bit<vtype>(samples);
+    // pivot will never be a nan, since there are no nan's!
+    return ((type_t *)&ordered)[8];
+}
+
+/*
+ * Recursive quicksort over the inclusive range arr[left..right].
+ * max_iters is the remaining recursion budget; it is decremented on every
+ * level of recursion.
+ */
+template <typename vtype, typename type_t>
+static void qsort_32bit_(type_t *arr, int64_t left, int64_t right,
+                         int64_t max_iters) {
+    // Budget exhausted: quicksort isn't making progress, resort to std::sort.
+    if (max_iters <= 0) {
+        std::sort(arr + left, arr + right + 1);
+        return;
+    }
+    // Base case: ranges of at most 128 elements use the bitonic networks.
+    const int64_t count = right + 1 - left;
+    if (count <= 128) {
+        sort_128_32bit<vtype>(arr + left, (int32_t)count);
+        return;
+    }
+
+    const type_t pivot = get_pivot_32bit<vtype>(arr, left, right);
+    // Passed to the partition routine by address; compared against the pivot
+    // below to decide whether a side still needs sorting.
+    type_t smallest = vtype::type_max();
+    type_t biggest = vtype::type_min();
+    const int64_t split = partition_avx512_unrolled<vtype, 2>(
+        arr, left, right + 1, pivot, &smallest, &biggest);
+    const int64_t budget = max_iters - 1;
+    if (pivot != smallest) {
+        qsort_32bit_<vtype>(arr, left, split - 1, budget);
+    }
+    if (pivot != biggest) {
+        qsort_32bit_<vtype>(arr, split, right, budget);
+    }
+}
+
+/*
+ * Recursive quickselect over the inclusive range arr[left..right]. After each
+ * partition only the side that contains index pos is processed further.
+ * max_iters is the remaining recursion budget.
+ */
+template <typename vtype, typename type_t>
+static void qselect_32bit_(type_t *arr, int64_t pos, int64_t left,
+                           int64_t right, int64_t max_iters) {
+    // Budget exhausted: quicksort isn't making progress, resort to std::sort.
+    if (max_iters <= 0) {
+        std::sort(arr + left, arr + right + 1);
+        return;
+    }
+    // Base case: ranges of at most 128 elements use the bitonic networks.
+    const int64_t count = right + 1 - left;
+    if (count <= 128) {
+        sort_128_32bit<vtype>(arr + left, (int32_t)count);
+        return;
+    }
+
+    const type_t pivot = get_pivot_32bit<vtype>(arr, left, right);
+    // Passed to the partition routine by address; compared against the pivot
+    // below to decide whether the chosen side still needs work.
+    type_t smallest = vtype::type_max();
+    type_t biggest = vtype::type_min();
+    const int64_t split = partition_avx512_unrolled<vtype, 2>(
+        arr, left, right + 1, pivot, &smallest, &biggest);
+    const int64_t budget = max_iters - 1;
+    if (pos < split) {
+        if (pivot != smallest) {
+            qselect_32bit_<vtype>(arr, pos, left, split - 1, budget);
+        }
+    } else {
+        if (pivot != biggest) {
+            qselect_32bit_<vtype>(arr, pos, split, right, budget);
+        }
+    }
+}
+
+/*
+ * Walks the array 16 floats at a time, overwrites every NaN with the
+ * corresponding lane of ZMM_MAX_FLOAT and returns how many NaNs were found.
+ * A lane is detected as NaN by comparing it with itself for inequality.
+ */
+X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize) {
+    int64_t nan_count = 0;
+    for (int64_t done = 0; done < arrsize; done += 16) {
+        float *chunk = arr + done;
+        const int64_t left_over = arrsize - done;
+        // Only the final, partial chunk needs a narrower mask.
+        __mmask16 loadmask = 0xFFFF;
+        if (left_over < 16) {
+            loadmask = (0x0001 << left_over) - 0x0001;
+        }
+        // Masked-off lanes load as zero, so they never count as NaN.
+        const __m512 in_zmm = _mm512_maskz_loadu_ps(loadmask, chunk);
+        const __mmask16 nanmask =
+            _mm512_cmp_ps_mask(in_zmm, in_zmm, _CMP_NEQ_UQ);
+        nan_count += _mm_popcnt_u32((int32_t)nanmask);
+        _mm512_mask_storeu_ps(chunk, nanmask, ZMM_MAX_FLOAT);
+    }
+    return nan_count;
+}
+
+/*
+ * Overwrites the last nan_count elements of arr[0..arrsize-1] with
+ * std::nanf("1"), walking backwards from the final element.
+ */
+X86_SIMD_SORT_INLINE void replace_inf_with_nan(float *arr, int64_t arrsize,
+                                               int64_t nan_count) {
+    float *slot = arr + arrsize;
+    while (nan_count > 0) {
+        --slot;
+        *slot = std::nanf("1");
+        --nan_count;
+    }
+}
+
+/*
+ * Quickselect entry point for int32_t: runs qselect_32bit_ for position k
+ * over arr[0..arrsize-1]. Arrays with fewer than two elements are left
+ * untouched. The recursion budget is 2 * floor(log2(arrsize)).
+ */
+template <>
+void avx512_qselect<int32_t>(int32_t *arr, int64_t k, int64_t arrsize) {
+    if (arrsize <= 1) {
+        return;
+    }
+    const int64_t max_iters = 2 * (int64_t)log2(arrsize);
+    qselect_32bit_<zmm_vector<int32_t>, int32_t>(arr, k, 0, arrsize - 1,
+                                                 max_iters);
+}
+
+/*
+ * Quickselect entry point for uint32_t: runs qselect_32bit_ for position k
+ * over arr[0..arrsize-1]. Arrays with fewer than two elements are left
+ * untouched. The recursion budget is 2 * floor(log2(arrsize)).
+ */
+template <>
+void avx512_qselect<uint32_t>(uint32_t *arr, int64_t k, int64_t arrsize) {
+    if (arrsize <= 1) {
+        return;
+    }
+    const int64_t max_iters = 2 * (int64_t)log2(arrsize);
+    qselect_32bit_<zmm_vector<uint32_t>, uint32_t>(arr, k, 0, arrsize - 1,
+                                                   max_iters);
+}
+
+/*
+ * Quickselect entry point for float. NaNs are treated as larger than every
+ * other value, i.e. they end up at the tail of the array.
+ *
+ * The NaNs are swapped to the tail first and the selection then runs on the
+ * NaN-free prefix only. They cannot simply be replaced by +inf and restored
+ * in the last nan_count slots afterwards (as the full sort does): quickselect
+ * leaves the region to the right of k partitioned but not sorted, so those
+ * slots may hold ordinary values that would be overwritten, while the +inf
+ * placeholders would stay somewhere in the middle.
+ *
+ * arr      array to rearrange in place
+ * k        index whose final (sorted-order) element is wanted
+ * arrsize  number of elements in arr; fewer than two is a no-op
+ */
+template <>
+void avx512_qselect<float>(float *arr, int64_t k, int64_t arrsize) {
+    if (arrsize <= 1) {
+        return;
+    }
+    // Invariant: arr[0..ii-1] holds no NaN, arr[last+1..arrsize-1] holds
+    // only NaNs. The element swapped in from the tail is re-examined.
+    int64_t last = arrsize - 1;
+    int64_t ii = 0;
+    while (ii <= last) {
+        if (arr[ii] != arr[ii]) {  // true only for NaN
+            const float tmp = arr[ii];
+            arr[ii] = arr[last];
+            arr[last] = tmp;
+            --last;
+        } else {
+            ++ii;
+        }
+    }
+    // If k falls into the NaN tail, arr[k] is already a NaN and everything
+    // before it compares lower, so there is nothing left to select.
+    if (last >= 1 && k <= last) {
+        qselect_32bit_<zmm_vector<float>, float>(
+            arr, k, 0, last, 2 * (int64_t)log2(last + 1));
+    }
+}
+
+/*
+ * Sort entry point for int32_t: sorts arr[0..arrsize-1] in place with
+ * qsort_32bit_. Arrays with fewer than two elements are left untouched.
+ * The recursion budget is 2 * floor(log2(arrsize)).
+ */
+template <>
+void avx512_qsort<int32_t>(int32_t *arr, int64_t arrsize) {
+    if (arrsize <= 1) {
+        return;
+    }
+    const int64_t max_iters = 2 * (int64_t)log2(arrsize);
+    qsort_32bit_<zmm_vector<int32_t>, int32_t>(arr, 0, arrsize - 1, max_iters);
+}
+
+/*
+ * Sort entry point for uint32_t: sorts arr[0..arrsize-1] in place with
+ * qsort_32bit_. Arrays with fewer than two elements are left untouched.
+ * The recursion budget is 2 * floor(log2(arrsize)).
+ */
+template <>
+void avx512_qsort<uint32_t>(uint32_t *arr, int64_t arrsize) {
+    if (arrsize <= 1) {
+        return;
+    }
+    const int64_t max_iters = 2 * (int64_t)log2(arrsize);
+    qsort_32bit_<zmm_vector<uint32_t>, uint32_t>(arr, 0, arrsize - 1,
+                                                 max_iters);
+}
+
+/*
+ * Sort entry point for float. NaNs are first overwritten by
+ * replace_nan_with_inf (which reports how many there were), the array is
+ * sorted with qsort_32bit_, and finally the last nan_count elements are set
+ * back to NaN by replace_inf_with_nan. Arrays with fewer than two elements
+ * are left untouched.
+ */
+template <>
+void avx512_qsort<float>(float *arr, int64_t arrsize) {
+    if (arrsize <= 1) {
+        return;
+    }
+    const int64_t nan_count = replace_nan_with_inf(arr, arrsize);
+    const int64_t max_iters = 2 * (int64_t)log2(arrsize);
+    qsort_32bit_<zmm_vector<float>, float>(arr, 0, arrsize - 1, max_iters);
+    replace_inf_with_nan(arr, arrsize, nan_count);
+}
+
+#endif  // AVX512_QSORT_32BIT
diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h
new file mode 100644
index 0000000000000..88fee99c0d79e
--- /dev/null
+++ b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h
@@ -0,0 +1,588 @@
+/*
+ * Copyright (c) 2023 Intel Corporation. All rights reserved.
+ * Intel x86-simd-sort source code.
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef AVX512_64BIT_COMMON
+#define AVX512_64BIT_COMMON
+#include "avx512-common-qsort.h"
+
+/*
+ * Constants used for sorting 8 elements in a ZMM register. Based on the
+ * Bitonic sorting network (see
+ * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg)
+ *
+ * Each macro expands to eight comma-separated lane indices, listed from
+ * lane 7 down to lane 0 as the header row below shows. The descriptions
+ * assume the list is passed to seti()/_mm512_set_epi64 and used as a
+ * permutexvar index (dst[i] = src[idx[i]]); the uses are outside this part
+ * of the file -- TODO confirm.
+ */
+// ZMM                  7, 6, 5, 4, 3, 2, 1, 0
+// Reverse the order inside each group of four lanes.
+#define NETWORK_64BIT_1 4, 5, 6, 7, 0, 1, 2, 3
+// Reverse the order of all eight lanes.
+#define NETWORK_64BIT_2 0, 1, 2, 3, 4, 5, 6, 7
+// Swap the two lane pairs inside each group of four lanes (i <-> i ^ 2).
+#define NETWORK_64BIT_3 5, 4, 7, 6, 1, 0, 3, 2
+// Swap the lower and upper groups of four lanes (i <-> i ^ 4).
+#define NETWORK_64BIT_4 3, 2, 1, 0, 7, 6, 5, 4
+
+/*
+ * Vector traits for eight floats held in a 256-bit register. The member
+ * aliases keep the zmm_* names used by the zmm_vector specializations even
+ * though zmm_t is __m256 here. The gather helpers take eight 64-bit indices
+ * packed in a __m512i.
+ */
+template <>
+struct ymm_vector<float> {
+    using type_t = float;
+    using zmm_t = __m256;
+    using zmmi_t = __m256i;
+    using opmask_t = __mmask8;
+    static const uint8_t numlanes = 8;
+
+    // Extreme values of the element type, and type_max() broadcast to all
+    // lanes.
+    static type_t type_max() { return X86_SIMD_SORT_INFINITYF; }
+    static type_t type_min() { return -X86_SIMD_SORT_INFINITYF; }
+    static zmm_t zmm_max() { return _mm256_set1_ps(type_max()); }
+
+    // Builds an integer vector; v1 lands in the highest lane, v8 in lane 0.
+    static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
+                       int v8) {
+        return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8);
+    }
+    static opmask_t kxor_opmask(opmask_t x, opmask_t y) {
+        return _kxor_mask8(x, y);
+    }
+    static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); }
+    // Lane-wise comparisons returning a bit mask. The *_OQ predicates are
+    // ordered, so a lane containing NaN never sets its bit.
+    static opmask_t le(zmm_t x, zmm_t y) {
+        return _mm256_cmp_ps_mask(x, y, _CMP_LE_OQ);
+    }
+    static opmask_t ge(zmm_t x, zmm_t y) {
+        return _mm256_cmp_ps_mask(x, y, _CMP_GE_OQ);
+    }
+    static opmask_t eq(zmm_t x, zmm_t y) {
+        return _mm256_cmp_ps_mask(x, y, _CMP_EQ_OQ);
+    }
+    template <int type>
+    static opmask_t fpclass(zmm_t x) {
+        return _mm256_fpclass_ps_mask(x, type);
+    }
+    // Gathers: `scale` is the byte multiplier applied to each 64-bit index.
+    template <int scale>
+    static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index,
+                                void const *base) {
+        return _mm512_mask_i64gather_ps(src, mask, index, base, scale);
+    }
+    template <int scale>
+    static zmm_t i64gather(__m512i index, void const *base) {
+        return _mm512_i64gather_ps(index, base, scale);
+    }
+    static zmm_t loadu(void const *mem) {
+        return _mm256_loadu_ps((float *)mem);
+    }
+    static zmm_t max(zmm_t x, zmm_t y) { return _mm256_max_ps(x, y); }
+    static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
+        return _mm256_mask_compressstoreu_ps(mem, mask, x);
+    }
+    // Masked load: unselected lanes are zeroed (maskz) or taken from x (mask).
+    static zmm_t maskz_loadu(opmask_t mask, void const *mem) {
+        return _mm256_maskz_loadu_ps(mask, mem);
+    }
+    static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
+        return _mm256_mask_loadu_ps(x, mask, mem);
+    }
+    static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
+        return _mm256_mask_mov_ps(x, mask, y);
+    }
+    static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
+        return _mm256_mask_storeu_ps(mem, mask, x);
+    }
+    static zmm_t min(zmm_t x, zmm_t y) { return _mm256_min_ps(x, y); }
+    static zmm_t permutexvar(__m256i idx, zmm_t zmm) {
+        return _mm256_permutexvar_ps(idx, zmm);
+    }
+    // Horizontal maximum: fold 256 -> 128 bits, then halve twice more and
+    // read lane 0.
+    static type_t reducemax(zmm_t v) {
+        __m128 v128 =
+            _mm_max_ps(_mm256_castps256_ps128(v), _mm256_extractf32x4_ps(v, 1));
+        __m128 v64 = _mm_max_ps(
+            v128, _mm_shuffle_ps(v128, v128, _MM_SHUFFLE(1, 0, 3, 2)));
+        __m128 v32 =
+            _mm_max_ps(v64, _mm_shuffle_ps(v64, v64, _MM_SHUFFLE(0, 0, 0, 1)));
+        return _mm_cvtss_f32(v32);
+    }
+    // Horizontal minimum, same folding scheme as reducemax.
+    static type_t reducemin(zmm_t v) {
+        __m128 v128 =
+            _mm_min_ps(_mm256_castps256_ps128(v), _mm256_extractf32x4_ps(v, 1));
+        __m128 v64 = _mm_min_ps(
+            v128, _mm_shuffle_ps(v128, v128, _MM_SHUFFLE(1, 0, 3, 2)));
+        __m128 v32 =
+            _mm_min_ps(v64, _mm_shuffle_ps(v64, v64, _MM_SHUFFLE(0, 0, 0, 1)));
+        return _mm_cvtss_f32(v32);
+    }
+    static zmm_t set1(type_t v) { return _mm256_set1_ps(v); }
+    // NOTE: the `mask` template argument is ignored; the shuffle immediate is
+    // always 0b10110001, which swaps neighbouring 32-bit lanes.
+    template <uint8_t mask, bool = (mask == 0b01010101)>
+    static zmm_t shuffle(zmm_t zmm) {
+        /* Hack!: have to make shuffles within 128-bit lanes work for both
+         * 32-bit and 64-bit */
+        return _mm256_shuffle_ps(zmm, zmm, 0b10110001);
+        // if constexpr (mask == 0b01010101) {
+        // }
+        // else {
+        //     /* Not used, so far */
+        //     return _mm256_shuffle_ps(zmm, zmm, mask);
+        // }
+    }
+    static void storeu(void *mem, zmm_t x) {
+        _mm256_storeu_ps((float *)mem, x);
+    }
+};
+/*
+ * Vector traits for eight uint32_t values held in a 256-bit register. The
+ * member aliases keep the zmm_* names used by the zmm_vector specializations
+ * even though zmm_t is __m256i here. The gather helpers take eight 64-bit
+ * indices packed in a __m512i. Comparisons, min and max use the unsigned
+ * (epu32) intrinsics.
+ */
+template <>
+struct ymm_vector<uint32_t> {
+    using type_t = uint32_t;
+    using zmm_t = __m256i;
+    using zmmi_t = __m256i;
+    using opmask_t = __mmask8;
+    static const uint8_t numlanes = 8;
+
+    // Extreme values of the element type, and type_max() broadcast to all
+    // lanes.
+    static type_t type_max() { return X86_SIMD_SORT_MAX_UINT32; }
+    static type_t type_min() { return 0; }
+    static zmm_t zmm_max() { return _mm256_set1_epi32(type_max()); }
+
+    // Builds an integer vector; v1 lands in the highest lane, v8 in lane 0.
+    static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
+                       int v8) {
+        return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8);
+    }
+    static opmask_t kxor_opmask(opmask_t x, opmask_t y) {
+        return _kxor_mask8(x, y);
+    }
+    static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); }
+    // Lane-wise unsigned comparisons returning a bit mask ("not less than"
+    // implements >=).
+    static opmask_t le(zmm_t x, zmm_t y) {
+        return _mm256_cmp_epu32_mask(x, y, _MM_CMPINT_LE);
+    }
+    static opmask_t ge(zmm_t x, zmm_t y) {
+        return _mm256_cmp_epu32_mask(x, y, _MM_CMPINT_NLT);
+    }
+    static opmask_t eq(zmm_t x, zmm_t y) {
+        return _mm256_cmp_epu32_mask(x, y, _MM_CMPINT_EQ);
+    }
+    // Gathers: `scale` is the byte multiplier applied to each 64-bit index.
+    template <int scale>
+    static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index,
+                                void const *base) {
+        return _mm512_mask_i64gather_epi32(src, mask, index, base, scale);
+    }
+    template <int scale>
+    static zmm_t i64gather(__m512i index, void const *base) {
+        return _mm512_i64gather_epi32(index, base, scale);
+    }
+    static zmm_t loadu(void const *mem) {
+        return _mm256_loadu_si256((__m256i *)mem);
+    }
+    static zmm_t max(zmm_t x, zmm_t y) { return _mm256_max_epu32(x, y); }
+    static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
+        return _mm256_mask_compressstoreu_epi32(mem, mask, x);
+    }
+    // Masked load: unselected lanes are zeroed (maskz) or taken from x (mask).
+    static zmm_t maskz_loadu(opmask_t mask, void const *mem) {
+        return _mm256_maskz_loadu_epi32(mask, mem);
+    }
+    static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
+        return _mm256_mask_loadu_epi32(x, mask, mem);
+    }
+    static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
+        return _mm256_mask_mov_epi32(x, mask, y);
+    }
+    static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
+        return _mm256_mask_storeu_epi32(mem, mask, x);
+    }
+    static zmm_t min(zmm_t x, zmm_t y) { return _mm256_min_epu32(x, y); }
+    static zmm_t permutexvar(__m256i idx, zmm_t zmm) {
+        return _mm256_permutexvar_epi32(idx, zmm);
+    }
+    // Horizontal maximum: fold 256 -> 128 bits, then halve twice more and
+    // read lane 0.
+    static type_t reducemax(zmm_t v) {
+        __m128i v128 = _mm_max_epu32(_mm256_castsi256_si128(v),
+                                     _mm256_extracti128_si256(v, 1));
+        __m128i v64 = _mm_max_epu32(
+            v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(1, 0, 3, 2)));
+        __m128i v32 =
+            _mm_max_epu32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1)));
+        return (type_t)_mm_cvtsi128_si32(v32);
+    }
+    // Horizontal minimum, same folding scheme as reducemax.
+    static type_t reducemin(zmm_t v) {
+        __m128i v128 = _mm_min_epu32(_mm256_castsi256_si128(v),
+                                     _mm256_extracti128_si256(v, 1));
+        __m128i v64 = _mm_min_epu32(
+            v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(1, 0, 3, 2)));
+        __m128i v32 =
+            _mm_min_epu32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1)));
+        return (type_t)_mm_cvtsi128_si32(v32);
+    }
+    static zmm_t set1(type_t v) { return _mm256_set1_epi32(v); }
+    // NOTE: the `mask` template argument is ignored; the shuffle immediate is
+    // always 0b10110001, which swaps neighbouring 32-bit lanes.
+    template <uint8_t mask, bool = (mask == 0b01010101)>
+    static zmm_t shuffle(zmm_t zmm) {
+        /* Hack!: have to make shuffles within 128-bit lanes work for both
+         * 32-bit and 64-bit */
+        return _mm256_shuffle_epi32(zmm, 0b10110001);
+    }
+    static void storeu(void *mem, zmm_t x) {
+        _mm256_storeu_si256((__m256i *)mem, x);
+    }
+};
+/*
+ * Vector traits for eight int32_t values held in a 256-bit register. The
+ * member aliases keep the zmm_* names used by the zmm_vector specializations
+ * even though zmm_t is __m256i here. The gather helpers take eight 64-bit
+ * indices packed in a __m512i. Comparisons, min and max use the signed
+ * (epi32) intrinsics.
+ */
+template <>
+struct ymm_vector<int32_t> {
+    using type_t = int32_t;
+    using zmm_t = __m256i;
+    using zmmi_t = __m256i;
+    using opmask_t = __mmask8;
+    static const uint8_t numlanes = 8;
+
+    // Extreme values of the element type, and type_max() broadcast to all
+    // lanes.
+    static type_t type_max() { return X86_SIMD_SORT_MAX_INT32; }
+    static type_t type_min() { return X86_SIMD_SORT_MIN_INT32; }
+    static zmm_t zmm_max() {
+        return _mm256_set1_epi32(type_max());
+    }  // TODO: this should broadcast bits as is?
+
+    // Builds an integer vector; v1 lands in the highest lane, v8 in lane 0.
+    static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
+                       int v8) {
+        return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8);
+    }
+    static opmask_t kxor_opmask(opmask_t x, opmask_t y) {
+        return _kxor_mask8(x, y);
+    }
+    static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); }
+    // Lane-wise signed comparisons returning a bit mask ("not less than"
+    // implements >=).
+    static opmask_t le(zmm_t x, zmm_t y) {
+        return _mm256_cmp_epi32_mask(x, y, _MM_CMPINT_LE);
+    }
+    static opmask_t ge(zmm_t x, zmm_t y) {
+        return _mm256_cmp_epi32_mask(x, y, _MM_CMPINT_NLT);
+    }
+    static opmask_t eq(zmm_t x, zmm_t y) {
+        return _mm256_cmp_epi32_mask(x, y, _MM_CMPINT_EQ);
+    }
+    // Gathers: `scale` is the byte multiplier applied to each 64-bit index.
+    template <int scale>
+    static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index,
+                                void const *base) {
+        return _mm512_mask_i64gather_epi32(src, mask, index, base, scale);
+    }
+    template <int scale>
+    static zmm_t i64gather(__m512i index, void const *base) {
+        return _mm512_i64gather_epi32(index, base, scale);
+    }
+    static zmm_t loadu(void const *mem) {
+        return _mm256_loadu_si256((__m256i *)mem);
+    }
+    static zmm_t max(zmm_t x, zmm_t y) { return _mm256_max_epi32(x, y); }
+    static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
+        return _mm256_mask_compressstoreu_epi32(mem, mask, x);
+    }
+    // Masked load: unselected lanes are zeroed (maskz) or taken from x (mask).
+    static zmm_t maskz_loadu(opmask_t mask, void const *mem) {
+        return _mm256_maskz_loadu_epi32(mask, mem);
+    }
+    static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
+        return _mm256_mask_loadu_epi32(x, mask, mem);
+    }
+    static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
+        return _mm256_mask_mov_epi32(x, mask, y);
+    }
+    static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
+        return _mm256_mask_storeu_epi32(mem, mask, x);
+    }
+    static zmm_t min(zmm_t x, zmm_t y) { return _mm256_min_epi32(x, y); }
+    static zmm_t permutexvar(__m256i idx, zmm_t zmm) {
+        return _mm256_permutexvar_epi32(idx, zmm);
+    }
+    // Horizontal maximum: fold 256 -> 128 bits, then halve twice more and
+    // read lane 0.
+    static type_t reducemax(zmm_t v) {
+        __m128i v128 = _mm_max_epi32(_mm256_castsi256_si128(v),
+                                     _mm256_extracti128_si256(v, 1));
+        __m128i v64 = _mm_max_epi32(
+            v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(1, 0, 3, 2)));
+        __m128i v32 =
+            _mm_max_epi32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1)));
+        return (type_t)_mm_cvtsi128_si32(v32);
+    }
+    // Horizontal minimum, same folding scheme as reducemax.
+    static type_t reducemin(zmm_t v) {
+        __m128i v128 = _mm_min_epi32(_mm256_castsi256_si128(v),
+                                     _mm256_extracti128_si256(v, 1));
+        __m128i v64 = _mm_min_epi32(
+            v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(1, 0, 3, 2)));
+        __m128i v32 =
+            _mm_min_epi32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1)));
+        return (type_t)_mm_cvtsi128_si32(v32);
+    }
+    static zmm_t set1(type_t v) { return _mm256_set1_epi32(v); }
+    // NOTE: the `mask` template argument is ignored; the shuffle immediate is
+    // always 0b10110001, which swaps neighbouring 32-bit lanes.
+    template <uint8_t mask, bool = (mask == 0b01010101)>
+    static zmm_t shuffle(zmm_t zmm) {
+        /* Hack!: have to make shuffles within 128-bit lanes work for both
+         * 32-bit and 64-bit */
+        return _mm256_shuffle_epi32(zmm, 0b10110001);
+    }
+    static void storeu(void *mem, zmm_t x) {
+        _mm256_storeu_si256((__m256i *)mem, x);
+    }
+};
+/*
+ * Vector traits for eight int64_t values held in a 512-bit register.
+ * ymm_t is also a 512-bit type here: a gather of eight 64-bit elements fills
+ * a whole ZMM register. Comparisons, min and max use the signed (epi64)
+ * intrinsics.
+ */
+template <>
+struct zmm_vector<int64_t> {
+    using type_t = int64_t;
+    using zmm_t = __m512i;
+    using zmmi_t = __m512i;
+    using ymm_t = __m512i;
+    using opmask_t = __mmask8;
+    static const uint8_t numlanes = 8;
+
+    // Extreme values of the element type, and type_max() broadcast to all
+    // lanes.
+    static type_t type_max() { return X86_SIMD_SORT_MAX_INT64; }
+    static type_t type_min() { return X86_SIMD_SORT_MIN_INT64; }
+    static zmm_t zmm_max() {
+        return _mm512_set1_epi64(type_max());
+    }  // TODO: this should broadcast bits as is?
+
+    // Builds an index vector; v1 lands in lane 7, v8 in lane 0.
+    static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
+                       int v8) {
+        return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8);
+    }
+    static opmask_t kxor_opmask(opmask_t x, opmask_t y) {
+        return _kxor_mask8(x, y);
+    }
+    static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); }
+    // Lane-wise signed comparisons returning a bit mask ("not less than"
+    // implements >=).
+    static opmask_t le(zmm_t x, zmm_t y) {
+        return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_LE);
+    }
+    static opmask_t ge(zmm_t x, zmm_t y) {
+        return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_NLT);
+    }
+    static opmask_t eq(zmm_t x, zmm_t y) {
+        return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_EQ);
+    }
+    // Gathers: `scale` is the byte multiplier applied to each 64-bit index.
+    template <int scale>
+    static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index,
+                                void const *base) {
+        return _mm512_mask_i64gather_epi64(src, mask, index, base, scale);
+    }
+    template <int scale>
+    static zmm_t i64gather(__m512i index, void const *base) {
+        return _mm512_i64gather_epi64(index, base, scale);
+    }
+    static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); }
+    static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epi64(x, y); }
+    static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
+        return _mm512_mask_compressstoreu_epi64(mem, mask, x);
+    }
+    // Masked load: unselected lanes are zeroed (maskz) or taken from x (mask).
+    static zmm_t maskz_loadu(opmask_t mask, void const *mem) {
+        return _mm512_maskz_loadu_epi64(mask, mem);
+    }
+    static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
+        return _mm512_mask_loadu_epi64(x, mask, mem);
+    }
+    static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
+        return _mm512_mask_mov_epi64(x, mask, y);
+    }
+    static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
+        return _mm512_mask_storeu_epi64(mem, mask, x);
+    }
+    static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epi64(x, y); }
+    static zmm_t permutexvar(__m512i idx, zmm_t zmm) {
+        return _mm512_permutexvar_epi64(idx, zmm);
+    }
+    static type_t reducemax(zmm_t v) { return _mm512_reduce_max_epi64(v); }
+    static type_t reducemin(zmm_t v) { return _mm512_reduce_min_epi64(v); }
+    static zmm_t set1(type_t v) { return _mm512_set1_epi64(v); }
+    // The integer vector is reinterpreted as packed doubles so that the
+    // 64-bit-lane shuffle (_mm512_shuffle_pd) can be used, then cast back.
+    template <uint8_t mask>
+    static zmm_t shuffle(zmm_t zmm) {
+        __m512d temp = _mm512_castsi512_pd(zmm);
+        return _mm512_castpd_si512(
+            _mm512_shuffle_pd(temp, temp, (_MM_PERM_ENUM)mask));
+    }
+    static void storeu(void *mem, zmm_t x) { _mm512_storeu_si512(mem, x); }
+};
+/*
+ * Vector traits for eight uint64_t values held in a 512-bit register.
+ * ymm_t is also a 512-bit type here: a gather of eight 64-bit elements fills
+ * a whole ZMM register. Comparisons, min and max use the unsigned (epu64)
+ * intrinsics. The member set mirrors zmm_vector<int64_t> (including
+ * kxor_opmask, le and maskz_loadu) so generic code can use either type.
+ */
+template <>
+struct zmm_vector<uint64_t> {
+    using type_t = uint64_t;
+    using zmm_t = __m512i;
+    using zmmi_t = __m512i;
+    using ymm_t = __m512i;
+    using opmask_t = __mmask8;
+    static const uint8_t numlanes = 8;
+
+    // Extreme values of the element type, and type_max() broadcast to all
+    // lanes.
+    static type_t type_max() { return X86_SIMD_SORT_MAX_UINT64; }
+    static type_t type_min() { return 0; }
+    static zmm_t zmm_max() { return _mm512_set1_epi64(type_max()); }
+
+    // Builds an index vector; v1 lands in lane 7, v8 in lane 0.
+    static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
+                       int v8) {
+        return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8);
+    }
+    // Gathers: `scale` is the byte multiplier applied to each 64-bit index.
+    template <int scale>
+    static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index,
+                                void const *base) {
+        return _mm512_mask_i64gather_epi64(src, mask, index, base, scale);
+    }
+    template <int scale>
+    static zmm_t i64gather(__m512i index, void const *base) {
+        return _mm512_i64gather_epi64(index, base, scale);
+    }
+    static opmask_t kxor_opmask(opmask_t x, opmask_t y) {
+        return _kxor_mask8(x, y);
+    }
+    static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); }
+    // Lane-wise unsigned comparisons returning a bit mask ("not less than"
+    // implements >=).
+    static opmask_t le(zmm_t x, zmm_t y) {
+        return _mm512_cmp_epu64_mask(x, y, _MM_CMPINT_LE);
+    }
+    static opmask_t ge(zmm_t x, zmm_t y) {
+        return _mm512_cmp_epu64_mask(x, y, _MM_CMPINT_NLT);
+    }
+    static opmask_t eq(zmm_t x, zmm_t y) {
+        return _mm512_cmp_epu64_mask(x, y, _MM_CMPINT_EQ);
+    }
+    static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); }
+    static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epu64(x, y); }
+    static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
+        return _mm512_mask_compressstoreu_epi64(mem, mask, x);
+    }
+    // Masked load: unselected lanes are zeroed (maskz) or taken from x (mask).
+    static zmm_t maskz_loadu(opmask_t mask, void const *mem) {
+        return _mm512_maskz_loadu_epi64(mask, mem);
+    }
+    static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
+        return _mm512_mask_loadu_epi64(x, mask, mem);
+    }
+    static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
+        return _mm512_mask_mov_epi64(x, mask, y);
+    }
+    static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
+        return _mm512_mask_storeu_epi64(mem, mask, x);
+    }
+    static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epu64(x, y); }
+    static zmm_t permutexvar(__m512i idx, zmm_t zmm) {
+        return _mm512_permutexvar_epi64(idx, zmm);
+    }
+    static type_t reducemax(zmm_t v) { return _mm512_reduce_max_epu64(v); }
+    static type_t reducemin(zmm_t v) { return _mm512_reduce_min_epu64(v); }
+    static zmm_t set1(type_t v) { return _mm512_set1_epi64(v); }
+    // The integer vector is reinterpreted as packed doubles so that the
+    // 64-bit-lane shuffle (_mm512_shuffle_pd) can be used, then cast back.
+    template <uint8_t mask>
+    static zmm_t shuffle(zmm_t zmm) {
+        __m512d temp = _mm512_castsi512_pd(zmm);
+        return _mm512_castpd_si512(
+            _mm512_shuffle_pd(temp, temp, (_MM_PERM_ENUM)mask));
+    }
+    static void storeu(void *mem, zmm_t x) { _mm512_storeu_si512(mem, x); }
+};
+/*
+ * Vector traits for eight doubles held in a 512-bit register.
+ * ymm_t is also a 512-bit type here: a gather of eight 64-bit elements fills
+ * a whole ZMM register. The member set mirrors the other 64-bit traits
+ * (including kxor_opmask and le) so generic code can use any of them.
+ */
+template <>
+struct zmm_vector<double> {
+    using type_t = double;
+    using zmm_t = __m512d;
+    using zmmi_t = __m512i;
+    using ymm_t = __m512d;
+    using opmask_t = __mmask8;
+    static const uint8_t numlanes = 8;
+
+    // Extreme values of the element type, and type_max() broadcast to all
+    // lanes.
+    static type_t type_max() { return X86_SIMD_SORT_INFINITY; }
+    static type_t type_min() { return -X86_SIMD_SORT_INFINITY; }
+    static zmm_t zmm_max() { return _mm512_set1_pd(type_max()); }
+
+    // Builds an index vector; v1 lands in lane 7, v8 in lane 0.
+    static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
+                       int v8) {
+        return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8);
+    }
+
+    // Masked load: unselected lanes are zeroed.
+    static zmm_t maskz_loadu(opmask_t mask, void const *mem) {
+        return _mm512_maskz_loadu_pd(mask, mem);
+    }
+    static opmask_t kxor_opmask(opmask_t x, opmask_t y) {
+        return _kxor_mask8(x, y);
+    }
+    static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); }
+    // Lane-wise comparisons returning a bit mask. The *_OQ predicates are
+    // ordered, so a lane containing NaN never sets its bit.
+    static opmask_t le(zmm_t x, zmm_t y) {
+        return _mm512_cmp_pd_mask(x, y, _CMP_LE_OQ);
+    }
+    static opmask_t ge(zmm_t x, zmm_t y) {
+        return _mm512_cmp_pd_mask(x, y, _CMP_GE_OQ);
+    }
+    static opmask_t eq(zmm_t x, zmm_t y) {
+        return _mm512_cmp_pd_mask(x, y, _CMP_EQ_OQ);
+    }
+    template <int type>
+    static opmask_t fpclass(zmm_t x) {
+        return _mm512_fpclass_pd_mask(x, type);
+    }
+    // Gathers: `scale` is the byte multiplier applied to each 64-bit index.
+    template <int scale>
+    static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index,
+                                void const *base) {
+        return _mm512_mask_i64gather_pd(src, mask, index, base, scale);
+    }
+    template <int scale>
+    static zmm_t i64gather(__m512i index, void const *base) {
+        return _mm512_i64gather_pd(index, base, scale);
+    }
+    static zmm_t loadu(void const *mem) { return _mm512_loadu_pd(mem); }
+    static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_pd(x, y); }
+    static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
+        return _mm512_mask_compressstoreu_pd(mem, mask, x);
+    }
+    // Masked load: unselected lanes are taken from x.
+    static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
+        return _mm512_mask_loadu_pd(x, mask, mem);
+    }
+    static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
+        return _mm512_mask_mov_pd(x, mask, y);
+    }
+    static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
+        return _mm512_mask_storeu_pd(mem, mask, x);
+    }
+    static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_pd(x, y); }
+    static zmm_t permutexvar(__m512i idx, zmm_t zmm) {
+        return _mm512_permutexvar_pd(idx, zmm);
+    }
+    static type_t reducemax(zmm_t v) { return _mm512_reduce_max_pd(v); }
+    static type_t reducemin(zmm_t v) { return _mm512_reduce_min_pd(v); }
+    static zmm_t set1(type_t v) { return _mm512_set1_pd(v); }
+    // Shuffles 64-bit lanes; both shuffle operands are the same register.
+    template <uint8_t mask>
+    static zmm_t shuffle(zmm_t zmm) {
+        return _mm512_shuffle_pd(zmm, zmm, (_MM_PERM_ENUM)mask);
+    }
+    static void storeu(void *mem, zmm_t x) { _mm512_storeu_pd(mem, x); }
+};
+/*
+ * Walks arr[0, arrsize) eight doubles at a time, overwrites every NaN with
+ * the ZMM_MAX_DOUBLE fill value (presumably +infinity, going by the function
+ * name) and returns the number of elements that were overwritten.
+ */
+X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(double *arr,
+                                                  int64_t arrsize) {
+    int64_t replaced = 0;
+    for (int64_t remaining = arrsize; remaining > 0; remaining -= 8) {
+        // All 8 lanes are active except on a short final chunk, where only
+        // the low `remaining` lanes may be read or written.
+        __mmask8 lanes = 0xFF;
+        if (remaining < 8) {
+            lanes = (0x01 << remaining) - 0x01;
+        }
+        const __m512d chunk = _mm512_maskz_loadu_pd(lanes, arr);
+        // A lane holds NaN exactly when it compares unequal to itself.
+        const __mmask8 is_nan = _mm512_cmp_pd_mask(chunk, chunk, _CMP_NEQ_UQ);
+        replaced += _mm_popcnt_u32((int32_t)is_nan);
+        _mm512_mask_storeu_pd(arr, is_nan, ZMM_MAX_DOUBLE);
+        arr += 8;
+    }
+    return replaced;
+}
+
+/*
+ * Writes NaN (std::nan("1")) into the last nan_count slots of
+ * arr[0, arrsize), walking backwards from the final element. Intended as the
+ * counterpart of replace_nan_with_inf (presumably called once the array has
+ * been sorted - confirm against the caller).
+ *
+ * At most arrsize elements are written: if nan_count is larger than arrsize
+ * the loop stops at index 0 instead of writing before the start of arr.
+ */
+X86_SIMD_SORT_INLINE void replace_inf_with_nan(double *arr, int64_t arrsize,
+                                               int64_t nan_count) {
+    for (int64_t ii = arrsize - 1; ii >= 0 && nan_count > 0; --ii) {
+        arr[ii] = std::nan("1");
+        nan_count -= 1;
+    }
+}
+/*
+ * Sorts the lanes of an arbitrary (unsorted) zmm register with the bitonic
+ * sorting network pictured at
+ * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
+ *
+ * The network has six steps. Each step builds a "partner" vector by
+ * shuffling or permuting the current value and passes both vectors, plus a
+ * lane mask, to cmp_merge.
+ */
+template <typename vtype, typename zmm_t = typename vtype::zmm_t>
+X86_SIMD_SORT_INLINE zmm_t sort_zmm_64bit(zmm_t zmm) {
+    using index_t = typename vtype::zmmi_t;
+    // Permutation indices used by steps 2, 4 and 5 respectively.
+    const index_t perm_step2 = vtype::seti(NETWORK_64BIT_1);
+    const index_t perm_reverse = vtype::seti(NETWORK_64BIT_2);
+    const index_t perm_step5 = vtype::seti(NETWORK_64BIT_3);
+
+    zmm_t partner = vtype::template shuffle<SHUFFLE_MASK(1, 1, 1, 1)>(zmm);
+    zmm = cmp_merge<vtype>(zmm, partner, 0xAA);
+
+    partner = vtype::permutexvar(perm_step2, zmm);
+    zmm = cmp_merge<vtype>(zmm, partner, 0xCC);
+
+    partner = vtype::template shuffle<SHUFFLE_MASK(1, 1, 1, 1)>(zmm);
+    zmm = cmp_merge<vtype>(zmm, partner, 0xAA);
+
+    partner = vtype::permutexvar(perm_reverse, zmm);
+    zmm = cmp_merge<vtype>(zmm, partner, 0xF0);
+
+    partner = vtype::permutexvar(perm_step5, zmm);
+    zmm = cmp_merge<vtype>(zmm, partner, 0xCC);
+
+    partner = vtype::template shuffle<SHUFFLE_MASK(1, 1, 1, 1)>(zmm);
+    zmm = cmp_merge<vtype>(zmm, partner, 0xAA);
+    return zmm;
+}
+
+/*
+ * Chooses a partition pivot for arr[left, right]: gathers 8 elements at the
+ * evenly spaced indices left + k * ((right - left) / 8) for k = 1..8, sorts
+ * them inside one register and returns lane 4 of the sorted vector
+ * ("median of 8").
+ */
+template <typename vtype, typename type_t>
+X86_SIMD_SORT_INLINE type_t get_pivot_64bit(type_t *arr, const int64_t left,
+                                            const int64_t right) {
+    using zmm_t = typename vtype::zmm_t;
+    // Distance between two consecutive sample positions.
+    const int64_t step = (right - left) / 8;
+    const __m512i sample_idx = _mm512_set_epi64(
+        left + step, left + 2 * step, left + 3 * step, left + 4 * step,
+        left + 5 * step, left + 6 * step, left + 7 * step, left + 8 * step);
+    const zmm_t samples =
+        vtype::template i64gather<sizeof(type_t)>(sample_idx, arr);
+    // Original author's note: the pivot is never NaN because the input is
+    // expected to contain no NaNs at this point.
+    zmm_t sorted = sort_zmm_64bit<vtype>(samples);
+    const type_t *sorted_lanes = (const type_t *)&sorted;
+    return sorted_lanes[4];
+}
+
+#endif
diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp
new file mode 100644
index 0000000000000..893f2ce8363c8
--- /dev/null
+++ b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp
@@ -0,0 +1,834 @@
+/*
+ * Copyright (c) 2023 Intel Corporation. All rights reserved.
+ * Intel x86-simd-sort source code.
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef AVX512_QSORT_64BIT
+#define AVX512_QSORT_64BIT
+
+#include "avx512-64bit-common.h"
+
+// Assumes zmm is bitonic and applies the three compare-exchange steps of a
+// recursive half cleaner to it, returning the result.
+template <typename vtype, typename zmm_t = typename vtype::zmm_t>
+X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm) {
+    // Step 1 - half_cleaner[8]: compare 0-4, 1-5, 2-6, 3-7
+    zmm_t partner =
+        vtype::permutexvar(_mm512_set_epi64(NETWORK_64BIT_4), zmm);
+    zmm = cmp_merge<vtype>(zmm, partner, 0xF0);
+    // Step 2 - half_cleaner[4]
+    partner = vtype::permutexvar(_mm512_set_epi64(NETWORK_64BIT_3), zmm);
+    zmm = cmp_merge<vtype>(zmm, partner, 0xCC);
+    // Step 3 - final half-cleaner step, partner built with an in-register
+    // shuffle instead of a permute
+    partner = vtype::template shuffle<SHUFFLE_MASK(1, 1, 1, 1)>(zmm);
+    return cmp_merge<vtype>(zmm, partner, 0xAA);
+}
+// Assumes zmm1 and zmm2 are each sorted. Merges them in place: on return
+// zmm1 holds the result built from the lane-wise minima and zmm2 the result
+// built from the lane-wise maxima, each passed through the half cleaner.
+template <typename vtype, typename zmm_t = typename vtype::zmm_t>
+X86_SIMD_SORT_INLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1,
+                                                      zmm_t &zmm2) {
+    const __m512i reverse = _mm512_set_epi64(NETWORK_64BIT_2);
+    // First step of the merging network: compare zmm1 with zmm2 reversed.
+    zmm2 = vtype::permutexvar(reverse, zmm2);
+    const zmm_t lower = vtype::min(zmm1, zmm2);
+    const zmm_t upper = vtype::max(zmm1, zmm2);
+    // Recursive half cleaner on each half.
+    zmm1 = bitonic_merge_zmm_64bit<vtype>(lower);
+    zmm2 = bitonic_merge_zmm_64bit<vtype>(upper);
+}
+// Assumes [zmm[0], zmm[1]] and [zmm[2], zmm[3]] are each sorted and merges
+// the four registers in place with a recursive half cleaner.
+template <typename vtype, typename zmm_t = typename vtype::zmm_t>
+X86_SIMD_SORT_INLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm) {
+    const __m512i reverse = _mm512_set_epi64(NETWORK_64BIT_2);
+    zmm_t half[4];
+    // 1) First step of the merging network: register `lo` is compared with
+    //    the lane-reversed register `3 - lo`. Minima stay in slot `lo`; the
+    //    maxima are reversed again and stored in slot `3 - lo`.
+    for (int lo = 0; lo < 2; ++lo) {
+        const int hi = 3 - lo;
+        const zmm_t mirrored = vtype::permutexvar(reverse, zmm[hi]);
+        half[lo] = vtype::min(zmm[lo], mirrored);
+        half[hi] = vtype::permutexvar(reverse, vtype::max(zmm[lo], mirrored));
+    }
+    // 2) Half cleaner across each register pair, then inside each register.
+    //    `half` is complete at this point, so overwriting zmm[] is safe.
+    for (int base = 0; base < 4; base += 2) {
+        const zmm_t smaller = vtype::min(half[base], half[base + 1]);
+        const zmm_t larger = vtype::max(half[base], half[base + 1]);
+        zmm[base] = bitonic_merge_zmm_64bit<vtype>(smaller);
+        zmm[base + 1] = bitonic_merge_zmm_64bit<vtype>(larger);
+    }
+}
+// Merges eight registers in place. The first step compares zmm[i] with the
+// lane-reversed zmm[7 - i], which presumes zmm[0..3] and zmm[4..7] are each
+// already sorted (that is how the sort_*_64bit callers use it).
+template <typename vtype, typename zmm_t = typename vtype::zmm_t>
+X86_SIMD_SORT_INLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm) {
+    const int kRegs = 8;
+    const __m512i reverse = _mm512_set_epi64(NETWORK_64BIT_2);
+    zmm_t stage[kRegs];
+    // First merge step: minima go to slot `lo`, re-reversed maxima to the
+    // mirrored slot `hi`.
+    for (int lo = 0; lo < kRegs / 2; ++lo) {
+        const int hi = kRegs - 1 - lo;
+        const zmm_t mirrored = vtype::permutexvar(reverse, zmm[hi]);
+        stage[lo] = vtype::min(zmm[lo], mirrored);
+        stage[hi] = vtype::permutexvar(reverse, vtype::max(zmm[lo], mirrored));
+    }
+    // Half cleaner across registers: compare-exchange at distance 2, then 1,
+    // independently inside each half. All loop bounds are compile-time
+    // constants.
+    for (int gap = kRegs / 4; gap >= 1; gap /= 2) {
+        for (int base = 0; base < kRegs; base += 2 * gap) {
+            for (int k = base; k < base + gap; ++k) {
+                COEX<vtype>(stage[k], stage[k + gap]);
+            }
+        }
+    }
+    // Finish with the in-register half cleaner on every register.
+    for (int r = 0; r < kRegs; ++r) {
+        zmm[r] = bitonic_merge_zmm_64bit<vtype>(stage[r]);
+    }
+}
+// Merges sixteen registers in place. The first step compares zmm[i] with the
+// lane-reversed zmm[15 - i], which presumes zmm[0..7] and zmm[8..15] are
+// each already sorted (that is how sort_128_64bit uses it).
+template <typename vtype, typename zmm_t = typename vtype::zmm_t>
+X86_SIMD_SORT_INLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm) {
+    const int kRegs = 16;
+    const __m512i reverse = _mm512_set_epi64(NETWORK_64BIT_2);
+    zmm_t stage[kRegs];
+    // First merge step: minima go to slot `lo`, re-reversed maxima to the
+    // mirrored slot `hi`.
+    for (int lo = 0; lo < kRegs / 2; ++lo) {
+        const int hi = kRegs - 1 - lo;
+        const zmm_t mirrored = vtype::permutexvar(reverse, zmm[hi]);
+        stage[lo] = vtype::min(zmm[lo], mirrored);
+        stage[hi] = vtype::permutexvar(reverse, vtype::max(zmm[lo], mirrored));
+    }
+    // Recursive half cleaner over the 16 registers: compare-exchange at
+    // distance 4, then 2, then 1, independently inside each half. All loop
+    // bounds are compile-time constants.
+    for (int gap = kRegs / 4; gap >= 1; gap /= 2) {
+        for (int base = 0; base < kRegs; base += 2 * gap) {
+            for (int k = base; k < base + gap; ++k) {
+                COEX<vtype>(stage[k], stage[k + gap]);
+            }
+        }
+    }
+    // Finish with the in-register half cleaner on every register.
+    for (int r = 0; r < kRegs; ++r) {
+        zmm[r] = bitonic_merge_zmm_64bit<vtype>(stage[r]);
+    }
+}
+
+// Merges thirty-two registers in place. The first step compares zmm[i] with
+// the lane-reversed zmm[31 - i], which presumes zmm[0..15] and zmm[16..31]
+// are each already sorted.
+template <typename vtype, typename zmm_t = typename vtype::zmm_t>
+X86_SIMD_SORT_INLINE void bitonic_merge_32_zmm_64bit(zmm_t *zmm) {
+    const int kRegs = 32;
+    const __m512i reverse = _mm512_set_epi64(NETWORK_64BIT_2);
+    zmm_t stage[kRegs];
+    // First merge step: minima go to slot `lo`, re-reversed maxima to the
+    // mirrored slot `hi`.
+    for (int lo = 0; lo < kRegs / 2; ++lo) {
+        const int hi = kRegs - 1 - lo;
+        const zmm_t mirrored = vtype::permutexvar(reverse, zmm[hi]);
+        stage[lo] = vtype::min(zmm[lo], mirrored);
+        stage[hi] = vtype::permutexvar(reverse, vtype::max(zmm[lo], mirrored));
+    }
+    // Recursive half cleaner over the 32 registers: compare-exchange at
+    // distance 8, then 4, 2 and 1, independently inside each half. All loop
+    // bounds are compile-time constants.
+    for (int gap = kRegs / 4; gap >= 1; gap /= 2) {
+        for (int base = 0; base < kRegs; base += 2 * gap) {
+            for (int k = base; k < base + gap; ++k) {
+                COEX<vtype>(stage[k], stage[k + gap]);
+            }
+        }
+    }
+    // Finish with the in-register half cleaner on every register.
+    for (int r = 0; r < kRegs; ++r) {
+        zmm[r] = bitonic_merge_zmm_64bit<vtype>(stage[r]);
+    }
+}
+
+// Sorts arr[0, N) in place using a single register. Lanes at index N and
+// above are filled with vtype::zmm_max() on load and are not written back.
+// The mask arithmetic covers N up to 8 (sort_16_64bit dispatches here only
+// for N <= 8).
+template <typename vtype, typename type_t>
+X86_SIMD_SORT_INLINE void sort_8_64bit(type_t *arr, int32_t N) {
+    using zmm_t = typename vtype::zmm_t;
+    using opmask_t = typename vtype::opmask_t;
+    // Low N bits set: one bit per element that really exists.
+    const opmask_t active = (0x01 << N) - 0x01;
+    const zmm_t loaded = vtype::mask_loadu(vtype::zmm_max(), active, arr);
+    const zmm_t sorted = sort_zmm_64bit<vtype>(loaded);
+    vtype::mask_storeu(arr, active, sorted);
+}
+
+// Sorts arr[0, N) in place for N up to 16. N <= 8 is delegated to
+// sort_8_64bit; otherwise the first 8 elements fill one register, the
+// remaining N - 8 a second one (padded with vtype::zmm_max()), both are
+// sorted and then merged.
+template <typename vtype, typename type_t>
+X86_SIMD_SORT_INLINE void sort_16_64bit(type_t *arr, int32_t N) {
+    if (N <= 8) {
+        sort_8_64bit<vtype>(arr, N);
+        return;
+    }
+    using zmm_t = typename vtype::zmm_t;
+    using opmask_t = typename vtype::opmask_t;
+    // Low (N - 8) bits set: the valid lanes of the second register.
+    const opmask_t tail_mask = (0x01 << (N - 8)) - 0x01;
+    zmm_t lower = sort_zmm_64bit<vtype>(vtype::loadu(arr));
+    zmm_t upper = sort_zmm_64bit<vtype>(
+        vtype::mask_loadu(vtype::zmm_max(), tail_mask, arr + 8));
+    bitonic_merge_two_zmm_64bit<vtype>(lower, upper);
+    vtype::storeu(arr, lower);
+    // Only the lanes that were really loaded are written back.
+    vtype::mask_storeu(arr + 8, tail_mask, upper);
+}
+
+// Sorts arr[0, N) in place for N up to 32. N <= 16 is delegated to
+// sort_16_64bit; otherwise arr[0..15] fills two full registers and the
+// remaining N - 16 elements fill up to two more, padded with
+// vtype::zmm_max().
+template <typename vtype, typename type_t>
+X86_SIMD_SORT_INLINE void sort_32_64bit(type_t *arr, int32_t N) {
+    if (N <= 16) {
+        sort_16_64bit<vtype>(arr, N);
+        return;
+    }
+    using zmm_t = typename vtype::zmm_t;
+    using opmask_t = typename vtype::opmask_t;
+    const int kFull = 2;  // registers that are always completely populated
+    const int kTail = 2;  // registers that may be only partly populated
+    zmm_t regs[kFull + kTail];
+    opmask_t tail_mask[kTail];
+    for (int r = 0; r < kFull; ++r) {
+        regs[r] = sort_zmm_64bit<vtype>(vtype::loadu(arr + 8 * r));
+    }
+    // One bit per element beyond the first 16; sliced into one byte-wide
+    // lane mask per tail register.
+    const uint64_t tail_bits = (0x1ull << (N - 16)) - 0x1ull;
+    for (int t = 0; t < kTail; ++t) {
+        tail_mask[t] = (tail_bits >> (8 * t)) & 0xFF;
+        regs[kFull + t] = sort_zmm_64bit<vtype>(vtype::mask_loadu(
+            vtype::zmm_max(), tail_mask[t], arr + 8 * (kFull + t)));
+    }
+    bitonic_merge_two_zmm_64bit<vtype>(regs[0], regs[1]);
+    bitonic_merge_two_zmm_64bit<vtype>(regs[2], regs[3]);
+    bitonic_merge_four_zmm_64bit<vtype>(regs);
+    for (int r = 0; r < kFull; ++r) {
+        vtype::storeu(arr + 8 * r, regs[r]);
+    }
+    // Only the lanes that were really loaded are written back.
+    for (int t = 0; t < kTail; ++t) {
+        vtype::mask_storeu(arr + 8 * (kFull + t), tail_mask[t],
+                           regs[kFull + t]);
+    }
+}
+
+// Sorts arr[0, N) in place for N up to 64. N <= 32 is delegated to
+// sort_32_64bit; otherwise arr[0..31] fills four full registers and the
+// remaining N - 32 elements fill up to four more, padded with
+// vtype::zmm_max().
+template <typename vtype, typename type_t>
+X86_SIMD_SORT_INLINE void sort_64_64bit(type_t *arr, int32_t N) {
+    if (N <= 32) {
+        sort_32_64bit<vtype>(arr, N);
+        return;
+    }
+    using zmm_t = typename vtype::zmm_t;
+    using opmask_t = typename vtype::opmask_t;
+    const int kFull = 4;  // registers that are always completely populated
+    const int kTail = 4;  // registers that may be only partly populated
+    const int kRegs = kFull + kTail;
+    zmm_t regs[kRegs];
+    opmask_t tail_mask[kTail];
+    for (int r = 0; r < kFull; ++r) {
+        regs[r] = sort_zmm_64bit<vtype>(vtype::loadu(arr + 8 * r));
+    }
+    // N - 32 >= 1 here. One bit per element beyond the first 32, sliced
+    // into one byte-wide lane mask per tail register.
+    const uint64_t tail_bits = (0x1ull << (N - 32)) - 0x1ull;
+    for (int t = 0; t < kTail; ++t) {
+        tail_mask[t] = (tail_bits >> (8 * t)) & 0xFF;
+        regs[kFull + t] = sort_zmm_64bit<vtype>(vtype::mask_loadu(
+            vtype::zmm_max(), tail_mask[t], arr + 8 * (kFull + t)));
+    }
+    // Merge sorted runs of 1, then 2, then 4 registers.
+    for (int r = 0; r < kRegs; r += 2) {
+        bitonic_merge_two_zmm_64bit<vtype>(regs[r], regs[r + 1]);
+    }
+    for (int r = 0; r < kRegs; r += 4) {
+        bitonic_merge_four_zmm_64bit<vtype>(regs + r);
+    }
+    bitonic_merge_eight_zmm_64bit<vtype>(regs);
+    for (int r = 0; r < kFull; ++r) {
+        vtype::storeu(arr + 8 * r, regs[r]);
+    }
+    // Only the lanes that were really loaded are written back.
+    for (int t = 0; t < kTail; ++t) {
+        vtype::mask_storeu(arr + 8 * (kFull + t), tail_mask[t],
+                           regs[kFull + t]);
+    }
+}
+
+// Sorts arr[0, N) in place for N up to 128. N <= 64 is delegated to
+// sort_64_64bit; otherwise arr[0..63] fills eight full registers and the
+// remaining N - 64 elements fill up to eight more, padded with
+// vtype::zmm_max().
+template <typename vtype, typename type_t>
+X86_SIMD_SORT_INLINE void sort_128_64bit(type_t *arr, int32_t N) {
+    if (N <= 64) {
+        sort_64_64bit<vtype>(arr, N);
+        return;
+    }
+    using zmm_t = typename vtype::zmm_t;
+    using opmask_t = typename vtype::opmask_t;
+    const int kFull = 8;  // registers that are always completely populated
+    const int kTail = 8;  // registers that may be only partly populated
+    const int kRegs = kFull + kTail;
+    zmm_t regs[kRegs];
+    opmask_t tail_mask[kTail];
+    for (int r = 0; r < kFull; ++r) {
+        regs[r] = sort_zmm_64bit<vtype>(vtype::loadu(arr + 8 * r));
+    }
+    // One bit per element beyond the first 64. N == 128 is special-cased to
+    // "all bits set" because it would otherwise need a 64-bit shift by 64.
+    const uint64_t tail_bits =
+        (N != 128) ? ((0x1ull << (N - 64)) - 0x1ull) : ~0x0ull;
+    for (int t = 0; t < kTail; ++t) {
+        tail_mask[t] = (tail_bits >> (8 * t)) & 0xFF;
+        regs[kFull + t] = sort_zmm_64bit<vtype>(vtype::mask_loadu(
+            vtype::zmm_max(), tail_mask[t], arr + 8 * (kFull + t)));
+    }
+    // Merge sorted runs of 1, then 2, 4 and 8 registers.
+    for (int r = 0; r < kRegs; r += 2) {
+        bitonic_merge_two_zmm_64bit<vtype>(regs[r], regs[r + 1]);
+    }
+    for (int r = 0; r < kRegs; r += 4) {
+        bitonic_merge_four_zmm_64bit<vtype>(regs + r);
+    }
+    for (int r = 0; r < kRegs; r += 8) {
+        bitonic_merge_eight_zmm_64bit<vtype>(regs + r);
+    }
+    bitonic_merge_sixteen_zmm_64bit<vtype>(regs);
+    for (int r = 0; r < kFull; ++r) {
+        vtype::storeu(arr + 8 * r, regs[r]);
+    }
+    // Only the lanes that were really loaded are written back.
+    for (int t = 0; t < kTail; ++t) {
+        vtype::mask_storeu(arr + 8 * (kFull + t), tail_mask[t],
+                           regs[kFull + t]);
+    }
+}
+
+template <typename vtype, typename type_t>
+X86_SIMD_SORT_INLINE void sort_256_64bit(type_t *arr, int32_t N) {
+    if (N <= 128) {
+        sort_128_64bit<vtype>(arr, N);
+        return;
+    }
+    using zmm_t = typename vtype::zmm_t;
+    using opmask_t = typename vtype::opmask_t;
+    zmm_t zmm[32];
+    zmm[0] = vtype::loadu(arr);
+    zmm[1] = vtype::loadu(arr + 8);
+    zmm[2] = vtype::loadu(arr + 16);
+    zmm[3] = vtype::loadu(arr + 24);
+    zmm[4] = vtype::loadu(arr + 32);
+    zmm[5] = vtype::loadu(arr + 40);
+    zmm[6] = vtype::loadu(arr + 48);
+    zmm[7] = vtype::loadu(arr + 56);
+    zmm[8] = vtype::loadu(arr + 64);
+    zmm[9] = vtype::loadu(arr + 72);
+    zmm[10] = vtype::loadu(arr + 80);
+    zmm[11] = vtype::loadu(arr + 88);
+    zmm[12] = vtype::loadu(arr + 96);
+    zmm[13] = vtype::loadu(arr + 104);
+    zmm[14] = vtype::loadu(arr + 112);
+    zmm[15] = vtype::loadu(arr + 120);
+    zmm[0] = sort_zmm_64bit<vtype>(zmm[0]);
+    zmm[1] = sort_zmm_64bit<vtype>(zmm[1]);
+    zmm[2] = sort_zmm_64bit<vtype>(zmm[2]);
+    zmm[3] = sort_zmm_64bit<vtype>(zmm[3]);
+    zmm[4] = sort_zmm_64bit<vtype>(zmm[4]);
+    zmm[5] = sort_zmm_64bit<vtype>(zmm[5]);
+    zmm[6] = sort_zmm_64bit<vtype>(zmm[6]);
+    zmm[7] = sort_zmm_64bit<vtype>(zmm[7]);
+    zmm[8] = sort_zmm_64bit<vtype>(zmm[8]);
+    zmm[9] = sort_zmm_64bit<vtype>(zmm[9]);
+    zmm[10] = sort_zmm_64bit<vtype>(zmm[10]);
+    zmm[11] = sort_zmm_64bit<vtype>(zmm[11]);
+    zmm[12] = sort_zmm_64bit<vtype>(zmm[12]);
+    zmm[13] = sort_zmm_64bit<vtype>(zmm[13]);
+    zmm[14] = sort_zmm_64bit<vtype>(zmm[14]);
+    zmm[15] = sort_zmm_64bit<vtype>(zmm[15]);
+    opmask_t load_mask1 = 0xFF, load_mask2 = 0xFF;
+    opmask_t load_mask3 = 0xFF, load_mask4 = 0xFF;
+    opmask_t load_mask5 = 0xFF, load_mask6 = 0xFF;
+    opmask_t load_mask7 = 0xFF, load_mask8 = 0xFF;
+    opmask_t load_mask9 = 0xFF, load_mask10 = 0xFF;
+    opmask_t load_mask11 = 0xFF, load_mask12 = 0xFF;
+    opmask_t load_mask13 = 0xFF, load_mask14 = 0xFF;
+    opmask_t load_mask15 = 0xFF, load_mask16 = 0xFF;
+    if (N != 256) {
+        uint64_t combined_mask;
+        if (N < 192) {
+            combined_mask = (0x1ull << (N - 128)) - 0x1ull;
+            load_mask1 = (combined_mask)&0xFF;
+            load_mask2 = (combined_mask >> 8) & 0xFF;
+            load_mask3 = (combined_mask >> 16) & 0xFF;
+            load_mask4 = (combined_mask >> 24) & 0xFF;
+            load_mask5 = (combined_mask >> 32) & 0xFF;
+            load_mask6 = (combined_mask >> 40) & 0xFF;
+            load_mask7 = (combined_mask >> 48) & 0xFF;
+            load_mask8 = (combined_mask >> 56) & 0xFF;
+            load_mask9 = 0x00;
+            load_mask10 = 0x0;
+            load_mask11 = 0x00;
+            load_mask12 = 0x00;
+            load_mask13 = 0x00;
+            load_mask14 = 0x00;
+            load_mask15 = 0x00;
+            load_mask16 = 0x00;
+        } else {
+            combined_mask = (0x1ull << (N - 192)) - 0x1ull;
+            load_mask9 = (combined_mask)&0xFF;
+            load_mask10 = (combined_mask >> 8) & 0xFF;
+            load_mask11 = (combined_mask >> 16) & 0xFF;
+            load_mask12 = (combined_mask >> 24) & 0xFF;
+            load_mask13 = (combined_mask >> 32) & 0xFF;
+            load_mask14 = (combined_mask >> 40) & 0xFF;
+            load_mask15 = (combined_mask >> 48) & 0xFF;
+            load_mask16 = (combined_mask >> 56) & 0xFF;
+        }
+    }
+    zmm[16] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 128);
+    zmm[17] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 136);
+    zmm[18] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 144);
+    zmm[19] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 152);
+    zmm[20] = vtype::mask_loadu(vtype::zmm_max(), load_mask5, arr + 160);
+    zmm[21] = vtype::mask_loadu(vtype::zmm_max(), load_mask6, arr + 168);
+    zmm[22] = vtype::mask_loadu(vtype::zmm_max(), load_mask7, arr + 176);
+    zmm[23] = vtype::mask_loadu(vtype::zmm_max(), load_mask8, arr + 184);
+    if (N < 192) {
+        zmm[24] = vtype::zmm_max();
+        zmm[25] = vtype::zmm_max();
+        zmm[26] = vtype::zmm_max();
+        zmm[27] = vtype::zmm_max();
+        zmm[28] = vtype::zmm_max();
+        zmm[29] = vtype::zmm_max();
+        zmm[30] = vtype::zmm_max();
+        zmm[31] = vtype::zmm_max();
+    } else {
+        zmm[24] = vtype::mask_loadu(vtype::zmm_max(), load_mask9, arr + 192);
+        zmm[25] = vtype::mask_loadu(vtype::zmm_max(), load_mask10, arr + 200);
+        zmm[26] = vtype::mask_loadu(vtype::zmm_max(), load_mask11, arr + 208);
+        zmm[27] = vtype::mask_loadu(vtype::zmm_max(), load_mask12, arr + 216);
+        zmm[28] = vtype::mask_loadu(vtype::zmm_max(), load_mask13, arr + 224);
+        zmm[29] = vtype::mask_loadu(vtype::zmm_max(), load_mask14, arr + 232);
+        zmm[30] = vtype::mask_loadu(vtype::zmm_max(), load_mask15, arr + 240);
+        zmm[31] = vtype::mask_loadu(vtype::zmm_max(), load_mask16, arr + 248);
+    }
+    zmm[16] = sort_zmm_64bit<vtype>(zmm[16]);
+    zmm[17] = sort_zmm_64bit<vtype>(zmm[17]);
+    zmm[18] = sort_zmm_64bit<vtype>(zmm[18]);
+    zmm[19] = sort_zmm_64bit<vtype>(zmm[19]);
+    zmm[20] = sort_zmm_64bit<vtype>(zmm[20]);
+    zmm[21] = sort_zmm_64bit<vtype>(zmm[21]);
+    zmm[22] = sort_zmm_64bit<vtype>(zmm[22]);
+    zmm[23] = sort_zmm_64bit<vtype>(zmm[23]);
+    zmm[24] = sort_zmm_64bit<vtype>(zmm[24]);
+    zmm[25] = sort_zmm_64bit<vtype>(zmm[25]);
+    zmm[26] = sort_zmm_64bit<vtype>(zmm[26]);
+    zmm[27] = sort_zmm_64bit<vtype>(zmm[27]);
+    zmm[28] = sort_zmm_64bit<vtype>(zmm[28]);
+    zmm[29] = sort_zmm_64bit<vtype>(zmm[29]);
+    zmm[30] = sort_zmm_64bit<vtype>(zmm[30]);
+    zmm[31] = sort_zmm_64bit<vtype>(zmm[31]);
+    bitonic_merge_two_zmm_64bit<vtype>(zmm[0], zmm[1]);
+    bitonic_merge_two_zmm_64bit<vtype>(zmm[2], zmm[3]);
+    bitonic_merge_two_zmm_64bit<vtype>(zmm[4], zmm[5]);
+    bitonic_merge_two_zmm_64bit<vtype>(zmm[6], zmm[7]);
+    bitonic_merge_two_zmm_64bit<vtype>(zmm[8], zmm[9]);
+    bitonic_merge_two_zmm_64bit<vtype>(zmm[10], zmm[11]);
+    bitonic_merge_two_zmm_64bit<vtype>(zmm[12], zmm[13]);
+    bitonic_merge_two_zmm_64bit<vtype>(zmm[14], zmm[15]);
+    bitonic_merge_two_zmm_64bit<vtype>(zmm[16], zmm[17]);
+    bitonic_merge_two_zmm_64bit<vtype>(zmm[18], zmm[19]);
+    bitonic_merge_two_zmm_64bit<vtype>(zmm[20], zmm[21]);
+    bitonic_merge_two_zmm_64bit<vtype>(zmm[22], zmm[23]);
+    bitonic_merge_two_zmm_64bit<vtype>(zmm[24], zmm[25]);
+    bitonic_merge_two_zmm_64bit<vtype>(zmm[26], zmm[27]);
+    bitonic_merge_two_zmm_64bit<vtype>(zmm[28], zmm[29]);
+    bitonic_merge_two_zmm_64bit<vtype>(zmm[30], zmm[31]);
+    bitonic_merge_four_zmm_64bit<vtype>(zmm);
+    bitonic_merge_four_zmm_64bit<vtype>(zmm + 4);
+    bitonic_merge_four_zmm_64bit<vtype>(zmm + 8);
+    bitonic_merge_four_zmm_64bit<vtype>(zmm + 12);
+    bitonic_merge_four_zmm_64bit<vtype>(zmm + 16);
+    bitonic_merge_four_zmm_64bit<vtype>(zmm + 20);
+    bitonic_merge_four_zmm_64bit<vtype>(zmm + 24);
+    bitonic_merge_four_zmm_64bit<vtype>(zmm + 28);
+    bitonic_merge_eight_zmm_64bit<vtype>(zmm);
+    bitonic_merge_eight_zmm_64bit<vtype>(zmm + 8);
+    bitonic_merge_eight_zmm_64bit<vtype>(zmm + 16);
+    bitonic_merge_eight_zmm_64bit<vtype>(zmm + 24);
+    bitonic_merge_sixteen_zmm_64bit<vtype>(zmm);
+    bitonic_merge_sixteen_zmm_64bit<vtype>(zmm + 16);
+    bitonic_merge_32_zmm_64bit<vtype>(zmm);
+    vtype::storeu(arr, zmm[0]);
+    vtype::storeu(arr + 8, zmm[1]);
+    vtype::storeu(arr + 16, zmm[2]);
+    vtype::storeu(arr + 24, zmm[3]);
+    vtype::storeu(arr + 32, zmm[4]);
+    vtype::storeu(arr + 40, zmm[5]);
+    vtype::storeu(arr + 48, zmm[6]);
+    vtype::storeu(arr + 56, zmm[7]);
+    vtype::storeu(arr + 64, zmm[8]);
+    vtype::storeu(arr + 72, zmm[9]);
+    vtype::storeu(arr + 80, zmm[10]);
+    vtype::storeu(arr + 88, zmm[11]);
+    vtype::storeu(arr + 96, zmm[12]);
+    vtype::storeu(arr + 104, zmm[13]);
+    vtype::storeu(arr + 112, zmm[14]);
+    vtype::storeu(arr + 120, zmm[15]);
+    vtype::mask_storeu(arr + 128, load_mask1, zmm[16]);
+    vtype::mask_storeu(arr + 136, load_mask2, zmm[17]);
+    vtype::mask_storeu(arr + 144, load_mask3, zmm[18]);
+    vtype::mask_storeu(arr + 152, load_mask4, zmm[19]);
+    vtype::mask_storeu(arr + 160, load_mask5, zmm[20]);
+    vtype::mask_storeu(arr + 168, load_mask6, zmm[21]);
+    vtype::mask_storeu(arr + 176, load_mask7, zmm[22]);
+    vtype::mask_storeu(arr + 184, load_mask8, zmm[23]);
+    if (N > 192) {
+        vtype::mask_storeu(arr + 192, load_mask9, zmm[24]);
+        vtype::mask_storeu(arr + 200, load_mask10, zmm[25]);
+        vtype::mask_storeu(arr + 208, load_mask11, zmm[26]);
+        vtype::mask_storeu(arr + 216, load_mask12, zmm[27]);
+        vtype::mask_storeu(arr + 224, load_mask13, zmm[28]);
+        vtype::mask_storeu(arr + 232, load_mask14, zmm[29]);
+        vtype::mask_storeu(arr + 240, load_mask15, zmm[30]);
+        vtype::mask_storeu(arr + 248, load_mask16, zmm[31]);
+    }
+}
+
+template <typename vtype, typename type_t>
+static void qsort_64bit_(type_t *arr, int64_t left, int64_t right,
+                         int64_t max_iters) {
+    // Sorts the inclusive range arr[left..right] in place; max_iters is the
+    // remaining partition-depth budget.
+    type_t *const first = arr + left;
+    const int64_t count = right + 1 - left;
+    if (max_iters <= 0) {
+        // Budget exhausted: quicksort is not making progress, use std::sort.
+        std::sort(first, arr + right + 1);
+    } else if (count <= 256) {
+        // Base case: at most 256 elements go through the bitonic network.
+        sort_256_64bit<vtype>(first, (int32_t)count);
+    } else {
+        const type_t pivot = get_pivot_64bit<vtype>(arr, left, right);
+        // Seeded with the opposite extremes so the partition can narrow them
+        // to the smallest and biggest values it encounters.
+        type_t lo_seen = vtype::type_max();
+        type_t hi_seen = vtype::type_min();
+        const int64_t split = partition_avx512_unrolled<vtype, 8>(
+            arr, left, right + 1, pivot, &lo_seen, &hi_seen);
+        const int64_t budget = max_iters - 1;
+        // Skip the left side when the pivot is the smallest value seen.
+        if (pivot != lo_seen) qsort_64bit_<vtype>(arr, left, split - 1, budget);
+        // Skip the right side when the pivot is the biggest value seen.
+        if (pivot != hi_seen) qsort_64bit_<vtype>(arr, split, right, budget);
+    }
+}
+
+template <typename vtype, typename type_t>
+static void qselect_64bit_(type_t *arr, int64_t pos, int64_t left,
+                           int64_t right, int64_t max_iters) {
+    // Narrows the inclusive range arr[left..right] toward index pos;
+    // max_iters is the remaining partition-depth budget.
+    type_t *const first = arr + left;
+    const int64_t count = right + 1 - left;
+    if (max_iters <= 0) {
+        // Budget exhausted: fully sort what is left of the range instead.
+        std::sort(first, arr + right + 1);
+    } else if (count <= 128) {
+        // Base case: at most 128 elements go through the bitonic network.
+        sort_128_64bit<vtype>(first, (int32_t)count);
+    } else {
+        const type_t pivot = get_pivot_64bit<vtype>(arr, left, right);
+        type_t lo_seen = vtype::type_max();
+        type_t hi_seen = vtype::type_min();
+        const int64_t split = partition_avx512_unrolled<vtype, 8>(
+            arr, left, right + 1, pivot, &lo_seen, &hi_seen);
+        // Only the side of the split that contains pos is refined further.
+        const bool pos_on_left = pos < split;
+        const int64_t budget = max_iters - 1;
+        if (pos_on_left && pivot != lo_seen)
+            qselect_64bit_<vtype>(arr, pos, left, split - 1, budget);
+        else if (!pos_on_left && pivot != hi_seen)
+            qselect_64bit_<vtype>(arr, pos, split, right, budget);
+    }
+}
+
+template <>
+void avx512_qselect<int64_t>(int64_t *arr, int64_t k, int64_t arrsize) {
+    if (arrsize <= 1) return;  // fewer than two elements: nothing to do
+    // Partition-depth budget passed to qselect_64bit_ as max_iters.
+    const int64_t budget = 2 * (int64_t)log2(arrsize);
+    qselect_64bit_<zmm_vector<int64_t>, int64_t>(arr, k, 0, arrsize - 1, budget);
+}
+
+template <>
+void avx512_qselect<uint64_t>(uint64_t *arr, int64_t k, int64_t arrsize) {
+    if (arrsize <= 1) return;  // fewer than two elements: nothing to do
+    // Partition-depth budget passed to qselect_64bit_ as max_iters.
+    const int64_t budget = 2 * (int64_t)log2(arrsize);
+    qselect_64bit_<zmm_vector<uint64_t>, uint64_t>(arr, k, 0, arrsize - 1, budget);
+}
+
+template <>
+void avx512_qselect<double>(double *arr, int64_t k, int64_t arrsize) {
+    if (arrsize <= 1) return;  // fewer than two elements: nothing to do
+    // The NaN <-> inf helpers (defined elsewhere) bracket the selection.
+    int64_t nan_count = replace_nan_with_inf(arr, arrsize);
+    const int64_t budget = 2 * (int64_t)log2(arrsize);
+    qselect_64bit_<zmm_vector<double>, double>(arr, k, 0, arrsize - 1, budget);
+    replace_inf_with_nan(arr, arrsize, nan_count);
+}
+
+template <>
+void avx512_qsort<int64_t>(int64_t *arr, int64_t arrsize) {
+    if (arrsize <= 1) return;  // fewer than two elements: already sorted
+    // Partition-depth budget passed to qsort_64bit_ as max_iters.
+    const int64_t budget = 2 * (int64_t)log2(arrsize);
+    qsort_64bit_<zmm_vector<int64_t>, int64_t>(arr, 0, arrsize - 1, budget);
+}
+
+template <>
+void avx512_qsort<uint64_t>(uint64_t *arr, int64_t arrsize) {
+    if (arrsize <= 1) return;  // fewer than two elements: already sorted
+    // Partition-depth budget passed to qsort_64bit_ as max_iters.
+    const int64_t budget = 2 * (int64_t)log2(arrsize);
+    qsort_64bit_<zmm_vector<uint64_t>, uint64_t>(arr, 0, arrsize - 1, budget);
+}
+
+template <>
+void avx512_qsort<double>(double *arr, int64_t arrsize) {
+    if (arrsize <= 1) return;  // fewer than two elements: already sorted
+    // The NaN <-> inf helpers (defined elsewhere) bracket the sort.
+    int64_t nan_count = replace_nan_with_inf(arr, arrsize);
+    const int64_t budget = 2 * (int64_t)log2(arrsize);
+    qsort_64bit_<zmm_vector<double>, double>(arr, 0, arrsize - 1, budget);
+    replace_inf_with_nan(arr, arrsize, nan_count);
+}
+#endif  // AVX512_QSORT_64BIT
diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h b/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h
new file mode 100644
index 0000000000000..b477f9e65c233
--- /dev/null
+++ b/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h
@@ -0,0 +1,521 @@
+/*
+ * Copyright (c) 2023 Intel Corporation. All rights reserved.
+ * Intel x86-simd-sort source code.
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef AVX512_QSORT_COMMON
+#define AVX512_QSORT_COMMON
+
+/*
+ * Quicksort using AVX-512. The ideas and code are based on these two research
+ * papers [1] and [2]. On a high level, the idea is to vectorize quicksort
+ * partitioning using AVX-512 compressstore instructions. If the array size is
+ * < 128, then use Bitonic sorting network implemented on 512-bit registers.
+ * The precise network definitions depend on the dtype and are defined in
+ * separate files: avx512-16bit-qsort.hpp, avx512-32bit-qsort.hpp and
+ * avx512-64bit-qsort.hpp. Article [4] is a good resource for bitonic sorting
+ * network. The core implementations of the vectorized qsort functions
+ * avx512_qsort<T>(T*, int64_t) are modified versions of avx2 quicksort
+ * presented in the paper [2] and source code associated with that paper [3].
+ *
+ * [1] Fast and Robust Vectorized In-Place Sorting of Primitive Types
+ *     https://drops.dagstuhl.de/opus/volltexte/2021/13775/
+ *
+ * [2] A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel
+ * Skylake https://arxiv.org/pdf/1704.08579.pdf
+ *
+ * [3] https://github.com/simd-sorting/fast-and-robust: SPDX-License-Identifier:
+ * MIT
+ *
+ * [4]
+ * http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030
+ *
+ */
+
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+
+#define X86_SIMD_SORT_INFINITY std::numeric_limits<double>::infinity()
+#define X86_SIMD_SORT_INFINITYF std::numeric_limits<float>::infinity()
+#define X86_SIMD_SORT_INFINITYH 0x7c00  // raw 16-bit pattern, not a float
+#define X86_SIMD_SORT_NEGINFINITYH 0xfc00  // 0x7c00 with the top bit set
+#define X86_SIMD_SORT_MAX_UINT16 std::numeric_limits<uint16_t>::max()
+#define X86_SIMD_SORT_MAX_INT16 std::numeric_limits<int16_t>::max()
+#define X86_SIMD_SORT_MIN_INT16 std::numeric_limits<int16_t>::min()
+#define X86_SIMD_SORT_MAX_UINT32 std::numeric_limits<uint32_t>::max()
+#define X86_SIMD_SORT_MAX_INT32 std::numeric_limits<int32_t>::max()
+#define X86_SIMD_SORT_MIN_INT32 std::numeric_limits<int32_t>::min()
+#define X86_SIMD_SORT_MAX_UINT64 std::numeric_limits<uint64_t>::max()
+#define X86_SIMD_SORT_MAX_INT64 std::numeric_limits<int64_t>::max()
+#define X86_SIMD_SORT_MIN_INT64 std::numeric_limits<int64_t>::min()
+#define ZMM_MAX_DOUBLE _mm512_set1_pd(X86_SIMD_SORT_INFINITY)  // ZMM_*/YMM_*: the scalar constant in every lane
+#define ZMM_MAX_UINT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_UINT64)
+#define ZMM_MAX_INT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_INT64)
+#define ZMM_MAX_FLOAT _mm512_set1_ps(X86_SIMD_SORT_INFINITYF)
+#define ZMM_MAX_UINT _mm512_set1_epi32(X86_SIMD_SORT_MAX_UINT32)
+#define ZMM_MAX_INT _mm512_set1_epi32(X86_SIMD_SORT_MAX_INT32)
+#define ZMM_MAX_HALF _mm512_set1_epi16(X86_SIMD_SORT_INFINITYH)
+#define YMM_MAX_HALF _mm256_set1_epi16(X86_SIMD_SORT_INFINITYH)  // 256-bit variant
+#define ZMM_MAX_UINT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_UINT16)
+#define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16)
+#define SHUFFLE_MASK(a, b, c, d) (((a) << 6) | ((b) << 4) | ((c) << 2) | (d))  // a -> bits 7:6, b -> 5:4, c -> 3:2, d -> 1:0; fully parenthesised
+
+#ifdef _MSC_VER  // selects the spelling of the two linkage/inline macros
+#define X86_SIMD_SORT_INLINE static inline
+#define X86_SIMD_SORT_FINLINE static __forceinline
+#elif defined(__CYGWIN__)
+/*
+ * Force inline in cygwin to work around a compiler bug. See
+ * https://github.com/numpy/numpy/pull/22315#issuecomment-1267757584
+ */
+#define X86_SIMD_SORT_INLINE static __attribute__((always_inline))
+#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))
+#elif defined(__GNUC__)  // tested after __CYGWIN__, so Cygwin takes the branch above
+#define X86_SIMD_SORT_INLINE static inline
+#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))  // NOTE(review): no `inline` keyword here - confirm intended
+#else  // any other compiler: `static` only, no inlining request
+#define X86_SIMD_SORT_INLINE static
+#define X86_SIMD_SORT_FINLINE static
+#endif  // X86_SIMD_SORT_INLINE / X86_SIMD_SORT_FINLINE
+
+template <typename type>
+struct zmm_vector;  // declaration only; no definition appears in this header
+
+template <typename type>
+struct ymm_vector;  // declaration only; no definition appears in this header
+
+// Regular quicksort routines (declared here, no bodies in this header):
+template <typename T>
+void avx512_qsort(T *arr, int64_t arrsize);
+void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize);
+
+template <typename T>
+void avx512_qselect(T *arr, int64_t k, int64_t arrsize);  // k: position to select
+void avx512_qselect_fp16(uint16_t *arr, int64_t k, int64_t arrsize);
+
+template <typename T>
+inline void avx512_partial_qsort(T *arr, int64_t k, int64_t arrsize) {
+    avx512_qselect<T>(arr, k - 1, arrsize);  // select position k - 1 over the whole array
+    avx512_qsort<T>(arr, k - 1);  // then sort the first k - 1 elements
+}
+inline void avx512_partial_qsort_fp16(uint16_t *arr, int64_t k,
+                                      int64_t arrsize) {
+    avx512_qselect_fp16(arr, k - 1, arrsize);  // select position k - 1 over the whole array
+    avx512_qsort_fp16(arr, k - 1);  // then sort the first k - 1 elements
+}
+
+// key-value sort routines (declared here, no body in this header)
+template <typename T>
+void avx512_qsort_kv(T *keys, uint64_t *indexes, int64_t arrsize);
+
+template <typename vtype, typename T = typename vtype::type_t>
+bool comparison_func(const T &a, const T &b) {  // strict "less than" predicate
+    return a < b;  // passed to std::min / std::max by the partition routines below
+}
+
+/* COEX (compare-and-exchange): afterwards a holds vtype::min of the two
+ * inputs and b holds vtype::max of them. */
+template <typename vtype, typename mm_t>
+static void COEX(mm_t &a, mm_t &b) {
+    const mm_t lower = vtype::min(a, b);
+    const mm_t upper = vtype::max(a, b);
+    a = lower;
+    b = upper;
+}
+template <typename vtype, typename zmm_t = typename vtype::zmm_t,
+          typename opmask_t = typename vtype::opmask_t>
+static inline zmm_t cmp_merge(zmm_t in1, zmm_t in2, opmask_t mask) {
+    // Per-lane select between the two orderings of the inputs:
+    // mask bit 0 -> min, mask bit 1 -> max.
+    return vtype::mask_mov(vtype::min(in2, in1), mask, vtype::max(in2, in1));
+}
+/*
+ * Partition one ZMM register based on the pivot and return the
+ * number of elements that are greater than or equal to the pivot.
+ */
+template <typename vtype, typename type_t, typename zmm_t>
+static inline int32_t partition_vec(type_t *arr, int64_t left, int64_t right,
+                                    const zmm_t curr_vec, const zmm_t pivot_vec,
+                                    zmm_t *smallest_vec, zmm_t *biggest_vec) {
+    /* which elements are larger than or equal to the pivot */
+    typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec);
+    int32_t amount_ge_pivot = _mm_popcnt_u32((int32_t)ge_mask);  // set bits = lanes >= pivot
+    vtype::mask_compressstoreu(arr + left, vtype::knot_opmask(ge_mask),
+                               curr_vec);  // lanes not in ge_mask, written from arr + left
+    vtype::mask_compressstoreu(arr + right - amount_ge_pivot, ge_mask,
+                               curr_vec);  // lanes in ge_mask, written so they end at arr + right
+    *smallest_vec = vtype::min(curr_vec, *smallest_vec);  // fold into the running minimum
+    *biggest_vec = vtype::max(curr_vec, *biggest_vec);  // fold into the running maximum
+    return amount_ge_pivot;
+}
+/*
+ * Partition arr[left, right) (right is exclusive) based on the pivot and
+ * return the index of the first element greater than or equal to the pivot.
+ */
+template <typename vtype, typename type_t>
+static inline int64_t partition_avx512(type_t *arr, int64_t left, int64_t right,
+                                       type_t pivot, type_t *smallest,
+                                       type_t *biggest) {
+    /* make array length divisible by vtype::numlanes, shortening the array */
+    for (int32_t i = (right - left) % vtype::numlanes; i > 0; --i) {
+        *smallest = std::min(*smallest, arr[left], comparison_func<vtype>);
+        *biggest = std::max(*biggest, arr[left], comparison_func<vtype>);
+        if (!comparison_func<vtype>(arr[left], pivot)) {  // arr[left] is not below the pivot
+            std::swap(arr[left], arr[--right]);  // park it at the shrinking right end
+        } else {
+            ++left;
+        }
+    }
+
+    if (left == right)
+        return left; /* less than vtype::numlanes elements in the array */
+
+    using zmm_t = typename vtype::zmm_t;
+    zmm_t pivot_vec = vtype::set1(pivot);
+    zmm_t min_vec = vtype::set1(*smallest);  // running min, seeded from the scalar pass
+    zmm_t max_vec = vtype::set1(*biggest);  // running max, seeded from the scalar pass
+
+    if (right - left == vtype::numlanes) {  // exactly one vector remains
+        zmm_t vec = vtype::loadu(arr + left);
+        int32_t amount_ge_pivot =
+            partition_vec<vtype>(arr, left, left + vtype::numlanes, vec,
+                                 pivot_vec, &min_vec, &max_vec);
+        *smallest = vtype::reducemin(min_vec);
+        *biggest = vtype::reducemax(max_vec);
+        return left + (vtype::numlanes - amount_ge_pivot);
+    }
+
+    // first and last vtype::numlanes values are partitioned at the end
+    zmm_t vec_left = vtype::loadu(arr + left);
+    zmm_t vec_right = vtype::loadu(arr + (right - vtype::numlanes));
+    // store points of the vectors
+    int64_t r_store = right - vtype::numlanes;
+    int64_t l_store = left;
+    // indices for loading the elements
+    left += vtype::numlanes;
+    right -= vtype::numlanes;
+    while (right - left != 0) {  // until the two load cursors meet
+        zmm_t curr_vec;
+        /*
+         * if fewer elements are stored on the right side of the array,
+         * then next elements are loaded from the right side,
+         * otherwise from the left side
+         */
+        if ((r_store + vtype::numlanes) - right < left - l_store) {
+            right -= vtype::numlanes;
+            curr_vec = vtype::loadu(arr + right);
+        } else {
+            curr_vec = vtype::loadu(arr + left);
+            left += vtype::numlanes;
+        }
+        // partition the current vector and save it on both sides of the array
+        int32_t amount_ge_pivot =
+            partition_vec<vtype>(arr, l_store, r_store + vtype::numlanes,
+                                 curr_vec, pivot_vec, &min_vec, &max_vec);
+        ;  // stray empty statement (no effect)
+        r_store -= amount_ge_pivot;
+        l_store += (vtype::numlanes - amount_ge_pivot);
+    }
+
+    /* partition and save vec_left and vec_right */
+    int32_t amount_ge_pivot =
+        partition_vec<vtype>(arr, l_store, r_store + vtype::numlanes, vec_left,
+                             pivot_vec, &min_vec, &max_vec);
+    l_store += (vtype::numlanes - amount_ge_pivot);
+    amount_ge_pivot =
+        partition_vec<vtype>(arr, l_store, l_store + vtype::numlanes, vec_right,
+                             pivot_vec, &min_vec, &max_vec);  // last vector: window is l_store..l_store + numlanes
+    l_store += (vtype::numlanes - amount_ge_pivot);
+    *smallest = vtype::reducemin(min_vec);  // collapse the running vectors back to scalars
+    *biggest = vtype::reducemax(max_vec);
+    return l_store;
+}
+
+template <typename vtype, int num_unroll,
+          typename type_t = typename vtype::type_t>
+static inline int64_t partition_avx512_unrolled(type_t *arr, int64_t left,
+                                                int64_t right, type_t pivot,
+                                                type_t *smallest,
+                                                type_t *biggest) {
+    if (right - left <= 2 * num_unroll * vtype::numlanes) {  // short range: use the non-unrolled version
+        return partition_avx512<vtype>(arr, left, right, pivot, smallest,
+                                       biggest);
+    }
+    /* make array length divisible by num_unroll * vtype::numlanes,
+     * shortening the array */
+    for (int32_t i = ((right - left) % (num_unroll * vtype::numlanes)); i > 0;
+         --i) {
+        *smallest = std::min(*smallest, arr[left], comparison_func<vtype>);
+        *biggest = std::max(*biggest, arr[left], comparison_func<vtype>);
+        if (!comparison_func<vtype>(arr[left], pivot)) {  // arr[left] is not below the pivot
+            std::swap(arr[left], arr[--right]);  // park it at the shrinking right end
+        } else {
+            ++left;
+        }
+    }
+
+    if (left == right)
+        return left; /* NOTE(review): looks unreachable given the guard above - confirm */
+
+    using zmm_t = typename vtype::zmm_t;
+    zmm_t pivot_vec = vtype::set1(pivot);
+    zmm_t min_vec = vtype::set1(*smallest);  // running min, seeded from the scalar pass
+    zmm_t max_vec = vtype::set1(*biggest);  // running max, seeded from the scalar pass
+
+    // We will now have at least 2 * num_unroll registers worth of data to process:
+    // the first and last num_unroll vectors are partitioned at the end
+    zmm_t vec_left[num_unroll], vec_right[num_unroll];
+#pragma GCC unroll 8
+    for (int ii = 0; ii < num_unroll; ++ii) {
+        vec_left[ii] = vtype::loadu(arr + left + vtype::numlanes * ii);
+        vec_right[ii] =
+            vtype::loadu(arr + (right - vtype::numlanes * (num_unroll - ii)));
+    }
+    // store points of the vectors
+    int64_t r_store = right - vtype::numlanes;
+    int64_t l_store = left;
+    // indices for loading the elements
+    left += num_unroll * vtype::numlanes;
+    right -= num_unroll * vtype::numlanes;
+    while (right - left != 0) {  // until the two load cursors meet
+        zmm_t curr_vec[num_unroll];
+        /*
+         * if fewer elements are stored on the right side of the array,
+         * then next elements are loaded from the right side,
+         * otherwise from the left side
+         */
+        if ((r_store + vtype::numlanes) - right < left - l_store) {
+            right -= num_unroll * vtype::numlanes;
+#pragma GCC unroll 8
+            for (int ii = 0; ii < num_unroll; ++ii) {
+                curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes);
+            }
+        } else {
+#pragma GCC unroll 8
+            for (int ii = 0; ii < num_unroll; ++ii) {
+                curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes);
+            }
+            left += num_unroll * vtype::numlanes;
+        }
+// partition the current vectors and save them on both sides of the array
+#pragma GCC unroll 8
+        for (int ii = 0; ii < num_unroll; ++ii) {
+            int32_t amount_ge_pivot = partition_vec<vtype>(
+                arr, l_store, r_store + vtype::numlanes, curr_vec[ii],
+                pivot_vec, &min_vec, &max_vec);
+            l_store += (vtype::numlanes - amount_ge_pivot);
+            r_store -= amount_ge_pivot;
+        }
+    }
+
+/* partition and save the deferred vec_left[] and vec_right[] vectors */
+#pragma GCC unroll 8
+    for (int ii = 0; ii < num_unroll; ++ii) {
+        int32_t amount_ge_pivot =
+            partition_vec<vtype>(arr, l_store, r_store + vtype::numlanes,
+                                 vec_left[ii], pivot_vec, &min_vec, &max_vec);
+        l_store += (vtype::numlanes - amount_ge_pivot);
+        r_store -= amount_ge_pivot;
+    }
+#pragma GCC unroll 8
+    for (int ii = 0; ii < num_unroll; ++ii) {
+        int32_t amount_ge_pivot =
+            partition_vec<vtype>(arr, l_store, r_store + vtype::numlanes,
+                                 vec_right[ii], pivot_vec, &min_vec, &max_vec);
+        l_store += (vtype::numlanes - amount_ge_pivot);
+        r_store -= amount_ge_pivot;
+    }
+    *smallest = vtype::reducemin(min_vec);  // collapse the running vectors back to scalars
+    *biggest = vtype::reducemax(max_vec);
+    return l_store;
+}
+
+// Key-value sort helper functions
+
+template <typename vtype1, typename vtype2,
+          typename zmm_t1 = typename vtype1::zmm_t,
+          typename zmm_t2 = typename vtype2::zmm_t>
+static void COEX(zmm_t1 &key1, zmm_t1 &key2, zmm_t2 &index1, zmm_t2 &index2) {
+    // Key-value compare-and-exchange: key1/key2 receive vtype1::min/max of
+    // the two key inputs and the indexes are exchanged to match.
+    const zmm_t1 lower_keys = vtype1::min(key1, key2);
+    const zmm_t1 upper_keys = vtype1::max(key1, key2);
+    // Set where key1 already equals the minimum, i.e. no exchange needed.
+    const auto key1_was_min = vtype1::eq(lower_keys, key1);
+    const zmm_t2 lower_idx = vtype2::mask_mov(index2, key1_was_min, index1);
+    const zmm_t2 upper_idx = vtype2::mask_mov(index1, key1_was_min, index2);
+    key1 = lower_keys;
+    key2 = upper_keys;
+    index1 = lower_idx;
+    index2 = upper_idx;
+}
+template <typename vtype1, typename vtype2,
+          typename zmm_t1 = typename vtype1::zmm_t,
+          typename zmm_t2 = typename vtype2::zmm_t,
+          typename opmask_t = typename vtype1::opmask_t>
+static inline zmm_t1 cmp_merge(zmm_t1 in1, zmm_t1 in2, zmm_t2 &indexes1,
+                               zmm_t2 indexes2, opmask_t mask) {
+    const zmm_t1 merged = cmp_merge<vtype1>(in1, in2, mask);  // keys: 0 -> min, 1 -> max
+    indexes1 = vtype2::mask_mov(indexes2, vtype1::eq(merged, in1), indexes1);
+    return merged;  // indexes1 was updated in place to follow the merged keys
+}
+
+/*
+ * Partition one ZMM register of keys, and the matching indexes, based on the
+ * pivot; returns the number of keys greater than or equal to the pivot.
+ */
+template <typename vtype1, typename vtype2,
+          typename type_t1 = typename vtype1::type_t,
+          typename type_t2 = typename vtype2::type_t,
+          typename zmm_t1 = typename vtype1::zmm_t,
+          typename zmm_t2 = typename vtype2::zmm_t>
+static inline int32_t partition_vec(type_t1 *keys, type_t2 *indexes,
+                                    int64_t left, int64_t right,
+                                    const zmm_t1 keys_vec,
+                                    const zmm_t2 indexes_vec,
+                                    const zmm_t1 pivot_vec,
+                                    zmm_t1 *smallest_vec, zmm_t1 *biggest_vec) {
+    /* which elements are larger than or equal to the pivot (ge, despite gt_) */
+    typename vtype1::opmask_t gt_mask = vtype1::ge(keys_vec, pivot_vec);
+    int32_t amount_gt_pivot = _mm_popcnt_u32((int32_t)gt_mask);  // set bits in the mask
+    vtype1::mask_compressstoreu(keys + left, vtype1::knot_opmask(gt_mask),
+                                keys_vec);  // keys not in the mask, from keys + left
+    vtype1::mask_compressstoreu(keys + right - amount_gt_pivot, gt_mask,
+                                keys_vec);  // keys in the mask, ending at keys + right
+    vtype2::mask_compressstoreu(indexes + left, vtype2::knot_opmask(gt_mask),
+                                indexes_vec);  // indexes use the same mask and offsets
+    vtype2::mask_compressstoreu(indexes + right - amount_gt_pivot, gt_mask,
+                                indexes_vec);
+    *smallest_vec = vtype1::min(keys_vec, *smallest_vec);  // running min of the keys
+    *biggest_vec = vtype1::max(keys_vec, *biggest_vec);  // running max of the keys
+    return amount_gt_pivot;
+}
+/*
+ * Partition keys[left, right) and the matching indexes based on the pivot;
+ * returns the index at which the right-hand group of the partition begins.
+ */
+template <typename vtype1, typename vtype2,
+          typename type_t1 = typename vtype1::type_t,
+          typename type_t2 = typename vtype2::type_t,
+          typename zmm_t1 = typename vtype1::zmm_t,
+          typename zmm_t2 = typename vtype2::zmm_t>
+static inline int64_t partition_avx512(type_t1 *keys, type_t2 *indexes,
+                                       int64_t left, int64_t right,
+                                       type_t1 pivot, type_t1 *smallest,
+                                       type_t1 *biggest) {
+    /* make array length divisible by vtype1::numlanes, shortening the array */
+    for (int32_t i = (right - left) % vtype1::numlanes; i > 0; --i) {
+        *smallest = std::min(*smallest, keys[left]);
+        *biggest = std::max(*biggest, keys[left]);
+        if (keys[left] > pivot) {  // NOTE(review): strict >, while partition_vec uses ge - confirm intended
+            right--;
+            std::swap(keys[left], keys[right]);
+            std::swap(indexes[left], indexes[right]);  // keep the index with its key
+        } else {
+            ++left;
+        }
+    }
+
+    if (left == right)
+        return left; /* less than vtype1::numlanes elements in the array */
+
+    zmm_t1 pivot_vec = vtype1::set1(pivot);
+    zmm_t1 min_vec = vtype1::set1(*smallest);  // running min, seeded from the scalar pass
+    zmm_t1 max_vec = vtype1::set1(*biggest);  // running max, seeded from the scalar pass
+
+    if (right - left == vtype1::numlanes) {  // exactly one vector remains
+        zmm_t1 keys_vec = vtype1::loadu(keys + left);
+        int32_t amount_gt_pivot;
+
+        zmm_t2 indexes_vec = vtype2::loadu(indexes + left);
+        amount_gt_pivot = partition_vec<vtype1, vtype2>(
+            keys, indexes, left, left + vtype1::numlanes, keys_vec, indexes_vec,
+            pivot_vec, &min_vec, &max_vec);
+
+        *smallest = vtype1::reducemin(min_vec);
+        *biggest = vtype1::reducemax(max_vec);
+        return left + (vtype1::numlanes - amount_gt_pivot);
+    }
+
+    // first and last vtype1::numlanes values are partitioned at the end
+    zmm_t1 keys_vec_left = vtype1::loadu(keys + left);
+    zmm_t1 keys_vec_right = vtype1::loadu(keys + (right - vtype1::numlanes));
+    zmm_t2 indexes_vec_left;
+    zmm_t2 indexes_vec_right;
+    indexes_vec_left = vtype2::loadu(indexes + left);
+    indexes_vec_right = vtype2::loadu(indexes + (right - vtype1::numlanes));
+
+    // store points of the vectors
+    int64_t r_store = right - vtype1::numlanes;
+    int64_t l_store = left;
+    // indices for loading the elements
+    left += vtype1::numlanes;
+    right -= vtype1::numlanes;
+    while (right - left != 0) {  // until the two load cursors meet
+        zmm_t1 keys_vec;
+        zmm_t2 indexes_vec;
+        /*
+         * if fewer elements are stored on the right side of the array,
+         * then next elements are loaded from the right side,
+         * otherwise from the left side
+         */
+        if ((r_store + vtype1::numlanes) - right < left - l_store) {
+            right -= vtype1::numlanes;
+            keys_vec = vtype1::loadu(keys + right);
+            indexes_vec = vtype2::loadu(indexes + right);
+        } else {
+            keys_vec = vtype1::loadu(keys + left);
+            indexes_vec = vtype2::loadu(indexes + left);
+            left += vtype1::numlanes;
+        }
+        // partition the current vector and save it on both sides of the array
+        int32_t amount_gt_pivot;
+
+        amount_gt_pivot = partition_vec<vtype1, vtype2>(
+            keys, indexes, l_store, r_store + vtype1::numlanes, keys_vec,
+            indexes_vec, pivot_vec, &min_vec, &max_vec);
+        r_store -= amount_gt_pivot;
+        l_store += (vtype1::numlanes - amount_gt_pivot);
+    }
+
+    /* partition and save the deferred left and right vectors */
+    int32_t amount_gt_pivot;
+    amount_gt_pivot = partition_vec<vtype1, vtype2>(
+        keys, indexes, l_store, r_store + vtype1::numlanes, keys_vec_left,
+        indexes_vec_left, pivot_vec, &min_vec, &max_vec);
+    l_store += (vtype1::numlanes - amount_gt_pivot);
+    amount_gt_pivot = partition_vec<vtype1, vtype2>(
+        keys, indexes, l_store, l_store + vtype1::numlanes, keys_vec_right,
+        indexes_vec_right, pivot_vec, &min_vec, &max_vec);  // last vector: window is l_store..l_store + numlanes
+    l_store += (vtype1::numlanes - amount_gt_pivot);
+    *smallest = vtype1::reducemin(min_vec);  // collapse the running vectors back to scalars
+    *biggest = vtype1::reducemax(max_vec);
+    return l_store;
+}
+#endif  // AVX512_QSORT_COMMON
diff --git a/src/java.base/linux/native/libavx512_x86_64/avxsort_linux_x86.cpp b/src/java.base/linux/native/libavx512_x86_64/avxsort_linux_x86.cpp
new file mode 100644
index 0000000000000..ec436bb49eee6
--- /dev/null
+++ b/src/java.base/linux/native/libavx512_x86_64/avxsort_linux_x86.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2023 Intel Corporation. All rights reserved.
+ * Intel x86-simd-sort source code.
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "avx512-32bit-qsort.hpp"
+#include "avx512-64bit-qsort.hpp"
+
+#define DLL_PUBLIC __attribute__((visibility("default")))
+
+extern "C" {
+
+    DLL_PUBLIC void avx512_sort_int(int32_t *array_fromIndex, int64_t fromIndex,
+                                    int64_t toIndex) {
+        avx512_qsort<int32_t>(array_fromIndex, toIndex - fromIndex);
+    }
+
+    DLL_PUBLIC void avx512_sort_long(int64_t *array_fromIndex, int64_t fromIndex,
+                                    int64_t toIndex) {
+        avx512_qsort<int64_t>(array_fromIndex, toIndex - fromIndex);
+    }
+
+    DLL_PUBLIC void avx512_sort_float(float *array_fromIndex, int64_t fromIndex,
+                                    int64_t toIndex) {
+        avx512_qsort<float>(array_fromIndex, toIndex - fromIndex);
+    }
+
+    DLL_PUBLIC void avx512_sort_double(double *array_fromIndex, int64_t fromIndex,
+                                    int64_t toIndex) {
+        avx512_qsort<double>(array_fromIndex, toIndex - fromIndex);
+    }
+
+}
diff --git a/src/java.base/share/classes/java/util/Arrays.java b/src/java.base/share/classes/java/util/Arrays.java
index 9d1034e6fef6c..927def142476d 100644
--- a/src/java.base/share/classes/java/util/Arrays.java
+++ b/src/java.base/share/classes/java/util/Arrays.java
@@ -78,6 +78,26 @@ public final class Arrays {
     // Suppresses default constructor, ensuring non-instantiability.
     private Arrays() {}
 
+    @IntrinsicCandidate
+    private static void arraySort(int[] array, int fromIndex, int toIndex) {
+        DualPivotQuicksort.sort(array, 0, fromIndex, toIndex);
+    }
+
+    @IntrinsicCandidate
+    private static void arraySort(long[] array, int fromIndex, int toIndex) {
+        DualPivotQuicksort.sort(array, 0, fromIndex, toIndex);
+    }
+
+    @IntrinsicCandidate
+    private static void arraySort(float[] array, int fromIndex, int toIndex) {
+        DualPivotQuicksort.sort(array, 0, fromIndex, toIndex);
+    }
+
+    @IntrinsicCandidate
+    private static void arraySort(double[] array, int fromIndex, int toIndex) {
+        DualPivotQuicksort.sort(array, 0, fromIndex, toIndex);
+    }
+
     /*
      * Sorting methods. Note that all public "sort" methods take the
      * same form: performing argument checks if necessary, and then
@@ -97,7 +117,7 @@ private Arrays() {}
      * @param a the array to be sorted
      */
     public static void sort(int[] a) {
-        DualPivotQuicksort.sort(a, 0, 0, a.length);
+        arraySort(a, 0, a.length);
     }
 
     /**
@@ -121,7 +141,7 @@ public static void sort(int[] a) {
      */
     public static void sort(int[] a, int fromIndex, int toIndex) {
         rangeCheck(a.length, fromIndex, toIndex);
-        DualPivotQuicksort.sort(a, 0, fromIndex, toIndex);
+        arraySort(a, fromIndex, toIndex);
     }
 
     /**
@@ -135,7 +155,7 @@ public static void sort(int[] a, int fromIndex, int toIndex) {
      * @param a the array to be sorted
      */
     public static void sort(long[] a) {
-        DualPivotQuicksort.sort(a, 0, 0, a.length);
+        arraySort(a, 0, a.length);
     }
 
     /**
@@ -159,7 +179,7 @@ public static void sort(long[] a) {
      */
     public static void sort(long[] a, int fromIndex, int toIndex) {
         rangeCheck(a.length, fromIndex, toIndex);
-        DualPivotQuicksort.sort(a, 0, fromIndex, toIndex);
+        arraySort(a, fromIndex, toIndex);
     }
 
     /**
@@ -295,7 +315,7 @@ public static void sort(byte[] a, int fromIndex, int toIndex) {
      * @param a the array to be sorted
      */
     public static void sort(float[] a) {
-        DualPivotQuicksort.sort(a, 0, 0, a.length);
+        arraySort(a, 0, a.length);
     }
 
     /**
@@ -327,7 +347,7 @@ public static void sort(float[] a) {
      */
     public static void sort(float[] a, int fromIndex, int toIndex) {
         rangeCheck(a.length, fromIndex, toIndex);
-        DualPivotQuicksort.sort(a, 0, fromIndex, toIndex);
+        arraySort(a, fromIndex, toIndex);
     }
 
     /**
@@ -349,7 +369,7 @@ public static void sort(float[] a, int fromIndex, int toIndex) {
      * @param a the array to be sorted
      */
     public static void sort(double[] a) {
-        DualPivotQuicksort.sort(a, 0, 0, a.length);
+        arraySort(a, 0, a.length);
     }
 
     /**
@@ -381,7 +401,7 @@ public static void sort(double[] a) {
      */
     public static void sort(double[] a, int fromIndex, int toIndex) {
         rangeCheck(a.length, fromIndex, toIndex);
-        DualPivotQuicksort.sort(a, 0, fromIndex, toIndex);
+        arraySort(a, fromIndex, toIndex);
     }
 
     /**
diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
new file mode 100644
index 0000000000000..2780a70b66926
--- /dev/null
+++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2022, 2023, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.java.lang;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OperationsPerInvocation;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.util.Arrays;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import java.io.UnsupportedEncodingException;
+import java.lang.invoke.MethodHandle;
+import java.lang.invoke.MethodHandles;
+import java.lang.reflect.Method;
+
+/**
+ * Performance test of Arrays.sort() methods
+ */
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@State(Scope.Thread)
+@Warmup(iterations = 3, time=60)
+@Measurement(iterations = 3, time=120)
+@Fork(value = 1)
+public class ArraysSort {
+
+
+    @Param({"10", "100", "1000", "10000", "100000", "1000000"})
+    private int size;
+
+    private int[] ints_unsorted;
+    private long[] longs_unsorted;
+    private float[] floats_unsorted;
+    private double[] doubles_unsorted;
+
+    private int[] ints_sorted;
+    private long[] longs_sorted;
+    private float[] floats_sorted;
+    private double[] doubles_sorted;
+
+
+    @Setup
+    public void setup() throws UnsupportedEncodingException, ClassNotFoundException, NoSuchMethodException, Throwable {
+        Random rnd = new Random(42);
+
+        ints_unsorted = new int[size];
+        longs_unsorted = new long[size];
+        floats_unsorted = new float[size];
+        doubles_unsorted = new double[size];
+
+        for (int i = 0; i < size; i++) {
+            ints_unsorted[i] = rnd.nextInt();
+            longs_unsorted[i] = rnd.nextLong();
+            floats_unsorted[i] = rnd.nextFloat();
+            doubles_unsorted[i] = rnd.nextDouble();
+        }
+    }
+
+    @Benchmark
+    public void intSort() throws Throwable {
+        ints_sorted = ints_unsorted.clone();
+        Arrays.sort(ints_sorted);
+    }
+
+    @Benchmark
+    public void longSort() throws Throwable {
+        longs_sorted = longs_unsorted.clone();
+        Arrays.sort(longs_sorted);
+    }
+
+    @Benchmark
+    public void floatSort() throws Throwable {
+        floats_sorted = floats_unsorted.clone();
+        Arrays.sort(floats_sorted);
+    }
+
+    @Benchmark
+    public void doubleSort() throws Throwable {
+        doubles_sorted = doubles_unsorted.clone();
+        Arrays.sort(doubles_sorted);
+    }
+
+}

From 923a7cae3d328a76f50354202243e0a592535469 Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Tue, 30 May 2023 12:54:54 -0700
Subject: [PATCH 02/40] remove libstdc++ from LIBS_linux of libavx512_x86_64

---
 make/modules/java.base/Lib.gmk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk
index 5ec5d03d59c07..4cbd39546261c 100644
--- a/make/modules/java.base/Lib.gmk
+++ b/make/modules/java.base/Lib.gmk
@@ -244,7 +244,7 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2)
       LDFLAGS_linux := -Wl$(COMMA)--no-as-needed, \
       LDFLAGS_windows := -defaultlib:msvcrt, \
       LIBS := $(LIBCXX), \
-      LIBS_linux := -lc -lm -ldl -lstdc++, \
+      LIBS_linux := -lc -lm -ldl, \
   ))
 
   TARGETS += $(BUILD_LIBAVX512_X86_64)

From 30a50d998b0e7276ceefb102def660ed449fbe2d Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Wed, 31 May 2023 18:09:09 -0700
Subject: [PATCH 03/40] fix license headers, credit x86-simd-sort, trim benchmark sizes

---
 .../linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp     | 5 ++++-
 .../linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp     | 4 +++-
 .../linux/native/libavx512_x86_64/avx512-common-qsort.h      | 5 ++++-
 test/micro/org/openjdk/bench/java/util/ArraysSort.java       | 2 +-
 4 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp
index 05efac20cbdb2..d2240b29292f9 100644
--- a/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp
+++ b/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Intel Corporation. All rights reserved.
+ * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
  * Intel x86-simd-sort source code.
  *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
@@ -23,6 +23,9 @@
  * questions.
  *
  */
+
+// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)
+
 #ifndef AVX512_QSORT_32BIT
 #define AVX512_QSORT_32BIT
 
diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp
index 893f2ce8363c8..1b4cb0a1936a0 100644
--- a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp
+++ b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Intel Corporation. All rights reserved.
+ * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
  * Intel x86-simd-sort source code.
  *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
@@ -24,6 +24,8 @@
  *
  */
 
+// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)
+
 #ifndef AVX512_QSORT_64BIT
 #define AVX512_QSORT_64BIT
 
diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h b/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h
index b477f9e65c233..3c5806db607d2 100644
--- a/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h
+++ b/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h
@@ -1,5 +1,6 @@
 /*
- * Copyright (c) 2023 Intel Corporation. All rights reserved.
+ * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
+ * Copyright (c) 2021 Serge Sans Paille. All rights reserved.
  * Intel x86-simd-sort source code.
  *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
@@ -24,6 +25,8 @@
  *
  */
 
+// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)
+
 #ifndef AVX512_QSORT_COMMON
 #define AVX512_QSORT_COMMON
 
diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
index 2780a70b66926..2bc213977aac1 100644
--- a/test/micro/org/openjdk/bench/java/util/ArraysSort.java
+++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
@@ -56,7 +56,7 @@
 public class ArraysSort {
 
 
-    @Param({"10", "100", "1000", "10000", "100000", "1000000"})
+    @Param({"100", "1000", "10000", "100000"})
     private int size;
 
     private int[] ints_unsorted;

From a7c2b6e9add098c933d4936ec35008a7cc657739 Mon Sep 17 00:00:00 2001
From: Srinivas Vamsi Parasa <srinivas.vamsi.parasa@intel.com>
Date: Thu, 1 Jun 2023 08:52:04 -0700
Subject: [PATCH 04/40] Update
 test/micro/org/openjdk/bench/java/util/ArraysSort.java

Co-authored-by: Andrew Haley <aph-open@littlepinkcloud.com>
---
 test/micro/org/openjdk/bench/java/util/ArraysSort.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
index 2bc213977aac1..2e66bf6291be7 100644
--- a/test/micro/org/openjdk/bench/java/util/ArraysSort.java
+++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
@@ -56,7 +56,7 @@
 public class ArraysSort {
 
 
-    @Param({"100", "1000", "10000", "100000"})
+    @Param({"10","25","50","75","100", "1000", "10000", "100000"})
     private int size;
 
     private int[] ints_unsorted;

From 1dc9589eb084049a5fb585458f0f50c524e604d2 Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Thu, 1 Jun 2023 10:16:33 -0700
Subject: [PATCH 05/40] fix license in avx512-64bit-common.h

---
 .../linux/native/libavx512_x86_64/avx512-64bit-common.h       | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h
index 88fee99c0d79e..c435d100e7579 100644
--- a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h
+++ b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Intel Corporation. All rights reserved.
+ * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
  * Intel x86-simd-sort source code.
  *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
@@ -24,6 +24,8 @@
  *
  */
 
+// This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)
+
 #ifndef AVX512_64BIT_COMMON
 #define AVX512_64BIT_COMMON
 #include "avx512-common-qsort.h"

From 53a5309dc643e2f476364566a0d1b8f8404817ef Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Fri, 23 Jun 2023 14:06:41 -0700
Subject: [PATCH 06/40] replace multiple intrinsics with one general intrinsic

---
 make/modules/java.base/Lib.gmk                |  1 -
 src/hotspot/share/classfile/vmIntrinsics.hpp  | 12 ++----
 src/hotspot/share/opto/c2compiler.cpp         |  5 +--
 src/hotspot/share/opto/library_call.cpp       | 34 +++++-----------
 .../share/classes/java/util/Arrays.java       | 39 +++++++------------
 .../openjdk/bench/java/util/ArraysSort.java   | 25 ++++++++----
 6 files changed, 44 insertions(+), 72 deletions(-)

diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk
index 466a79b0c4d50..a7546fee64018 100644
--- a/make/modules/java.base/Lib.gmk
+++ b/make/modules/java.base/Lib.gmk
@@ -244,7 +244,6 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2)
       LDFLAGS := $(LDFLAGS_JDKLIB) \
           $(call SET_SHARED_LIBRARY_ORIGIN), \
       LDFLAGS_linux := -Wl$(COMMA)--no-as-needed, \
-      LDFLAGS_windows := -defaultlib:msvcrt, \
       LIBS := $(LIBCXX), \
       LIBS_linux := -lc -lm -ldl, \
   ))
diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp
index de02d4bad0092..6fd273203bb23 100644
--- a/src/hotspot/share/classfile/vmIntrinsics.hpp
+++ b/src/hotspot/share/classfile/vmIntrinsics.hpp
@@ -341,15 +341,9 @@ class methodHandle;
    do_name(     copyOf_name,                                     "copyOf")                                              \
    do_signature(copyOf_signature,             "([Ljava/lang/Object;ILjava/lang/Class;)[Ljava/lang/Object;")             \
                                                                                                                         \
-  do_intrinsic(_arraySortI,                 java_util_Arrays,     arraySort_name, arraySortI_signature,          F_S)   \
-   do_name(     arraySort_name,                                   "arraySort")                                          \
-   do_signature(arraySortI_signature,                             "([III)V")                                            \
-  do_intrinsic(_arraySortL,                 java_util_Arrays,     arraySort_name, arraySortL_signature,          F_S)   \
-   do_signature(arraySortL_signature,                             "([JII)V")                                            \
-  do_intrinsic(_arraySortF,                 java_util_Arrays,     arraySort_name, arraySortF_signature,          F_S)   \
-   do_signature(arraySortF_signature,                             "([FII)V")                                            \
-  do_intrinsic(_arraySortD,                 java_util_Arrays,     arraySort_name, arraySortD_signature,          F_S)   \
-   do_signature(arraySortD_signature,                             "([DII)V")                                            \
+  do_intrinsic(_arraySort,                java_util_Arrays,       arraySort_name, arraySort_signature,           F_S)   \
+   do_name(     arraySort_name,                                  "arraySort")                                           \
+   do_signature(arraySort_signature,                             "(Ljava/lang/Class;Ljava/lang/Object;II)V")            \
                                                                                                                         \
   do_intrinsic(_copyOfRange,              java_util_Arrays,       copyOfRange_name, copyOfRange_signature,       F_S)   \
    do_name(     copyOfRange_name,                                "copyOfRange")                                         \
diff --git a/src/hotspot/share/opto/c2compiler.cpp b/src/hotspot/share/opto/c2compiler.cpp
index c904c49d22046..8224d3efe41ee 100644
--- a/src/hotspot/share/opto/c2compiler.cpp
+++ b/src/hotspot/share/opto/c2compiler.cpp
@@ -575,10 +575,7 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method) {
   case vmIntrinsics::_min_strict:
   case vmIntrinsics::_max_strict:
   case vmIntrinsics::_arraycopy:
-  case vmIntrinsics::_arraySortI:
-  case vmIntrinsics::_arraySortL:
-  case vmIntrinsics::_arraySortF:
-  case vmIntrinsics::_arraySortD:
+  case vmIntrinsics::_arraySort:
   case vmIntrinsics::_indexOfL:
   case vmIntrinsics::_indexOfU:
   case vmIntrinsics::_indexOfUL:
diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp
index 21eb6b4483064..3e18246ff7509 100644
--- a/src/hotspot/share/opto/library_call.cpp
+++ b/src/hotspot/share/opto/library_call.cpp
@@ -292,10 +292,7 @@ bool LibraryCallKit::try_to_inline(int predicate) {
 
   case vmIntrinsics::_arraycopy:                return inline_arraycopy();
 
-  case vmIntrinsics::_arraySortI:
-  case vmIntrinsics::_arraySortL:
-  case vmIntrinsics::_arraySortF:
-  case vmIntrinsics::_arraySortD:               return inline_arraysort(intrinsic_id());
+  case vmIntrinsics::_arraySort:               return inline_arraysort(intrinsic_id());
 
   case vmIntrinsics::_compareToL:               return inline_string_compareTo(StrIntrinsicNode::LL);
   case vmIntrinsics::_compareToU:               return inline_string_compareTo(StrIntrinsicNode::UU);
@@ -5203,32 +5200,19 @@ bool LibraryCallKit::inline_arraysort(vmIntrinsics::ID id) {
   address stubAddr = nullptr;
   const char *stubName;
   stubName = "arraysort_stub";
-  BasicType bt;
 
-  switch(id) {
-    case vmIntrinsics::_arraySortI:
-      bt = T_INT;
-      break;
-    case vmIntrinsics::_arraySortL:
-      bt = T_LONG;
-      break;
-    case vmIntrinsics::_arraySortF:
-      bt = T_FLOAT;
-      break;
-    case vmIntrinsics::_arraySortD:
-      bt = T_DOUBLE;
-      break;
-    default:
-      break;
-  }
+  Node* elementType     = argument(0);
+  Node* array           = argument(1);
+  Node* fromIndex       = argument(2);
+  Node* toIndex         = argument(3);
+
+  const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr();
+  ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type();
+  BasicType bt = elem_type->basic_type();
 
   stubAddr = StubRoutines::select_arraysort_function(bt);
   if (stubAddr == nullptr) return false;
 
-  Node* array           = argument(0);
-  Node* fromIndex       = argument(1);
-  Node* toIndex         = argument(2);
-
   array = must_be_not_null(array, true);
 
   const TypeAryPtr* array_type = array->Value(&_gvn)->isa_aryptr();
diff --git a/src/java.base/share/classes/java/util/Arrays.java b/src/java.base/share/classes/java/util/Arrays.java
index 927def142476d..2691a7180caca 100644
--- a/src/java.base/share/classes/java/util/Arrays.java
+++ b/src/java.base/share/classes/java/util/Arrays.java
@@ -79,23 +79,12 @@ public final class Arrays {
     private Arrays() {}
 
     @IntrinsicCandidate
-    private static void arraySort(int[] array, int fromIndex, int toIndex) {
-        DualPivotQuicksort.sort(array, 0, fromIndex, toIndex);
-    }
-
-    @IntrinsicCandidate
-    private static void arraySort(long[] array, int fromIndex, int toIndex) {
-        DualPivotQuicksort.sort(array, 0, fromIndex, toIndex);
-    }
-
-    @IntrinsicCandidate
-    private static void arraySort(float[] array, int fromIndex, int toIndex) {
-        DualPivotQuicksort.sort(array, 0, fromIndex, toIndex);
-    }
-
-    @IntrinsicCandidate
-    private static void arraySort(double[] array, int fromIndex, int toIndex) {
-        DualPivotQuicksort.sort(array, 0, fromIndex, toIndex);
+    private static void arraySort(Class<?> elemType, Object array, int fromIndex, int toIndex) {
+        if (elemType == int.class) DualPivotQuicksort.sort((int[]) array, 0, fromIndex, toIndex);
+        else if (elemType == long.class) DualPivotQuicksort.sort((long[]) array, 0, fromIndex, toIndex);
+        else if (elemType == float.class) DualPivotQuicksort.sort((float[]) array, 0, fromIndex, toIndex);
+        else if (elemType == double.class) DualPivotQuicksort.sort((double[]) array, 0, fromIndex, toIndex);
+        else throw new UnsupportedOperationException("arraySort intrinsic not supported for this type: " + elemType.toString());
     }
 
     /*
@@ -117,7 +106,7 @@ private static void arraySort(double[] array, int fromIndex, int toIndex) {
      * @param a the array to be sorted
      */
     public static void sort(int[] a) {
-        arraySort(a, 0, a.length);
+        arraySort(int.class, a, 0, a.length);
     }
 
     /**
@@ -141,7 +130,7 @@ public static void sort(int[] a) {
      */
     public static void sort(int[] a, int fromIndex, int toIndex) {
         rangeCheck(a.length, fromIndex, toIndex);
-        arraySort(a, fromIndex, toIndex);
+        arraySort(int.class, a, fromIndex, toIndex);
     }
 
     /**
@@ -155,7 +144,7 @@ public static void sort(int[] a, int fromIndex, int toIndex) {
      * @param a the array to be sorted
      */
     public static void sort(long[] a) {
-        arraySort(a, 0, a.length);
+        arraySort(long.class, a, 0, a.length);
     }
 
     /**
@@ -179,7 +168,7 @@ public static void sort(long[] a) {
      */
     public static void sort(long[] a, int fromIndex, int toIndex) {
         rangeCheck(a.length, fromIndex, toIndex);
-        arraySort(a, fromIndex, toIndex);
+        arraySort(long.class, a, fromIndex, toIndex);
     }
 
     /**
@@ -315,7 +304,7 @@ public static void sort(byte[] a, int fromIndex, int toIndex) {
      * @param a the array to be sorted
      */
     public static void sort(float[] a) {
-        arraySort(a, 0, a.length);
+        arraySort(float.class, a, 0, a.length);
     }
 
     /**
@@ -347,7 +336,7 @@ public static void sort(float[] a) {
      */
     public static void sort(float[] a, int fromIndex, int toIndex) {
         rangeCheck(a.length, fromIndex, toIndex);
-        arraySort(a, fromIndex, toIndex);
+        arraySort(float.class, a, fromIndex, toIndex);
     }
 
     /**
@@ -369,7 +358,7 @@ public static void sort(float[] a, int fromIndex, int toIndex) {
      * @param a the array to be sorted
      */
     public static void sort(double[] a) {
-        arraySort(a, 0, a.length);
+        arraySort(double.class, a, 0, a.length);
     }
 
     /**
@@ -401,7 +390,7 @@ public static void sort(double[] a) {
      */
     public static void sort(double[] a, int fromIndex, int toIndex) {
         rangeCheck(a.length, fromIndex, toIndex);
-        arraySort(a, fromIndex, toIndex);
+        arraySort(double.class, a, fromIndex, toIndex);
     }
 
     /**
diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
index 2e66bf6291be7..48d03db2d60a1 100644
--- a/test/micro/org/openjdk/bench/java/util/ArraysSort.java
+++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
@@ -33,6 +33,7 @@
 import org.openjdk.jmh.annotations.Scope;
 import org.openjdk.jmh.annotations.Setup;
 import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Level;
 import org.openjdk.jmh.annotations.Warmup;
 import org.openjdk.jmh.infra.Blackhole;
 
@@ -87,28 +88,36 @@ public void setup() throws UnsupportedEncodingException, ClassNotFoundException,
         }
     }
 
-    @Benchmark
-    public void intSort() throws Throwable {
+    @Setup(Level.Invocation)
+    public void init() {
         ints_sorted = ints_unsorted.clone();
+        longs_sorted = longs_unsorted.clone();
+        floats_sorted = floats_unsorted.clone();
+        doubles_sorted = doubles_unsorted.clone();
+    }
+
+    @Benchmark
+    public int[] intSort() throws Throwable {
         Arrays.sort(ints_sorted);
+        return ints_sorted;
     }
 
     @Benchmark
-    public void longSort() throws Throwable {
-        longs_sorted = longs_unsorted.clone();
+    public long[] longSort() throws Throwable {
         Arrays.sort(longs_sorted);
+        return longs_sorted;
     }
 
     @Benchmark
-    public void floatSort() throws Throwable {
-        floats_sorted = floats_unsorted.clone();
+    public float[] floatSort() throws Throwable {
         Arrays.sort(floats_sorted);
+        return floats_sorted;
     }
 
     @Benchmark
-    public void doubleSort() throws Throwable {
-        doubles_sorted = doubles_unsorted.clone();
+    public double[] doubleSort() throws Throwable {
         Arrays.sort(doubles_sorted);
+        return doubles_sorted;
     }
 
 }

From 2bd0419167c889048f1186a8ef72cf2761599603 Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Tue, 27 Jun 2023 09:53:32 -0700
Subject: [PATCH 07/40] minor cleanups: drop ID parameter of inline_arraysort, update benchmark copyright

---
 src/hotspot/share/opto/library_call.cpp                | 4 ++--
 src/hotspot/share/opto/library_call.hpp                | 2 +-
 test/micro/org/openjdk/bench/java/util/ArraysSort.java | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp
index 3e18246ff7509..981606ef701cb 100644
--- a/src/hotspot/share/opto/library_call.cpp
+++ b/src/hotspot/share/opto/library_call.cpp
@@ -292,7 +292,7 @@ bool LibraryCallKit::try_to_inline(int predicate) {
 
   case vmIntrinsics::_arraycopy:                return inline_arraycopy();
 
-  case vmIntrinsics::_arraySort:               return inline_arraysort(intrinsic_id());
+  case vmIntrinsics::_arraySort:               return inline_arraysort();
 
   case vmIntrinsics::_compareToL:               return inline_string_compareTo(StrIntrinsicNode::LL);
   case vmIntrinsics::_compareToU:               return inline_string_compareTo(StrIntrinsicNode::UU);
@@ -5195,7 +5195,7 @@ void LibraryCallKit::create_new_uncommon_trap(CallStaticJavaNode* uncommon_trap_
 }
 
 //------------------------------inline_arraysort-----------------------
-bool LibraryCallKit::inline_arraysort(vmIntrinsics::ID id) {
+bool LibraryCallKit::inline_arraysort() {
 
   address stubAddr = nullptr;
   const char *stubName;
diff --git a/src/hotspot/share/opto/library_call.hpp b/src/hotspot/share/opto/library_call.hpp
index 52725e87080f1..53d697f6b2078 100644
--- a/src/hotspot/share/opto/library_call.hpp
+++ b/src/hotspot/share/opto/library_call.hpp
@@ -279,7 +279,7 @@ class LibraryCallKit : public GraphKit {
   JVMState* arraycopy_restore_alloc_state(AllocateArrayNode* alloc, int& saved_reexecute_sp);
   void arraycopy_move_allocation_here(AllocateArrayNode* alloc, Node* dest, JVMState* saved_jvms_before_guards, int saved_reexecute_sp,
                                       uint new_idx);
-  bool inline_arraysort(vmIntrinsics::ID id);
+  bool inline_arraysort();
   typedef enum { LS_get_add, LS_get_set, LS_cmp_swap, LS_cmp_swap_weak, LS_cmp_exchange } LoadStoreKind;
   bool inline_unsafe_load_store(BasicType type,  LoadStoreKind kind, AccessKind access_kind);
   bool inline_unsafe_fence(vmIntrinsics::ID id);
diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
index 48d03db2d60a1..cfe59e34e801e 100644
--- a/test/micro/org/openjdk/bench/java/util/ArraysSort.java
+++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, 2023, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2023 Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it

From e09c05015afe176c6699f9fe7f95caccf86eb952 Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Tue, 25 Jul 2023 12:19:14 -0700
Subject: [PATCH 08/40] change API to enable MemorySegment

---
 src/hotspot/share/classfile/vmIntrinsics.hpp  |  2 +-
 src/hotspot/share/opto/library_call.cpp       | 28 ++++-------
 .../libavx512_x86_64/avx512-32bit-qsort.hpp   | 22 +++++----
 .../libavx512_x86_64/avx512-64bit-qsort.hpp   | 21 ++++----
 .../libavx512_x86_64/avx512-common-qsort.h    | 48 +++++++++++++++----
 .../share/classes/java/util/Arrays.java       | 38 +++++++++++----
 6 files changed, 106 insertions(+), 53 deletions(-)

diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp
index 6fd273203bb23..aa9f9660bbba4 100644
--- a/src/hotspot/share/classfile/vmIntrinsics.hpp
+++ b/src/hotspot/share/classfile/vmIntrinsics.hpp
@@ -343,7 +343,7 @@ class methodHandle;
                                                                                                                         \
   do_intrinsic(_arraySort,                java_util_Arrays,       arraySort_name, arraySort_signature,           F_S)   \
    do_name(     arraySort_name,                                  "arraySort")                                           \
-   do_signature(arraySort_signature,                             "(Ljava/lang/Class;Ljava/lang/Object;II)V")            \
+   do_signature(arraySort_signature,                             "(Ljava/lang/Class;Ljava/lang/Object;JII)V")           \
                                                                                                                         \
   do_intrinsic(_copyOfRange,              java_util_Arrays,       copyOfRange_name, copyOfRange_signature,       F_S)   \
    do_name(     copyOfRange_name,                                "copyOfRange")                                         \
diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp
index 981606ef701cb..132835ba4aff8 100644
--- a/src/hotspot/share/opto/library_call.cpp
+++ b/src/hotspot/share/opto/library_call.cpp
@@ -5201,35 +5201,27 @@ bool LibraryCallKit::inline_arraysort() {
   const char *stubName;
   stubName = "arraysort_stub";
 
-  Node* elementType     = argument(0);
-  Node* array           = argument(1);
-  Node* fromIndex       = argument(2);
-  Node* toIndex         = argument(3);
+  Node* elementType     = null_check(argument(0));
+  Node* obj             = argument(1);
+  Node* offset          = argument(2);
+  Node* fromIndex       = argument(4);
+  Node* toIndex         = argument(5);
 
   const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr();
   ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type();
   BasicType bt = elem_type->basic_type();
-
   stubAddr = StubRoutines::select_arraysort_function(bt);
   if (stubAddr == nullptr) return false;
 
-  array = must_be_not_null(array, true);
-
-  const TypeAryPtr* array_type = array->Value(&_gvn)->isa_aryptr();
-  assert(array_type != nullptr &&  array_type->elem() != Type::BOTTOM, "args are strange");
-
-  // for the quick and dirty code we will skip all the checks.
-  // we are just trying to get the call to be generated.
-  Node* array_fromIndex  = array;
-  if (fromIndex != nullptr || toIndex != nullptr) {
-    assert(fromIndex != nullptr && toIndex != nullptr, "");
-    array_fromIndex = array_element_address(array, fromIndex, bt);
+  const TypeAryPtr* obj_t = _gvn.type(obj)->isa_aryptr();
+  if (obj_t == nullptr || obj_t->elem() == Type::BOTTOM ) {
+    return false; // failed input validation
   }
-
+  Node* obj_adr = make_unsafe_address(obj, offset);
   // Call the stub.
   make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_sort_Type(),
                     stubAddr, stubName, TypePtr::BOTTOM,
-                    array_fromIndex, fromIndex, toIndex);
+                    obj_adr, fromIndex, toIndex);
 
   return true;
 }
diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp
index d2240b29292f9..9c8f1af6a9caf 100644
--- a/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp
+++ b/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
+ * Copyright (c) 2021 Serge Sans Paille. All rights reserved.
  * Intel x86-simd-sort source code.
  *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
@@ -550,7 +551,8 @@ X86_SIMD_SORT_INLINE void replace_inf_with_nan(float *arr, int64_t arrsize,
 }
 
 template <>
-void avx512_qselect<int32_t>(int32_t *arr, int64_t k, int64_t arrsize) {
+void avx512_qselect<int32_t>(int32_t *arr, int64_t k, int64_t arrsize,
+                             bool hasnan) {
     if (arrsize > 1) {
         qselect_32bit_<zmm_vector<int32_t>, int32_t>(
             arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize));
@@ -558,7 +560,8 @@ void avx512_qselect<int32_t>(int32_t *arr, int64_t k, int64_t arrsize) {
 }
 
 template <>
-void avx512_qselect<uint32_t>(uint32_t *arr, int64_t k, int64_t arrsize) {
+void avx512_qselect<uint32_t>(uint32_t *arr, int64_t k, int64_t arrsize,
+                              bool hasnan) {
     if (arrsize > 1) {
         qselect_32bit_<zmm_vector<uint32_t>, uint32_t>(
             arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize));
@@ -566,12 +569,15 @@ void avx512_qselect<uint32_t>(uint32_t *arr, int64_t k, int64_t arrsize) {
 }
 
 template <>
-void avx512_qselect<float>(float *arr, int64_t k, int64_t arrsize) {
-    if (arrsize > 1) {
-        int64_t nan_count = replace_nan_with_inf(arr, arrsize);
-        qselect_32bit_<zmm_vector<float>, float>(arr, k, 0, arrsize - 1,
-                                                 2 * (int64_t)log2(arrsize));
-        replace_inf_with_nan(arr, arrsize, nan_count);
+void avx512_qselect<float>(float *arr, int64_t k, int64_t arrsize,
+                           bool hasnan) {
+    int64_t indx_last_elem = arrsize - 1;
+    if (UNLIKELY(hasnan)) {
+        indx_last_elem = move_nans_to_end_of_array(arr, arrsize);
+    }
+    if (indx_last_elem >= k) {
+        qselect_32bit_<zmm_vector<float>, float>(
+            arr, k, 0, indx_last_elem, 2 * (int64_t)log2(indx_last_elem));
     }
 }
 
diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp
index 1b4cb0a1936a0..2d01663923a9b 100644
--- a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp
+++ b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp
@@ -783,7 +783,8 @@ static void qselect_64bit_(type_t *arr, int64_t pos, int64_t left,
 }
 
 template <>
-void avx512_qselect<int64_t>(int64_t *arr, int64_t k, int64_t arrsize) {
+void avx512_qselect<int64_t>(int64_t *arr, int64_t k, int64_t arrsize,
+                             bool hasnan) {
     if (arrsize > 1) {
         qselect_64bit_<zmm_vector<int64_t>, int64_t>(
             arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize));
@@ -791,7 +792,8 @@ void avx512_qselect<int64_t>(int64_t *arr, int64_t k, int64_t arrsize) {
 }
 
 template <>
-void avx512_qselect<uint64_t>(uint64_t *arr, int64_t k, int64_t arrsize) {
+void avx512_qselect<uint64_t>(uint64_t *arr, int64_t k, int64_t arrsize,
+                              bool hasnan) {
     if (arrsize > 1) {
         qselect_64bit_<zmm_vector<uint64_t>, uint64_t>(
             arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize));
@@ -799,12 +801,15 @@ void avx512_qselect<uint64_t>(uint64_t *arr, int64_t k, int64_t arrsize) {
 }
 
 template <>
-void avx512_qselect<double>(double *arr, int64_t k, int64_t arrsize) {
-    if (arrsize > 1) {
-        int64_t nan_count = replace_nan_with_inf(arr, arrsize);
-        qselect_64bit_<zmm_vector<double>, double>(arr, k, 0, arrsize - 1,
-                                                   2 * (int64_t)log2(arrsize));
-        replace_inf_with_nan(arr, arrsize, nan_count);
+void avx512_qselect<double>(double *arr, int64_t k, int64_t arrsize,
+                            bool hasnan) {
+    int64_t indx_last_elem = arrsize - 1;
+    if (UNLIKELY(hasnan)) {
+        indx_last_elem = move_nans_to_end_of_array(arr, arrsize);
+    }
+    if (indx_last_elem >= k) {
+        qselect_64bit_<zmm_vector<double>, double>(
+            arr, k, 0, indx_last_elem, 2 * (int64_t)log2(indx_last_elem));
     }
 }
 
diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h b/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h
index 3c5806db607d2..9eb09689f72c2 100644
--- a/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h
+++ b/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h
@@ -56,12 +56,11 @@
  *
  */
 
-#include <immintrin.h>
-
 #include <algorithm>
 #include <cmath>
 #include <cstdint>
 #include <cstring>
+#include <immintrin.h>
 #include <limits>
 
 #define X86_SIMD_SORT_INFINITY std::numeric_limits<double>::infinity()
@@ -107,6 +106,9 @@
 #define X86_SIMD_SORT_FINLINE static
 #endif
 
+#define LIKELY(x) __builtin_expect((x), 1)
+#define UNLIKELY(x) __builtin_expect((x), 0)
+
 template <typename type>
 struct zmm_vector;
 
@@ -119,17 +121,19 @@ void avx512_qsort(T *arr, int64_t arrsize);
 void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize);
 
 template <typename T>
-void avx512_qselect(T *arr, int64_t k, int64_t arrsize);
-void avx512_qselect_fp16(uint16_t *arr, int64_t k, int64_t arrsize);
+void avx512_qselect(T *arr, int64_t k, int64_t arrsize, bool hasnan = false);
+void avx512_qselect_fp16(uint16_t *arr, int64_t k, int64_t arrsize,
+                         bool hasnan = false);
 
 template <typename T>
-inline void avx512_partial_qsort(T *arr, int64_t k, int64_t arrsize) {
-    avx512_qselect<T>(arr, k - 1, arrsize);
+inline void avx512_partial_qsort(T *arr, int64_t k, int64_t arrsize,
+                                 bool hasnan = false) {
+    avx512_qselect<T>(arr, k - 1, arrsize, hasnan);
     avx512_qsort<T>(arr, k - 1);
 }
-inline void avx512_partial_qsort_fp16(uint16_t *arr, int64_t k,
-                                      int64_t arrsize) {
-    avx512_qselect_fp16(arr, k - 1, arrsize);
+inline void avx512_partial_qsort_fp16(uint16_t *arr, int64_t k, int64_t arrsize,
+                                      bool hasnan = false) {
+    avx512_qselect_fp16(arr, k - 1, arrsize, hasnan);
     avx512_qsort_fp16(arr, k - 1);
 }
 
@@ -137,6 +141,32 @@ inline void avx512_partial_qsort_fp16(uint16_t *arr, int64_t k,
 template <typename T>
 void avx512_qsort_kv(T *keys, uint64_t *indexes, int64_t arrsize);
 
+template <typename T>
+bool is_a_nan(T elem) {
+    return std::isnan(elem);
+}
+
+/*
+ * Sort all the NAN's to end of the array and return the index of the last elem
+ * in the array which is not a nan
+ */
+template <typename T>
+int64_t move_nans_to_end_of_array(T *arr, int64_t arrsize) {
+    int64_t jj = arrsize - 1;
+    int64_t ii = 0;
+    int64_t count = 0;
+    while (ii <= jj) {
+        if (is_a_nan(arr[ii])) {
+            std::swap(arr[ii], arr[jj]);
+            jj -= 1;
+            count++;
+        } else {
+            ii += 1;
+        }
+    }
+    return arrsize - count - 1;
+}
+
 template <typename vtype, typename T = typename vtype::type_t>
 bool comparison_func(const T &a, const T &b) {
     return a < b;
diff --git a/src/java.base/share/classes/java/util/Arrays.java b/src/java.base/share/classes/java/util/Arrays.java
index cd1d19690562d..91bfbd6e5e388 100644
--- a/src/java.base/share/classes/java/util/Arrays.java
+++ b/src/java.base/share/classes/java/util/Arrays.java
@@ -30,6 +30,7 @@
 
 import java.io.Serializable;
 import java.lang.reflect.Array;
+import java.util.Arrays.NaturalOrder;
 import java.util.concurrent.ForkJoinPool;
 import java.util.function.BinaryOperator;
 import java.util.function.Consumer;
@@ -46,6 +47,7 @@
 import java.util.stream.LongStream;
 import java.util.stream.Stream;
 import java.util.stream.StreamSupport;
+import jdk.internal.misc.Unsafe;
 
 /**
  * This class contains various methods for manipulating arrays (such as
@@ -77,8 +79,18 @@ public final class Arrays {
     // Suppresses default constructor, ensuring non-instantiability.
     private Arrays() {}
 
+    /**
+     * Sorts the specified array into ascending numerical order.
+     *
+     *
+     * @param elemType the class of the array to be sorted
+     * @param array the array to be sorted
+     * @param offset the array offset
+     * @param fromIndex from Index
+     * @param toIndex to Index
+     */
     @IntrinsicCandidate
-    private static void arraySort(Class<?> elemType, Object array, int fromIndex, int toIndex) {
+    public static void arraySort(Class<?> elemType, Object array, long offset, int fromIndex, int toIndex) {
         if (elemType == int.class) DualPivotQuicksort.sort((int[]) array, 0, fromIndex, toIndex);
         else if (elemType == long.class) DualPivotQuicksort.sort((long[]) array, 0, fromIndex, toIndex);
         else if (elemType == float.class) DualPivotQuicksort.sort((float[]) array, 0, fromIndex, toIndex);
@@ -105,7 +117,8 @@ private static void arraySort(Class<?> elemType, Object array, int fromIndex, in
      * @param a the array to be sorted
      */
     public static void sort(int[] a) {
-        arraySort(int.class, a, 0, a.length);
+        int offset = Unsafe.ARRAY_INT_BASE_OFFSET;
+        arraySort(int.class, a, offset, 0, a.length);
     }
 
     /**
@@ -129,7 +142,8 @@ public static void sort(int[] a) {
      */
     public static void sort(int[] a, int fromIndex, int toIndex) {
         rangeCheck(a.length, fromIndex, toIndex);
-        arraySort(int.class, a, fromIndex, toIndex);
+        long offset = Unsafe.ARRAY_INT_BASE_OFFSET + ((long) fromIndex << ArraysSupport.LOG2_ARRAY_INT_INDEX_SCALE); // widen first: an int shift overflows for large fromIndex
+        arraySort(int.class, a, offset, fromIndex, toIndex);
     }
 
     /**
@@ -143,7 +157,8 @@ public static void sort(int[] a, int fromIndex, int toIndex) {
      * @param a the array to be sorted
      */
     public static void sort(long[] a) {
-        arraySort(long.class, a, 0, a.length);
+        int offset = Unsafe.ARRAY_LONG_BASE_OFFSET;
+        arraySort(long.class, a, offset, 0, a.length);
     }
 
     /**
@@ -167,7 +182,8 @@ public static void sort(long[] a) {
      */
     public static void sort(long[] a, int fromIndex, int toIndex) {
         rangeCheck(a.length, fromIndex, toIndex);
-        arraySort(long.class, a, fromIndex, toIndex);
+        long offset = Unsafe.ARRAY_LONG_BASE_OFFSET + ((long) fromIndex << ArraysSupport.LOG2_ARRAY_LONG_INDEX_SCALE); // widen first: an int shift overflows for large fromIndex
+        arraySort(long.class, a, offset, fromIndex, toIndex);
     }
 
     /**
@@ -303,7 +319,8 @@ public static void sort(byte[] a, int fromIndex, int toIndex) {
      * @param a the array to be sorted
      */
     public static void sort(float[] a) {
-        arraySort(float.class, a, 0, a.length);
+        int offset = Unsafe.ARRAY_FLOAT_BASE_OFFSET;
+        arraySort(float.class, a, offset, 0, a.length);
     }
 
     /**
@@ -335,7 +352,8 @@ public static void sort(float[] a) {
      */
     public static void sort(float[] a, int fromIndex, int toIndex) {
         rangeCheck(a.length, fromIndex, toIndex);
-        arraySort(float.class, a, fromIndex, toIndex);
+        long offset = Unsafe.ARRAY_FLOAT_BASE_OFFSET + ((long) fromIndex << ArraysSupport.LOG2_ARRAY_FLOAT_INDEX_SCALE); // widen first: an int shift overflows for large fromIndex
+        arraySort(float.class, a, offset, fromIndex, toIndex);
     }
 
     /**
@@ -357,7 +375,8 @@ public static void sort(float[] a, int fromIndex, int toIndex) {
      * @param a the array to be sorted
      */
     public static void sort(double[] a) {
-        arraySort(double.class, a, 0, a.length);
+        int offset = Unsafe.ARRAY_DOUBLE_BASE_OFFSET;
+        arraySort(double.class, a, offset, 0, a.length);
     }
 
     /**
@@ -389,7 +408,8 @@ public static void sort(double[] a) {
      */
     public static void sort(double[] a, int fromIndex, int toIndex) {
         rangeCheck(a.length, fromIndex, toIndex);
-        arraySort(double.class, a, fromIndex, toIndex);
+        long offset = Unsafe.ARRAY_DOUBLE_BASE_OFFSET + ((long) fromIndex << ArraysSupport.LOG2_ARRAY_DOUBLE_INDEX_SCALE); // widen first: an int shift overflows for large fromIndex
+        arraySort(double.class, a, offset, fromIndex, toIndex);
     }
 
     /**

From 5eac7b327a2e32c27285954457c06b1e44b84756 Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Tue, 25 Jul 2023 12:43:51 -0700
Subject: [PATCH 09/40] update arraySort docstring

---
 src/java.base/share/classes/java/util/Arrays.java | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/java.base/share/classes/java/util/Arrays.java b/src/java.base/share/classes/java/util/Arrays.java
index 91bfbd6e5e388..bd198960ea2b9 100644
--- a/src/java.base/share/classes/java/util/Arrays.java
+++ b/src/java.base/share/classes/java/util/Arrays.java
@@ -85,9 +85,11 @@ private Arrays() {}
      *
      * @param elemType the class of the array to be sorted
      * @param array the array to be sorted
-     * @param offset the array offset
-     * @param fromIndex from Index
-     * @param toIndex to Index
+     * @param offset the relative offset, in bytes, from the base address of
+     * the array to sort, otherwise if the array is {@code null},an absolute
+     * address pointing to the first element to sort from.
+     * @param fromIndex the index of the first element, inclusive, to be sorted
+     * @param toIndex the index of the last element, exclusive, to be sorted
      */
     @IntrinsicCandidate
     public static void arraySort(Class<?> elemType, Object array, long offset, int fromIndex, int toIndex) {

From 240fde1840e9f55500cfbf4bc8b067b69786d5fe Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Tue, 25 Jul 2023 13:24:05 -0700
Subject: [PATCH 10/40] add special cases to float and double arrays

---
 .../org/openjdk/bench/java/util/ArraysSort.java     | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
index cfe59e34e801e..a77e025602289 100644
--- a/test/micro/org/openjdk/bench/java/util/ArraysSort.java
+++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
@@ -80,11 +80,20 @@ public void setup() throws UnsupportedEncodingException, ClassNotFoundException,
         floats_unsorted = new float[size];
         doubles_unsorted = new double[size];
 
+        float[] floatSpecialCases = {+0.0f, -0.0f, Float.POSITIVE_INFINITY, Float.NEGATIVE_INFINITY, Float.NaN};
+        double[] doubleSpecialCases = {+0.0, -0.0, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, Double.NaN};
+
         for (int i = 0; i < size; i++) {
             ints_unsorted[i] = rnd.nextInt();
             longs_unsorted[i] = rnd.nextLong();
-            floats_unsorted[i] = rnd.nextFloat();
-            doubles_unsorted[i] = rnd.nextDouble();
+            if (i % 10 != 0) {
+                floats_unsorted[i] = rnd.nextFloat();
+                doubles_unsorted[i] = rnd.nextDouble();
+            } else {
+                int rndIdx = rnd.nextInt(doubleSpecialCases.length);
+                floats_unsorted[i] = floatSpecialCases[rndIdx];
+                doubles_unsorted[i] = doubleSpecialCases[rndIdx];
+            }
         }
     }
 

From 17b51270a84beee1e053f6ab031b5ecc5706303c Mon Sep 17 00:00:00 2001
From: Srinivas Vamsi Parasa <srinivas.vamsi.parasa@intel.com>
Date: Tue, 1 Aug 2023 10:46:33 -0700
Subject: [PATCH 11/40] Update
 src/java.base/share/classes/java/util/Arrays.java

Co-authored-by: David Schlosnagle <schlosna@gmail.com>
---
 src/java.base/share/classes/java/util/Arrays.java | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/java.base/share/classes/java/util/Arrays.java b/src/java.base/share/classes/java/util/Arrays.java
index bd198960ea2b9..13def8cd35261 100644
--- a/src/java.base/share/classes/java/util/Arrays.java
+++ b/src/java.base/share/classes/java/util/Arrays.java
@@ -93,11 +93,14 @@ private Arrays() {}
      */
     @IntrinsicCandidate
     public static void arraySort(Class<?> elemType, Object array, long offset, int fromIndex, int toIndex) {
-        if (elemType == int.class) DualPivotQuicksort.sort((int[]) array, 0, fromIndex, toIndex);
-        else if (elemType == long.class) DualPivotQuicksort.sort((long[]) array, 0, fromIndex, toIndex);
-        else if (elemType == float.class) DualPivotQuicksort.sort((float[]) array, 0, fromIndex, toIndex);
-        else if (elemType == double.class) DualPivotQuicksort.sort((double[]) array, 0, fromIndex, toIndex);
-        else throw new UnsupportedOperationException("arraySort intrinsic not supported for this type: " + elemType.toString());
+        switch (array) {
+            case int[] arr -> DualPivotQuicksort.sort(arr, 0, fromIndex, toIndex);
+            case long[] arr -> DualPivotQuicksort.sort(arr, 0, fromIndex, toIndex);
+            case float[] arr -> DualPivotQuicksort.sort(arr, 0, fromIndex, toIndex);
+            case double[] arr -> DualPivotQuicksort.sort(arr, 0, fromIndex, toIndex);
+            default -> throw new UnsupportedOperationException(
+                    "arraySort intrinsic not supported for this type: " + elemType);
+        }
     }
 
     /*

From a2e14d45b24af5412fc52129ab5f9953e87a63d6 Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Fri, 4 Aug 2023 11:18:30 -0700
Subject: [PATCH 12/40] fix arraySort API and fastdebug issue

---
 src/hotspot/share/opto/escape.cpp                 | 1 +
 src/java.base/share/classes/java/util/Arrays.java | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/hotspot/share/opto/escape.cpp b/src/hotspot/share/opto/escape.cpp
index 138511cffc9ca..eed159ff6c8e7 100644
--- a/src/hotspot/share/opto/escape.cpp
+++ b/src/hotspot/share/opto/escape.cpp
@@ -1190,6 +1190,7 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
                   strcmp(call->as_CallLeaf()->_name, "bigIntegerRightShiftWorker") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "bigIntegerLeftShiftWorker") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "vectorizedMismatch") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "arraysort_stub") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "get_class_id_intrinsic") == 0)
                  ))) {
             call->dump();
diff --git a/src/java.base/share/classes/java/util/Arrays.java b/src/java.base/share/classes/java/util/Arrays.java
index 13def8cd35261..d5ce85d336074 100644
--- a/src/java.base/share/classes/java/util/Arrays.java
+++ b/src/java.base/share/classes/java/util/Arrays.java
@@ -83,7 +83,7 @@ private Arrays() {}
      * Sorts the specified array into ascending numerical order.
      *
      *
-     * @param elemType the class of the array to be sorted
+     * @param elemType the class of the elements of the array to be sorted
      * @param array the array to be sorted
      * @param offset the relative offset, in bytes, from the base address of
      * the array to sort, otherwise if the array is {@code null},an absolute
@@ -92,7 +92,7 @@ private Arrays() {}
      * @param toIndex the index of the last element, exclusive, to be sorted
      */
     @IntrinsicCandidate
-    public static void arraySort(Class<?> elemType, Object array, long offset, int fromIndex, int toIndex) {
+    private static void arraySort(Class<?> elemType, Object array, long offset, int fromIndex, int toIndex) {
         switch (array) {
             case int[] arr -> DualPivotQuicksort.sort(arr, 0, fromIndex, toIndex);
             case long[] arr -> DualPivotQuicksort.sort(arr, 0, fromIndex, toIndex);

From 7065f1cf6126224c7f64f3f26fc4446df0913995 Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Fri, 4 Aug 2023 11:34:30 -0700
Subject: [PATCH 13/40] moved stubroutines definitions to vmStructs_jvmci.cpp

---
 src/hotspot/share/jvmci/vmStructs_jvmci.cpp | 4 ++++
 src/hotspot/share/runtime/vmStructs.cpp     | 4 ----
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp
index a4195a04f1866..86753a501d2e7 100644
--- a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp
+++ b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp
@@ -327,6 +327,10 @@
   static_field(StubRoutines,                _checkcast_arraycopy_uninit,                      address)                               \
   static_field(StubRoutines,                _unsafe_arraycopy,                                address)                               \
   static_field(StubRoutines,                _generic_arraycopy,                               address)                               \
+  static_field(StubRoutines,                _arraysort_int,                                   address)                               \
+  static_field(StubRoutines,                _arraysort_long,                                  address)                               \
+  static_field(StubRoutines,                _arraysort_float,                                 address)                               \
+  static_field(StubRoutines,                _arraysort_double,                                address)                               \
                                                                                                                                      \
   static_field(StubRoutines,                _aescrypt_encryptBlock,                           address)                               \
   static_field(StubRoutines,                _aescrypt_decryptBlock,                           address)                               \
diff --git a/src/hotspot/share/runtime/vmStructs.cpp b/src/hotspot/share/runtime/vmStructs.cpp
index 54f500f502a1d..537f063a2b13d 100644
--- a/src/hotspot/share/runtime/vmStructs.cpp
+++ b/src/hotspot/share/runtime/vmStructs.cpp
@@ -529,10 +529,6 @@
   /***********************************/                                                                                              \
                                                                                                                                      \
      static_field(StubRoutines,                _call_stub_return_address,                     address)                               \
-     static_field(StubRoutines,                _arraysort_int,                                address)                               \
-     static_field(StubRoutines,                _arraysort_long,                               address)                               \
-     static_field(StubRoutines,                _arraysort_float,                              address)                               \
-     static_field(StubRoutines,                _arraysort_double,                             address)                               \
                                                                                                                                      \
   /***************************************/                                                                                          \
   /* PcDesc and other compiled code info */                                                                                          \

From 37f3c52728e649c72ffd3c2147af9db05a4d128e Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Fri, 4 Aug 2023 15:26:01 -0700
Subject: [PATCH 14/40] Update avx512 sort, benchmarks, shenandoahSupport

---
 make/modules/java.base/Lib.gmk                |   4 +-
 .../gc/shenandoah/c2/shenandoahSupport.cpp    |   3 +
 .../libavx512_x86_64/avx512-32bit-qsort.hpp   |  57 ------
 .../libavx512_x86_64/avx512-64bit-common.h    | 152 ---------------
 .../libavx512_x86_64/avx512-64bit-qsort.hpp   |  68 -------
 .../libavx512_x86_64/avx512-common-qsort.h    | 184 ------------------
 .../libavx512_x86_64/avxsort_linux_x86.cpp    |   1 +
 .../openjdk/bench/java/util/ArraysSort.java   |  60 +++++-
 8 files changed, 56 insertions(+), 473 deletions(-)

diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk
index 0e0f7b2a134c1..1e86c2541e59a 100644
--- a/make/modules/java.base/Lib.gmk
+++ b/make/modules/java.base/Lib.gmk
@@ -240,8 +240,8 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2)
   $(eval $(call SetupJdkLibrary, BUILD_LIBAVX512_X86_64, \
       NAME := avx512_x86_64, \
       OPTIMIZATION := HIGH, \
-      CFLAGS := $(CFLAGS_JDKLIB) -mavx512f -mavx512dq, \
-      CXXFLAGS := $(CXXFLAGS_JDKLIB) -mavx512f -mavx512dq, \
+      CFLAGS := $(CFLAGS_JDKLIB), \
+      CXXFLAGS := $(CXXFLAGS_JDKLIB), \
       LDFLAGS := $(LDFLAGS_JDKLIB) \
           $(call SET_SHARED_LIBRARY_ORIGIN), \
       LDFLAGS_linux := -Wl$(COMMA)--no-as-needed, \
diff --git a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp
index 71068f76043c9..10b2fe13ab529 100644
--- a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp
+++ b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp
@@ -387,6 +387,9 @@ void ShenandoahBarrierC2Support::verify(RootNode* root) {
           verify_type t;
         } args[6];
       } calls[] = {
+        "arraysort_stub", // leaf call inputs: array address (sorted in place, hence Store), then int fromIndex/toIndex (not oops)
+        { { TypeFunc::Parms, ShenandoahStore },  { -1,  ShenandoahNone},                 { -1,  ShenandoahNone},
+          { -1,  ShenandoahNone},                 { -1,  ShenandoahNone},                 { -1,  ShenandoahNone} },
         "aescrypt_encryptBlock",
         { { TypeFunc::Parms, ShenandoahLoad },   { TypeFunc::Parms+1, ShenandoahStore },  { TypeFunc::Parms+2, ShenandoahLoad },
           { -1,  ShenandoahNone},                 { -1,  ShenandoahNone},                 { -1,  ShenandoahNone} },
diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp
index 9c8f1af6a9caf..2b4525b22a4ce 100644
--- a/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp
+++ b/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp
@@ -496,34 +496,7 @@ static void qsort_32bit_(type_t *arr, int64_t left, int64_t right,
         qsort_32bit_<vtype>(arr, pivot_index, right, max_iters - 1);
 }
 
-template <typename vtype, typename type_t>
-static void qselect_32bit_(type_t *arr, int64_t pos, int64_t left,
-                           int64_t right, int64_t max_iters) {
-    /*
-     * Resort to std::sort if quicksort isnt making any progress
-     */
-    if (max_iters <= 0) {
-        std::sort(arr + left, arr + right + 1);
-        return;
-    }
-    /*
-     * Base case: use bitonic networks to sort arrays <= 128
-     */
-    if (right + 1 - left <= 128) {
-        sort_128_32bit<vtype>(arr + left, (int32_t)(right + 1 - left));
-        return;
-    }
 
-    type_t pivot = get_pivot_32bit<vtype>(arr, left, right);
-    type_t smallest = vtype::type_max();
-    type_t biggest = vtype::type_min();
-    int64_t pivot_index = partition_avx512_unrolled<vtype, 2>(
-        arr, left, right + 1, pivot, &smallest, &biggest);
-    if ((pivot != smallest) && (pos < pivot_index))
-        qselect_32bit_<vtype>(arr, pos, left, pivot_index - 1, max_iters - 1);
-    else if ((pivot != biggest) && (pos >= pivot_index))
-        qselect_32bit_<vtype>(arr, pos, pivot_index, right, max_iters - 1);
-}
 
 X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize) {
     int64_t nan_count = 0;
@@ -550,36 +523,6 @@ X86_SIMD_SORT_INLINE void replace_inf_with_nan(float *arr, int64_t arrsize,
     }
 }
 
-template <>
-void avx512_qselect<int32_t>(int32_t *arr, int64_t k, int64_t arrsize,
-                             bool hasnan) {
-    if (arrsize > 1) {
-        qselect_32bit_<zmm_vector<int32_t>, int32_t>(
-            arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize));
-    }
-}
-
-template <>
-void avx512_qselect<uint32_t>(uint32_t *arr, int64_t k, int64_t arrsize,
-                              bool hasnan) {
-    if (arrsize > 1) {
-        qselect_32bit_<zmm_vector<uint32_t>, uint32_t>(
-            arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize));
-    }
-}
-
-template <>
-void avx512_qselect<float>(float *arr, int64_t k, int64_t arrsize,
-                           bool hasnan) {
-    int64_t indx_last_elem = arrsize - 1;
-    if (UNLIKELY(hasnan)) {
-        indx_last_elem = move_nans_to_end_of_array(arr, arrsize);
-    }
-    if (indx_last_elem >= k) {
-        qselect_32bit_<zmm_vector<float>, float>(
-            arr, k, 0, indx_last_elem, 2 * (int64_t)log2(indx_last_elem));
-    }
-}
 
 template <>
 void avx512_qsort<int32_t>(int32_t *arr, int64_t arrsize) {
diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h
index c435d100e7579..b8b17c68afbdf 100644
--- a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h
+++ b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h
@@ -142,96 +142,6 @@ struct ymm_vector<float> {
     }
 };
 template <>
-struct ymm_vector<uint32_t> {
-    using type_t = uint32_t;
-    using zmm_t = __m256i;
-    using zmmi_t = __m256i;
-    using opmask_t = __mmask8;
-    static const uint8_t numlanes = 8;
-
-    static type_t type_max() { return X86_SIMD_SORT_MAX_UINT32; }
-    static type_t type_min() { return 0; }
-    static zmm_t zmm_max() { return _mm256_set1_epi32(type_max()); }
-
-    static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
-                       int v8) {
-        return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8);
-    }
-    static opmask_t kxor_opmask(opmask_t x, opmask_t y) {
-        return _kxor_mask8(x, y);
-    }
-    static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); }
-    static opmask_t le(zmm_t x, zmm_t y) {
-        return _mm256_cmp_epu32_mask(x, y, _MM_CMPINT_LE);
-    }
-    static opmask_t ge(zmm_t x, zmm_t y) {
-        return _mm256_cmp_epu32_mask(x, y, _MM_CMPINT_NLT);
-    }
-    static opmask_t eq(zmm_t x, zmm_t y) {
-        return _mm256_cmp_epu32_mask(x, y, _MM_CMPINT_EQ);
-    }
-    template <int scale>
-    static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index,
-                                void const *base) {
-        return _mm512_mask_i64gather_epi32(src, mask, index, base, scale);
-    }
-    template <int scale>
-    static zmm_t i64gather(__m512i index, void const *base) {
-        return _mm512_i64gather_epi32(index, base, scale);
-    }
-    static zmm_t loadu(void const *mem) {
-        return _mm256_loadu_si256((__m256i *)mem);
-    }
-    static zmm_t max(zmm_t x, zmm_t y) { return _mm256_max_epu32(x, y); }
-    static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
-        return _mm256_mask_compressstoreu_epi32(mem, mask, x);
-    }
-    static zmm_t maskz_loadu(opmask_t mask, void const *mem) {
-        return _mm256_maskz_loadu_epi32(mask, mem);
-    }
-    static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
-        return _mm256_mask_loadu_epi32(x, mask, mem);
-    }
-    static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
-        return _mm256_mask_mov_epi32(x, mask, y);
-    }
-    static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
-        return _mm256_mask_storeu_epi32(mem, mask, x);
-    }
-    static zmm_t min(zmm_t x, zmm_t y) { return _mm256_min_epu32(x, y); }
-    static zmm_t permutexvar(__m256i idx, zmm_t zmm) {
-        return _mm256_permutexvar_epi32(idx, zmm);
-    }
-    static type_t reducemax(zmm_t v) {
-        __m128i v128 = _mm_max_epu32(_mm256_castsi256_si128(v),
-                                     _mm256_extracti128_si256(v, 1));
-        __m128i v64 = _mm_max_epu32(
-            v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(1, 0, 3, 2)));
-        __m128i v32 =
-            _mm_max_epu32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1)));
-        return (type_t)_mm_cvtsi128_si32(v32);
-    }
-    static type_t reducemin(zmm_t v) {
-        __m128i v128 = _mm_min_epu32(_mm256_castsi256_si128(v),
-                                     _mm256_extracti128_si256(v, 1));
-        __m128i v64 = _mm_min_epu32(
-            v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(1, 0, 3, 2)));
-        __m128i v32 =
-            _mm_min_epu32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1)));
-        return (type_t)_mm_cvtsi128_si32(v32);
-    }
-    static zmm_t set1(type_t v) { return _mm256_set1_epi32(v); }
-    template <uint8_t mask, bool = (mask == 0b01010101)>
-    static zmm_t shuffle(zmm_t zmm) {
-        /* Hack!: have to make shuffles within 128-bit lanes work for both
-         * 32-bit and 64-bit */
-        return _mm256_shuffle_epi32(zmm, 0b10110001);
-    }
-    static void storeu(void *mem, zmm_t x) {
-        _mm256_storeu_si256((__m256i *)mem, x);
-    }
-};
-template <>
 struct ymm_vector<int32_t> {
     using type_t = int32_t;
     using zmm_t = __m256i;
@@ -397,68 +307,6 @@ struct zmm_vector<int64_t> {
     static void storeu(void *mem, zmm_t x) { _mm512_storeu_si512(mem, x); }
 };
 template <>
-struct zmm_vector<uint64_t> {
-    using type_t = uint64_t;
-    using zmm_t = __m512i;
-    using zmmi_t = __m512i;
-    using ymm_t = __m512i;
-    using opmask_t = __mmask8;
-    static const uint8_t numlanes = 8;
-
-    static type_t type_max() { return X86_SIMD_SORT_MAX_UINT64; }
-    static type_t type_min() { return 0; }
-    static zmm_t zmm_max() { return _mm512_set1_epi64(type_max()); }
-
-    static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
-                       int v8) {
-        return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8);
-    }
-    template <int scale>
-    static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index,
-                                void const *base) {
-        return _mm512_mask_i64gather_epi64(src, mask, index, base, scale);
-    }
-    template <int scale>
-    static zmm_t i64gather(__m512i index, void const *base) {
-        return _mm512_i64gather_epi64(index, base, scale);
-    }
-    static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); }
-    static opmask_t ge(zmm_t x, zmm_t y) {
-        return _mm512_cmp_epu64_mask(x, y, _MM_CMPINT_NLT);
-    }
-    static opmask_t eq(zmm_t x, zmm_t y) {
-        return _mm512_cmp_epu64_mask(x, y, _MM_CMPINT_EQ);
-    }
-    static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); }
-    static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epu64(x, y); }
-    static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
-        return _mm512_mask_compressstoreu_epi64(mem, mask, x);
-    }
-    static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
-        return _mm512_mask_loadu_epi64(x, mask, mem);
-    }
-    static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
-        return _mm512_mask_mov_epi64(x, mask, y);
-    }
-    static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
-        return _mm512_mask_storeu_epi64(mem, mask, x);
-    }
-    static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epu64(x, y); }
-    static zmm_t permutexvar(__m512i idx, zmm_t zmm) {
-        return _mm512_permutexvar_epi64(idx, zmm);
-    }
-    static type_t reducemax(zmm_t v) { return _mm512_reduce_max_epu64(v); }
-    static type_t reducemin(zmm_t v) { return _mm512_reduce_min_epu64(v); }
-    static zmm_t set1(type_t v) { return _mm512_set1_epi64(v); }
-    template <uint8_t mask>
-    static zmm_t shuffle(zmm_t zmm) {
-        __m512d temp = _mm512_castsi512_pd(zmm);
-        return _mm512_castpd_si512(
-            _mm512_shuffle_pd(temp, temp, (_MM_PERM_ENUM)mask));
-    }
-    static void storeu(void *mem, zmm_t x) { _mm512_storeu_si512(mem, x); }
-};
-template <>
 struct zmm_vector<double> {
     using type_t = double;
     using zmm_t = __m512d;
diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp
index 2d01663923a9b..45497e268a3c5 100644
--- a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp
+++ b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp
@@ -753,66 +753,6 @@ static void qsort_64bit_(type_t *arr, int64_t left, int64_t right,
         qsort_64bit_<vtype>(arr, pivot_index, right, max_iters - 1);
 }
 
-template <typename vtype, typename type_t>
-static void qselect_64bit_(type_t *arr, int64_t pos, int64_t left,
-                           int64_t right, int64_t max_iters) {
-    /*
-     * Resort to std::sort if quicksort isnt making any progress
-     */
-    if (max_iters <= 0) {
-        std::sort(arr + left, arr + right + 1);
-        return;
-    }
-    /*
-     * Base case: use bitonic networks to sort arrays <= 128
-     */
-    if (right + 1 - left <= 128) {
-        sort_128_64bit<vtype>(arr + left, (int32_t)(right + 1 - left));
-        return;
-    }
-
-    type_t pivot = get_pivot_64bit<vtype>(arr, left, right);
-    type_t smallest = vtype::type_max();
-    type_t biggest = vtype::type_min();
-    int64_t pivot_index = partition_avx512_unrolled<vtype, 8>(
-        arr, left, right + 1, pivot, &smallest, &biggest);
-    if ((pivot != smallest) && (pos < pivot_index))
-        qselect_64bit_<vtype>(arr, pos, left, pivot_index - 1, max_iters - 1);
-    else if ((pivot != biggest) && (pos >= pivot_index))
-        qselect_64bit_<vtype>(arr, pos, pivot_index, right, max_iters - 1);
-}
-
-template <>
-void avx512_qselect<int64_t>(int64_t *arr, int64_t k, int64_t arrsize,
-                             bool hasnan) {
-    if (arrsize > 1) {
-        qselect_64bit_<zmm_vector<int64_t>, int64_t>(
-            arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize));
-    }
-}
-
-template <>
-void avx512_qselect<uint64_t>(uint64_t *arr, int64_t k, int64_t arrsize,
-                              bool hasnan) {
-    if (arrsize > 1) {
-        qselect_64bit_<zmm_vector<uint64_t>, uint64_t>(
-            arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize));
-    }
-}
-
-template <>
-void avx512_qselect<double>(double *arr, int64_t k, int64_t arrsize,
-                            bool hasnan) {
-    int64_t indx_last_elem = arrsize - 1;
-    if (UNLIKELY(hasnan)) {
-        indx_last_elem = move_nans_to_end_of_array(arr, arrsize);
-    }
-    if (indx_last_elem >= k) {
-        qselect_64bit_<zmm_vector<double>, double>(
-            arr, k, 0, indx_last_elem, 2 * (int64_t)log2(indx_last_elem));
-    }
-}
-
 template <>
 void avx512_qsort<int64_t>(int64_t *arr, int64_t arrsize) {
     if (arrsize > 1) {
@@ -821,14 +761,6 @@ void avx512_qsort<int64_t>(int64_t *arr, int64_t arrsize) {
     }
 }
 
-template <>
-void avx512_qsort<uint64_t>(uint64_t *arr, int64_t arrsize) {
-    if (arrsize > 1) {
-        qsort_64bit_<zmm_vector<uint64_t>, uint64_t>(
-            arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize));
-    }
-}
-
 template <>
 void avx512_qsort<double>(double *arr, int64_t arrsize) {
     if (arrsize > 1) {
diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h b/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h
index 9eb09689f72c2..f2ad8b039070b 100644
--- a/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h
+++ b/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h
@@ -118,28 +118,6 @@ struct ymm_vector;
 // Regular quicksort routines:
 template <typename T>
 void avx512_qsort(T *arr, int64_t arrsize);
-void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize);
-
-template <typename T>
-void avx512_qselect(T *arr, int64_t k, int64_t arrsize, bool hasnan = false);
-void avx512_qselect_fp16(uint16_t *arr, int64_t k, int64_t arrsize,
-                         bool hasnan = false);
-
-template <typename T>
-inline void avx512_partial_qsort(T *arr, int64_t k, int64_t arrsize,
-                                 bool hasnan = false) {
-    avx512_qselect<T>(arr, k - 1, arrsize, hasnan);
-    avx512_qsort<T>(arr, k - 1);
-}
-inline void avx512_partial_qsort_fp16(uint16_t *arr, int64_t k, int64_t arrsize,
-                                      bool hasnan = false) {
-    avx512_qselect_fp16(arr, k - 1, arrsize, hasnan);
-    avx512_qsort_fp16(arr, k - 1);
-}
-
-// key-value sort routines
-template <typename T>
-void avx512_qsort_kv(T *keys, uint64_t *indexes, int64_t arrsize);
 
 template <typename T>
 bool is_a_nan(T elem) {
@@ -389,166 +367,4 @@ static inline int64_t partition_avx512_unrolled(type_t *arr, int64_t left,
     return l_store;
 }
 
-// Key-value sort helper functions
-
-template <typename vtype1, typename vtype2,
-          typename zmm_t1 = typename vtype1::zmm_t,
-          typename zmm_t2 = typename vtype2::zmm_t>
-static void COEX(zmm_t1 &key1, zmm_t1 &key2, zmm_t2 &index1, zmm_t2 &index2) {
-    zmm_t1 key_t1 = vtype1::min(key1, key2);
-    zmm_t1 key_t2 = vtype1::max(key1, key2);
-
-    zmm_t2 index_t1 =
-        vtype2::mask_mov(index2, vtype1::eq(key_t1, key1), index1);
-    zmm_t2 index_t2 =
-        vtype2::mask_mov(index1, vtype1::eq(key_t1, key1), index2);
-
-    key1 = key_t1;
-    key2 = key_t2;
-    index1 = index_t1;
-    index2 = index_t2;
-}
-template <typename vtype1, typename vtype2,
-          typename zmm_t1 = typename vtype1::zmm_t,
-          typename zmm_t2 = typename vtype2::zmm_t,
-          typename opmask_t = typename vtype1::opmask_t>
-static inline zmm_t1 cmp_merge(zmm_t1 in1, zmm_t1 in2, zmm_t2 &indexes1,
-                               zmm_t2 indexes2, opmask_t mask) {
-    zmm_t1 tmp_keys = cmp_merge<vtype1>(in1, in2, mask);
-    indexes1 = vtype2::mask_mov(indexes2, vtype1::eq(tmp_keys, in1), indexes1);
-    return tmp_keys;  // 0 -> min, 1 -> max
-}
-
-/*
- * Parition one ZMM register based on the pivot and returns the index of the
- * last element that is less than equal to the pivot.
- */
-template <typename vtype1, typename vtype2,
-          typename type_t1 = typename vtype1::type_t,
-          typename type_t2 = typename vtype2::type_t,
-          typename zmm_t1 = typename vtype1::zmm_t,
-          typename zmm_t2 = typename vtype2::zmm_t>
-static inline int32_t partition_vec(type_t1 *keys, type_t2 *indexes,
-                                    int64_t left, int64_t right,
-                                    const zmm_t1 keys_vec,
-                                    const zmm_t2 indexes_vec,
-                                    const zmm_t1 pivot_vec,
-                                    zmm_t1 *smallest_vec, zmm_t1 *biggest_vec) {
-    /* which elements are larger than the pivot */
-    typename vtype1::opmask_t gt_mask = vtype1::ge(keys_vec, pivot_vec);
-    int32_t amount_gt_pivot = _mm_popcnt_u32((int32_t)gt_mask);
-    vtype1::mask_compressstoreu(keys + left, vtype1::knot_opmask(gt_mask),
-                                keys_vec);
-    vtype1::mask_compressstoreu(keys + right - amount_gt_pivot, gt_mask,
-                                keys_vec);
-    vtype2::mask_compressstoreu(indexes + left, vtype2::knot_opmask(gt_mask),
-                                indexes_vec);
-    vtype2::mask_compressstoreu(indexes + right - amount_gt_pivot, gt_mask,
-                                indexes_vec);
-    *smallest_vec = vtype1::min(keys_vec, *smallest_vec);
-    *biggest_vec = vtype1::max(keys_vec, *biggest_vec);
-    return amount_gt_pivot;
-}
-/*
- * Parition an array based on the pivot and returns the index of the
- * last element that is less than equal to the pivot.
- */
-template <typename vtype1, typename vtype2,
-          typename type_t1 = typename vtype1::type_t,
-          typename type_t2 = typename vtype2::type_t,
-          typename zmm_t1 = typename vtype1::zmm_t,
-          typename zmm_t2 = typename vtype2::zmm_t>
-static inline int64_t partition_avx512(type_t1 *keys, type_t2 *indexes,
-                                       int64_t left, int64_t right,
-                                       type_t1 pivot, type_t1 *smallest,
-                                       type_t1 *biggest) {
-    /* make array length divisible by vtype1::numlanes , shortening the array */
-    for (int32_t i = (right - left) % vtype1::numlanes; i > 0; --i) {
-        *smallest = std::min(*smallest, keys[left]);
-        *biggest = std::max(*biggest, keys[left]);
-        if (keys[left] > pivot) {
-            right--;
-            std::swap(keys[left], keys[right]);
-            std::swap(indexes[left], indexes[right]);
-        } else {
-            ++left;
-        }
-    }
-
-    if (left == right)
-        return left; /* less than vtype1::numlanes elements in the array */
-
-    zmm_t1 pivot_vec = vtype1::set1(pivot);
-    zmm_t1 min_vec = vtype1::set1(*smallest);
-    zmm_t1 max_vec = vtype1::set1(*biggest);
-
-    if (right - left == vtype1::numlanes) {
-        zmm_t1 keys_vec = vtype1::loadu(keys + left);
-        int32_t amount_gt_pivot;
-
-        zmm_t2 indexes_vec = vtype2::loadu(indexes + left);
-        amount_gt_pivot = partition_vec<vtype1, vtype2>(
-            keys, indexes, left, left + vtype1::numlanes, keys_vec, indexes_vec,
-            pivot_vec, &min_vec, &max_vec);
-
-        *smallest = vtype1::reducemin(min_vec);
-        *biggest = vtype1::reducemax(max_vec);
-        return left + (vtype1::numlanes - amount_gt_pivot);
-    }
-
-    // first and last vtype1::numlanes values are partitioned at the end
-    zmm_t1 keys_vec_left = vtype1::loadu(keys + left);
-    zmm_t1 keys_vec_right = vtype1::loadu(keys + (right - vtype1::numlanes));
-    zmm_t2 indexes_vec_left;
-    zmm_t2 indexes_vec_right;
-    indexes_vec_left = vtype2::loadu(indexes + left);
-    indexes_vec_right = vtype2::loadu(indexes + (right - vtype1::numlanes));
-
-    // store points of the vectors
-    int64_t r_store = right - vtype1::numlanes;
-    int64_t l_store = left;
-    // indices for loading the elements
-    left += vtype1::numlanes;
-    right -= vtype1::numlanes;
-    while (right - left != 0) {
-        zmm_t1 keys_vec;
-        zmm_t2 indexes_vec;
-        /*
-         * if fewer elements are stored on the right side of the array,
-         * then next elements are loaded from the right side,
-         * otherwise from the left side
-         */
-        if ((r_store + vtype1::numlanes) - right < left - l_store) {
-            right -= vtype1::numlanes;
-            keys_vec = vtype1::loadu(keys + right);
-            indexes_vec = vtype2::loadu(indexes + right);
-        } else {
-            keys_vec = vtype1::loadu(keys + left);
-            indexes_vec = vtype2::loadu(indexes + left);
-            left += vtype1::numlanes;
-        }
-        // partition the current vector and save it on both sides of the array
-        int32_t amount_gt_pivot;
-
-        amount_gt_pivot = partition_vec<vtype1, vtype2>(
-            keys, indexes, l_store, r_store + vtype1::numlanes, keys_vec,
-            indexes_vec, pivot_vec, &min_vec, &max_vec);
-        r_store -= amount_gt_pivot;
-        l_store += (vtype1::numlanes - amount_gt_pivot);
-    }
-
-    /* partition and save vec_left and vec_right */
-    int32_t amount_gt_pivot;
-    amount_gt_pivot = partition_vec<vtype1, vtype2>(
-        keys, indexes, l_store, r_store + vtype1::numlanes, keys_vec_left,
-        indexes_vec_left, pivot_vec, &min_vec, &max_vec);
-    l_store += (vtype1::numlanes - amount_gt_pivot);
-    amount_gt_pivot = partition_vec<vtype1, vtype2>(
-        keys, indexes, l_store, l_store + vtype1::numlanes, keys_vec_right,
-        indexes_vec_right, pivot_vec, &min_vec, &max_vec);
-    l_store += (vtype1::numlanes - amount_gt_pivot);
-    *smallest = vtype1::reducemin(min_vec);
-    *biggest = vtype1::reducemax(max_vec);
-    return l_store;
-}
 #endif  // AVX512_QSORT_COMMON
diff --git a/src/java.base/linux/native/libavx512_x86_64/avxsort_linux_x86.cpp b/src/java.base/linux/native/libavx512_x86_64/avxsort_linux_x86.cpp
index ec436bb49eee6..67d6285cea552 100644
--- a/src/java.base/linux/native/libavx512_x86_64/avxsort_linux_x86.cpp
+++ b/src/java.base/linux/native/libavx512_x86_64/avxsort_linux_x86.cpp
@@ -24,6 +24,7 @@
  *
  */
 
+#pragma GCC target("avx512dq", "avx512f")
 #include "avx512-32bit-qsort.hpp"
 #include "avx512-64bit-qsort.hpp"
 
diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
index a77e025602289..d5c7953d51492 100644
--- a/test/micro/org/openjdk/bench/java/util/ArraysSort.java
+++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
@@ -36,7 +36,6 @@
 import org.openjdk.jmh.annotations.Level;
 import org.openjdk.jmh.annotations.Warmup;
 import org.openjdk.jmh.infra.Blackhole;
-
 import java.util.Arrays;
 import java.util.Random;
 import java.util.concurrent.TimeUnit;
@@ -51,15 +50,9 @@
 @BenchmarkMode(Mode.AverageTime)
 @OutputTimeUnit(TimeUnit.MICROSECONDS)
 @State(Scope.Thread)
-@Warmup(iterations = 3, time=60)
-@Measurement(iterations = 3, time=120)
 @Fork(value = 1)
 public class ArraysSort {
 
-
-    @Param({"10","25","50","75","100", "1000", "10000", "100000"})
-    private int size;
-
     private int[] ints_unsorted;
     private long[] longs_unsorted;
     private float[] floats_unsorted;
@@ -71,8 +64,7 @@ public class ArraysSort {
     private double[] doubles_sorted;
 
 
-    @Setup
-    public void setup() throws UnsupportedEncodingException, ClassNotFoundException, NoSuchMethodException, Throwable {
+    public void initialize(int size) {
         Random rnd = new Random(42);
 
         ints_unsorted = new int[size];
@@ -98,7 +90,7 @@ public void setup() throws UnsupportedEncodingException, ClassNotFoundException,
     }
 
     @Setup(Level.Invocation)
-    public void init() {
+    public void clear() {
         ints_sorted = ints_unsorted.clone();
         longs_sorted = longs_unsorted.clone();
         floats_sorted = floats_unsorted.clone();
@@ -129,4 +121,52 @@ public double[] doubleSort() throws Throwable {
         return doubles_sorted;
     }
 
+    @Warmup(iterations = 3, time=2)
+    @Measurement(iterations = 3, time=5)
+    public static class Small extends ArraysSort {
+        @Param({"10","25","50","75","100"})
+        private int size;
+
+        @Setup
+        public void setup() throws UnsupportedEncodingException, ClassNotFoundException, NoSuchMethodException, Throwable {
+            initialize(size);
+        }
+    }
+
+    @Warmup(iterations = 3, time=2)
+    @Measurement(iterations = 3, time=5)
+    public static class Medium extends ArraysSort {
+        @Param({"1000", "10000"})
+        private int size;
+
+        @Setup
+        public void setup() throws UnsupportedEncodingException, ClassNotFoundException, NoSuchMethodException, Throwable {
+            initialize(size);
+        }
+    }
+
+    @Warmup(iterations = 3, time=20)
+    @Measurement(iterations = 3, time=30)
+    public static class Large extends ArraysSort {
+        @Param({"50000", "100000"})
+        private int size;
+
+        @Setup
+        public void setup() throws UnsupportedEncodingException, ClassNotFoundException, NoSuchMethodException, Throwable {
+            initialize(size);
+        }
+    }
+
+    @Warmup(iterations = 3, time=120)
+    @Measurement(iterations = 3, time=30)
+    public static class VeryLarge extends ArraysSort {
+        @Param({"1000000"})
+        private int size;
+
+        @Setup
+        public void setup() throws UnsupportedEncodingException, ClassNotFoundException, NoSuchMethodException, Throwable {
+            initialize(size);
+        }
+    }
+
 }

From e0ffc81de2d8a6c61d8ad4d6591de7dc3358686a Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Fri, 4 Aug 2023 15:48:27 -0700
Subject: [PATCH 15/40] More avx512 sort cleanups

---
 .../libavx512_x86_64/avx512-32bit-qsort.hpp   |  66 ------
 .../libavx512_x86_64/avx512-64bit-common.h    | 192 ------------------
 2 files changed, 258 deletions(-)

diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp
index 2b4525b22a4ce..663a885305c2c 100644
--- a/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp
+++ b/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp
@@ -102,64 +102,6 @@ struct zmm_vector<int32_t> {
     static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epi32(x, y); }
 };
 template <>
-struct zmm_vector<uint32_t> {
-    using type_t = uint32_t;
-    using zmm_t = __m512i;
-    using ymm_t = __m256i;
-    using opmask_t = __mmask16;
-    static const uint8_t numlanes = 16;
-
-    static type_t type_max() { return X86_SIMD_SORT_MAX_UINT32; }
-    static type_t type_min() { return 0; }
-    static zmm_t zmm_max() {
-        return _mm512_set1_epi32(type_max());
-    }  // TODO: this should broadcast bits as is?
-
-    template <int scale>
-    static ymm_t i64gather(__m512i index, void const *base) {
-        return _mm512_i64gather_epi32(index, base, scale);
-    }
-    static zmm_t merge(ymm_t y1, ymm_t y2) {
-        zmm_t z1 = _mm512_castsi256_si512(y1);
-        return _mm512_inserti32x8(z1, y2, 1);
-    }
-    static opmask_t knot_opmask(opmask_t x) { return _mm512_knot(x); }
-    static opmask_t ge(zmm_t x, zmm_t y) {
-        return _mm512_cmp_epu32_mask(x, y, _MM_CMPINT_NLT);
-    }
-    static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); }
-    static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epu32(x, y); }
-    static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
-        return _mm512_mask_compressstoreu_epi32(mem, mask, x);
-    }
-    static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
-        return _mm512_mask_loadu_epi32(x, mask, mem);
-    }
-    static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
-        return _mm512_mask_mov_epi32(x, mask, y);
-    }
-    static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
-        return _mm512_mask_storeu_epi32(mem, mask, x);
-    }
-    static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epu32(x, y); }
-    static zmm_t permutexvar(__m512i idx, zmm_t zmm) {
-        return _mm512_permutexvar_epi32(idx, zmm);
-    }
-    static type_t reducemax(zmm_t v) { return _mm512_reduce_max_epu32(v); }
-    static type_t reducemin(zmm_t v) { return _mm512_reduce_min_epu32(v); }
-    static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); }
-    template <uint8_t mask>
-    static zmm_t shuffle(zmm_t zmm) {
-        return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask);
-    }
-    static void storeu(void *mem, zmm_t x) {
-        return _mm512_storeu_si512(mem, x);
-    }
-
-    static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epu32(x, y); }
-    static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epu32(x, y); }
-};
-template <>
 struct zmm_vector<float> {
     using type_t = float;
     using zmm_t = __m512;
@@ -532,14 +474,6 @@ void avx512_qsort<int32_t>(int32_t *arr, int64_t arrsize) {
     }
 }
 
-template <>
-void avx512_qsort<uint32_t>(uint32_t *arr, int64_t arrsize) {
-    if (arrsize > 1) {
-        qsort_32bit_<zmm_vector<uint32_t>, uint32_t>(
-            arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize));
-    }
-}
-
 template <>
 void avx512_qsort<float>(float *arr, int64_t arrsize) {
     if (arrsize > 1) {
diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h
index b8b17c68afbdf..527b4351eb7e7 100644
--- a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h
+++ b/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h
@@ -41,198 +41,6 @@
 #define NETWORK_64BIT_3 5, 4, 7, 6, 1, 0, 3, 2
 #define NETWORK_64BIT_4 3, 2, 1, 0, 7, 6, 5, 4
 
-template <>
-struct ymm_vector<float> {
-    using type_t = float;
-    using zmm_t = __m256;
-    using zmmi_t = __m256i;
-    using opmask_t = __mmask8;
-    static const uint8_t numlanes = 8;
-
-    static type_t type_max() { return X86_SIMD_SORT_INFINITYF; }
-    static type_t type_min() { return -X86_SIMD_SORT_INFINITYF; }
-    static zmm_t zmm_max() { return _mm256_set1_ps(type_max()); }
-
-    static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
-                       int v8) {
-        return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8);
-    }
-    static opmask_t kxor_opmask(opmask_t x, opmask_t y) {
-        return _kxor_mask8(x, y);
-    }
-    static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); }
-    static opmask_t le(zmm_t x, zmm_t y) {
-        return _mm256_cmp_ps_mask(x, y, _CMP_LE_OQ);
-    }
-    static opmask_t ge(zmm_t x, zmm_t y) {
-        return _mm256_cmp_ps_mask(x, y, _CMP_GE_OQ);
-    }
-    static opmask_t eq(zmm_t x, zmm_t y) {
-        return _mm256_cmp_ps_mask(x, y, _CMP_EQ_OQ);
-    }
-    template <int type>
-    static opmask_t fpclass(zmm_t x) {
-        return _mm256_fpclass_ps_mask(x, type);
-    }
-    template <int scale>
-    static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index,
-                                void const *base) {
-        return _mm512_mask_i64gather_ps(src, mask, index, base, scale);
-    }
-    template <int scale>
-    static zmm_t i64gather(__m512i index, void const *base) {
-        return _mm512_i64gather_ps(index, base, scale);
-    }
-    static zmm_t loadu(void const *mem) {
-        return _mm256_loadu_ps((float *)mem);
-    }
-    static zmm_t max(zmm_t x, zmm_t y) { return _mm256_max_ps(x, y); }
-    static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
-        return _mm256_mask_compressstoreu_ps(mem, mask, x);
-    }
-    static zmm_t maskz_loadu(opmask_t mask, void const *mem) {
-        return _mm256_maskz_loadu_ps(mask, mem);
-    }
-    static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
-        return _mm256_mask_loadu_ps(x, mask, mem);
-    }
-    static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
-        return _mm256_mask_mov_ps(x, mask, y);
-    }
-    static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
-        return _mm256_mask_storeu_ps(mem, mask, x);
-    }
-    static zmm_t min(zmm_t x, zmm_t y) { return _mm256_min_ps(x, y); }
-    static zmm_t permutexvar(__m256i idx, zmm_t zmm) {
-        return _mm256_permutexvar_ps(idx, zmm);
-    }
-    static type_t reducemax(zmm_t v) {
-        __m128 v128 =
-            _mm_max_ps(_mm256_castps256_ps128(v), _mm256_extractf32x4_ps(v, 1));
-        __m128 v64 = _mm_max_ps(
-            v128, _mm_shuffle_ps(v128, v128, _MM_SHUFFLE(1, 0, 3, 2)));
-        __m128 v32 =
-            _mm_max_ps(v64, _mm_shuffle_ps(v64, v64, _MM_SHUFFLE(0, 0, 0, 1)));
-        return _mm_cvtss_f32(v32);
-    }
-    static type_t reducemin(zmm_t v) {
-        __m128 v128 =
-            _mm_min_ps(_mm256_castps256_ps128(v), _mm256_extractf32x4_ps(v, 1));
-        __m128 v64 = _mm_min_ps(
-            v128, _mm_shuffle_ps(v128, v128, _MM_SHUFFLE(1, 0, 3, 2)));
-        __m128 v32 =
-            _mm_min_ps(v64, _mm_shuffle_ps(v64, v64, _MM_SHUFFLE(0, 0, 0, 1)));
-        return _mm_cvtss_f32(v32);
-    }
-    static zmm_t set1(type_t v) { return _mm256_set1_ps(v); }
-    template <uint8_t mask, bool = (mask == 0b01010101)>
-    static zmm_t shuffle(zmm_t zmm) {
-        /* Hack!: have to make shuffles within 128-bit lanes work for both
-         * 32-bit and 64-bit */
-        return _mm256_shuffle_ps(zmm, zmm, 0b10110001);
-        // if constexpr (mask == 0b01010101) {
-        // }
-        // else {
-        //     /* Not used, so far */
-        //     return _mm256_shuffle_ps(zmm, zmm, mask);
-        // }
-    }
-    static void storeu(void *mem, zmm_t x) {
-        _mm256_storeu_ps((float *)mem, x);
-    }
-};
-template <>
-struct ymm_vector<int32_t> {
-    using type_t = int32_t;
-    using zmm_t = __m256i;
-    using zmmi_t = __m256i;
-    using opmask_t = __mmask8;
-    static const uint8_t numlanes = 8;
-
-    static type_t type_max() { return X86_SIMD_SORT_MAX_INT32; }
-    static type_t type_min() { return X86_SIMD_SORT_MIN_INT32; }
-    static zmm_t zmm_max() {
-        return _mm256_set1_epi32(type_max());
-    }  // TODO: this should broadcast bits as is?
-
-    static zmmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7,
-                       int v8) {
-        return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8);
-    }
-    static opmask_t kxor_opmask(opmask_t x, opmask_t y) {
-        return _kxor_mask8(x, y);
-    }
-    static opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); }
-    static opmask_t le(zmm_t x, zmm_t y) {
-        return _mm256_cmp_epi32_mask(x, y, _MM_CMPINT_LE);
-    }
-    static opmask_t ge(zmm_t x, zmm_t y) {
-        return _mm256_cmp_epi32_mask(x, y, _MM_CMPINT_NLT);
-    }
-    static opmask_t eq(zmm_t x, zmm_t y) {
-        return _mm256_cmp_epi32_mask(x, y, _MM_CMPINT_EQ);
-    }
-    template <int scale>
-    static zmm_t mask_i64gather(zmm_t src, opmask_t mask, __m512i index,
-                                void const *base) {
-        return _mm512_mask_i64gather_epi32(src, mask, index, base, scale);
-    }
-    template <int scale>
-    static zmm_t i64gather(__m512i index, void const *base) {
-        return _mm512_i64gather_epi32(index, base, scale);
-    }
-    static zmm_t loadu(void const *mem) {
-        return _mm256_loadu_si256((__m256i *)mem);
-    }
-    static zmm_t max(zmm_t x, zmm_t y) { return _mm256_max_epi32(x, y); }
-    static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x) {
-        return _mm256_mask_compressstoreu_epi32(mem, mask, x);
-    }
-    static zmm_t maskz_loadu(opmask_t mask, void const *mem) {
-        return _mm256_maskz_loadu_epi32(mask, mem);
-    }
-    static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem) {
-        return _mm256_mask_loadu_epi32(x, mask, mem);
-    }
-    static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y) {
-        return _mm256_mask_mov_epi32(x, mask, y);
-    }
-    static void mask_storeu(void *mem, opmask_t mask, zmm_t x) {
-        return _mm256_mask_storeu_epi32(mem, mask, x);
-    }
-    static zmm_t min(zmm_t x, zmm_t y) { return _mm256_min_epi32(x, y); }
-    static zmm_t permutexvar(__m256i idx, zmm_t zmm) {
-        return _mm256_permutexvar_epi32(idx, zmm);
-    }
-    static type_t reducemax(zmm_t v) {
-        __m128i v128 = _mm_max_epi32(_mm256_castsi256_si128(v),
-                                     _mm256_extracti128_si256(v, 1));
-        __m128i v64 = _mm_max_epi32(
-            v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(1, 0, 3, 2)));
-        __m128i v32 =
-            _mm_max_epi32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1)));
-        return (type_t)_mm_cvtsi128_si32(v32);
-    }
-    static type_t reducemin(zmm_t v) {
-        __m128i v128 = _mm_min_epi32(_mm256_castsi256_si128(v),
-                                     _mm256_extracti128_si256(v, 1));
-        __m128i v64 = _mm_min_epi32(
-            v128, _mm_shuffle_epi32(v128, _MM_SHUFFLE(1, 0, 3, 2)));
-        __m128i v32 =
-            _mm_min_epi32(v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1)));
-        return (type_t)_mm_cvtsi128_si32(v32);
-    }
-    static zmm_t set1(type_t v) { return _mm256_set1_epi32(v); }
-    template <uint8_t mask, bool = (mask == 0b01010101)>
-    static zmm_t shuffle(zmm_t zmm) {
-        /* Hack!: have to make shuffles within 128-bit lanes work for both
-         * 32-bit and 64-bit */
-        return _mm256_shuffle_epi32(zmm, 0b10110001);
-    }
-    static void storeu(void *mem, zmm_t x) {
-        _mm256_storeu_si256((__m256i *)mem, x);
-    }
-};
 template <>
 struct zmm_vector<int64_t> {
     using type_t = int64_t;

From 13f4aaf45bd3b504f4cf59ab75fa0db0f9dd1c93 Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Fri, 4 Aug 2023 16:13:45 -0700
Subject: [PATCH 16/40] Change name from libavx512_x86_64 to libx86_64

---
 make/modules/java.base/Lib.gmk                 |  6 +++---
 src/hotspot/cpu/x86/stubGenerator_x86_64.cpp   | 18 +++++++++---------
 .../gc/shenandoah/c2/shenandoahSupport.cpp     |  4 ++--
 .../avx512-32bit-qsort.hpp                     |  0
 .../avx512-64bit-common.h                      |  0
 .../avx512-64bit-qsort.hpp                     |  0
 .../avx512-common-qsort.h                      |  0
 .../avxsort_linux_x86.cpp                      |  0
 8 files changed, 14 insertions(+), 14 deletions(-)
 rename src/java.base/linux/native/{libavx512_x86_64 => libx86_64}/avx512-32bit-qsort.hpp (100%)
 rename src/java.base/linux/native/{libavx512_x86_64 => libx86_64}/avx512-64bit-common.h (100%)
 rename src/java.base/linux/native/{libavx512_x86_64 => libx86_64}/avx512-64bit-qsort.hpp (100%)
 rename src/java.base/linux/native/{libavx512_x86_64 => libx86_64}/avx512-common-qsort.h (100%)
 rename src/java.base/linux/native/{libavx512_x86_64 => libx86_64}/avxsort_linux_x86.cpp (100%)

diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk
index 1e86c2541e59a..85a86372dbf1f 100644
--- a/make/modules/java.base/Lib.gmk
+++ b/make/modules/java.base/Lib.gmk
@@ -237,8 +237,8 @@ endif
 ################################################################################
 
 ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2), true+true+true)
-  $(eval $(call SetupJdkLibrary, BUILD_LIBAVX512_X86_64, \
-      NAME := avx512_x86_64, \
+  $(eval $(call SetupJdkLibrary, BUILD_LIB_X86_64, \
+      NAME := x86_64, \
       OPTIMIZATION := HIGH, \
       CFLAGS := $(CFLAGS_JDKLIB), \
       CXXFLAGS := $(CXXFLAGS_JDKLIB), \
@@ -249,7 +249,7 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2)
       LIBS_linux := -lc -lm -ldl, \
   ))
 
-  TARGETS += $(BUILD_LIBAVX512_X86_64)
+  TARGETS += $(BUILD_LIB_X86_64)
 endif
 
 ################################################################################
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
index 0b9f91ecc97bd..2a7170224c6b5 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -4131,28 +4131,28 @@ void StubGenerator::generate_compiler_stubs() {
   }
 
   // Get avx512 sort stub routine addresses
-  void *libavx512_x86_64 = nullptr;
+  void *lib_x86_64 = nullptr;
   char ebuf_avx512[1024];
   char dll_name_avx512[JVM_MAXPATHLEN];
-  if (os::dll_locate_lib(dll_name_avx512, sizeof(dll_name_avx512), Arguments::get_dll_dir(), "avx512_x86_64")) {
-    libavx512_x86_64 = os::dll_load(dll_name_avx512, ebuf_avx512, sizeof ebuf_avx512);
+  if (os::dll_locate_lib(dll_name_avx512, sizeof(dll_name_avx512), Arguments::get_dll_dir(), "x86_64")) {
+    lib_x86_64 = os::dll_load(dll_name_avx512, ebuf_avx512, sizeof ebuf_avx512);
   }
-  if (libavx512_x86_64 != nullptr) {
-    log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "avx512_x86_64" JNI_LIB_SUFFIX, p2i(libavx512_x86_64));
+  if (lib_x86_64 != nullptr) {
+    log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(lib_x86_64));
 
     if (UseAVX > 2 && VM_Version::supports_avx512dq()) {
 
       snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_int");
-      StubRoutines::_arraysort_int = (address)os::dll_lookup(libavx512_x86_64, ebuf_avx512);
+      StubRoutines::_arraysort_int = (address)os::dll_lookup(lib_x86_64, ebuf_avx512);
 
       snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_long");
-      StubRoutines::_arraysort_long = (address)os::dll_lookup(libavx512_x86_64, ebuf_avx512);
+      StubRoutines::_arraysort_long = (address)os::dll_lookup(lib_x86_64, ebuf_avx512);
 
       snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_float");
-      StubRoutines::_arraysort_float = (address)os::dll_lookup(libavx512_x86_64, ebuf_avx512);
+      StubRoutines::_arraysort_float = (address)os::dll_lookup(lib_x86_64, ebuf_avx512);
 
       snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_double");
-      StubRoutines::_arraysort_double = (address)os::dll_lookup(libavx512_x86_64, ebuf_avx512);
+      StubRoutines::_arraysort_double = (address)os::dll_lookup(lib_x86_64, ebuf_avx512);
     }
   }
 
diff --git a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp
index 10b2fe13ab529..27e854028d7f8 100644
--- a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp
+++ b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp
@@ -388,8 +388,8 @@ void ShenandoahBarrierC2Support::verify(RootNode* root) {
         } args[6];
       } calls[] = {
         "arraysort_stub",
-        { { TypeFunc::Parms, ShenandoahLoad },   { TypeFunc::Parms+1, ShenandoahStore },  { TypeFunc::Parms+2, ShenandoahLoad },
-          { TypeFunc::Parms+4, ShenandoahLoad }, { TypeFunc::Parms+5, ShenandoahLoad } },
+        { { TypeFunc::Parms, ShenandoahLoad },   { TypeFunc::Parms+1, ShenandoahStore },  { -1, ShenandoahNone },
+          { -1, ShenandoahNone }, { -1, ShenandoahNone } },
         "aescrypt_encryptBlock",
         { { TypeFunc::Parms, ShenandoahLoad },   { TypeFunc::Parms+1, ShenandoahStore },  { TypeFunc::Parms+2, ShenandoahLoad },
           { -1,  ShenandoahNone},                 { -1,  ShenandoahNone},                 { -1,  ShenandoahNone} },
diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp
similarity index 100%
rename from src/java.base/linux/native/libavx512_x86_64/avx512-32bit-qsort.hpp
rename to src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp
diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h b/src/java.base/linux/native/libx86_64/avx512-64bit-common.h
similarity index 100%
rename from src/java.base/linux/native/libavx512_x86_64/avx512-64bit-common.h
rename to src/java.base/linux/native/libx86_64/avx512-64bit-common.h
diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp
similarity index 100%
rename from src/java.base/linux/native/libavx512_x86_64/avx512-64bit-qsort.hpp
rename to src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp
diff --git a/src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h b/src/java.base/linux/native/libx86_64/avx512-common-qsort.h
similarity index 100%
rename from src/java.base/linux/native/libavx512_x86_64/avx512-common-qsort.h
rename to src/java.base/linux/native/libx86_64/avx512-common-qsort.h
diff --git a/src/java.base/linux/native/libavx512_x86_64/avxsort_linux_x86.cpp b/src/java.base/linux/native/libx86_64/avxsort_linux_x86.cpp
similarity index 100%
rename from src/java.base/linux/native/libavx512_x86_64/avxsort_linux_x86.cpp
rename to src/java.base/linux/native/libx86_64/avxsort_linux_x86.cpp

From c49657ee2809205a08489dba1f2421b310d9936b Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Mon, 7 Aug 2023 13:56:09 -0700
Subject: [PATCH 17/40] change names from avx512 to x86_64

---
 src/hotspot/cpu/x86/stubGenerator_x86_64.cpp | 26 ++++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
index 2a7170224c6b5..c39d237a8f0af 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -4131,28 +4131,28 @@ void StubGenerator::generate_compiler_stubs() {
   }
 
   // Get avx512 sort stub routine addresses
-  void *lib_x86_64 = nullptr;
-  char ebuf_avx512[1024];
+  void *libx86_64 = nullptr;
+  char ebuf_x86_64[1024];
   char dll_name_avx512[JVM_MAXPATHLEN];
   if (os::dll_locate_lib(dll_name_avx512, sizeof(dll_name_avx512), Arguments::get_dll_dir(), "x86_64")) {
-    lib_x86_64 = os::dll_load(dll_name_avx512, ebuf_avx512, sizeof ebuf_avx512);
+    libx86_64 = os::dll_load(dll_name_avx512, ebuf_x86_64, sizeof ebuf_x86_64);
   }
-  if (lib_x86_64 != nullptr) {
-    log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(lib_x86_64));
+  if (libx86_64 != nullptr) {
+    log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(libx86_64));
 
     if (UseAVX > 2 && VM_Version::supports_avx512dq()) {
 
-      snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_int");
-      StubRoutines::_arraysort_int = (address)os::dll_lookup(lib_x86_64, ebuf_avx512);
+      snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_int");
+      StubRoutines::_arraysort_int = (address)os::dll_lookup(libx86_64, ebuf_x86_64);
 
-      snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_long");
-      StubRoutines::_arraysort_long = (address)os::dll_lookup(lib_x86_64, ebuf_avx512);
+      snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_long");
+      StubRoutines::_arraysort_long = (address)os::dll_lookup(libx86_64, ebuf_x86_64);
 
-      snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_float");
-      StubRoutines::_arraysort_float = (address)os::dll_lookup(lib_x86_64, ebuf_avx512);
+      snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_float");
+      StubRoutines::_arraysort_float = (address)os::dll_lookup(libx86_64, ebuf_x86_64);
 
-      snprintf(ebuf_avx512, sizeof(ebuf_avx512), "avx512_sort_double");
-      StubRoutines::_arraysort_double = (address)os::dll_lookup(lib_x86_64, ebuf_avx512);
+      snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_double");
+      StubRoutines::_arraysort_double = (address)os::dll_lookup(libx86_64, ebuf_x86_64);
     }
   }
 

From 58467994ce7a136ecddfa1ea296234ecc0385753 Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Fri, 11 Aug 2023 15:03:22 -0700
Subject: [PATCH 18/40] Fix signature for Shenandoah support

---
 src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp
index 27e854028d7f8..98c27b337ac8e 100644
--- a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp
+++ b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp
@@ -388,8 +388,7 @@ void ShenandoahBarrierC2Support::verify(RootNode* root) {
         } args[6];
       } calls[] = {
         "arraysort_stub",
-        { { TypeFunc::Parms, ShenandoahLoad },   { TypeFunc::Parms+1, ShenandoahStore },  { -1, ShenandoahNone },
-          { -1, ShenandoahNone }, { -1, ShenandoahNone } },
+        { { TypeFunc::Parms, ShenandoahStore }, { -1, ShenandoahNone }, { -1, ShenandoahNone } },
         "aescrypt_encryptBlock",
         { { TypeFunc::Parms, ShenandoahLoad },   { TypeFunc::Parms+1, ShenandoahStore },  { TypeFunc::Parms+2, ShenandoahLoad },
           { -1,  ShenandoahNone},                 { -1,  ShenandoahNone},                 { -1,  ShenandoahNone} },

From 07349ec35d5c56777131e364bddb927117b9f46c Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Tue, 15 Aug 2023 12:09:45 -0700
Subject: [PATCH 19/40] Fix preservation of NaNs for floats and doubles

---
 .../native/libx86_64/avx512-32bit-qsort.hpp   | 34 ++-----------------
 .../native/libx86_64/avx512-64bit-common.h    | 24 -------------
 .../native/libx86_64/avx512-64bit-qsort.hpp   |  6 ++--
 .../openjdk/bench/java/util/ArraysSort.java   |  2 +-
 4 files changed, 7 insertions(+), 59 deletions(-)

diff --git a/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp
index 663a885305c2c..c07caf991ea8e 100644
--- a/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp
+++ b/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp
@@ -438,34 +438,6 @@ static void qsort_32bit_(type_t *arr, int64_t left, int64_t right,
         qsort_32bit_<vtype>(arr, pivot_index, right, max_iters - 1);
 }
 
-
-
-X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize) {
-    int64_t nan_count = 0;
-    __mmask16 loadmask = 0xFFFF;
-    while (arrsize > 0) {
-        if (arrsize < 16) {
-            loadmask = (0x0001 << arrsize) - 0x0001;
-        }
-        __m512 in_zmm = _mm512_maskz_loadu_ps(loadmask, arr);
-        __mmask16 nanmask = _mm512_cmp_ps_mask(in_zmm, in_zmm, _CMP_NEQ_UQ);
-        nan_count += _mm_popcnt_u32((int32_t)nanmask);
-        _mm512_mask_storeu_ps(arr, nanmask, ZMM_MAX_FLOAT);
-        arr += 16;
-        arrsize -= 16;
-    }
-    return nan_count;
-}
-
-X86_SIMD_SORT_INLINE void replace_inf_with_nan(float *arr, int64_t arrsize,
-                                               int64_t nan_count) {
-    for (int64_t ii = arrsize - 1; nan_count > 0; --ii) {
-        arr[ii] = std::nanf("1");
-        nan_count -= 1;
-    }
-}
-
-
 template <>
 void avx512_qsort<int32_t>(int32_t *arr, int64_t arrsize) {
     if (arrsize > 1) {
@@ -476,11 +448,11 @@ void avx512_qsort<int32_t>(int32_t *arr, int64_t arrsize) {
 
 template <>
 void avx512_qsort<float>(float *arr, int64_t arrsize) {
+    int64_t idx_last_elem_not_nan = move_nans_to_end_of_array(arr, arrsize);
+    arrsize = idx_last_elem_not_nan + 1;
     if (arrsize > 1) {
-        int64_t nan_count = replace_nan_with_inf(arr, arrsize);
-        qsort_32bit_<zmm_vector<float>, float>(arr, 0, arrsize - 1,
+        qsort_32bit_<zmm_vector<float>, float>(arr, 0, idx_last_elem_not_nan,
                                                2 * (int64_t)log2(arrsize));
-        replace_inf_with_nan(arr, arrsize, nan_count);
     }
 }
 
diff --git a/src/java.base/linux/native/libx86_64/avx512-64bit-common.h b/src/java.base/linux/native/libx86_64/avx512-64bit-common.h
index 527b4351eb7e7..2d07cf1984859 100644
--- a/src/java.base/linux/native/libx86_64/avx512-64bit-common.h
+++ b/src/java.base/linux/native/libx86_64/avx512-64bit-common.h
@@ -182,31 +182,7 @@ struct zmm_vector<double> {
     }
     static void storeu(void *mem, zmm_t x) { _mm512_storeu_pd(mem, x); }
 };
-X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(double *arr,
-                                                  int64_t arrsize) {
-    int64_t nan_count = 0;
-    __mmask8 loadmask = 0xFF;
-    while (arrsize > 0) {
-        if (arrsize < 8) {
-            loadmask = (0x01 << arrsize) - 0x01;
-        }
-        __m512d in_zmm = _mm512_maskz_loadu_pd(loadmask, arr);
-        __mmask8 nanmask = _mm512_cmp_pd_mask(in_zmm, in_zmm, _CMP_NEQ_UQ);
-        nan_count += _mm_popcnt_u32((int32_t)nanmask);
-        _mm512_mask_storeu_pd(arr, nanmask, ZMM_MAX_DOUBLE);
-        arr += 8;
-        arrsize -= 8;
-    }
-    return nan_count;
-}
 
-X86_SIMD_SORT_INLINE void replace_inf_with_nan(double *arr, int64_t arrsize,
-                                               int64_t nan_count) {
-    for (int64_t ii = arrsize - 1; nan_count > 0; --ii) {
-        arr[ii] = std::nan("1");
-        nan_count -= 1;
-    }
-}
 /*
  * Assumes zmm is random and performs a full sorting network defined in
  * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
diff --git a/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp
index 45497e268a3c5..da07dc51427a4 100644
--- a/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp
+++ b/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp
@@ -763,11 +763,11 @@ void avx512_qsort<int64_t>(int64_t *arr, int64_t arrsize) {
 
 template <>
 void avx512_qsort<double>(double *arr, int64_t arrsize) {
+    int64_t idx_last_elem_not_nan = move_nans_to_end_of_array(arr, arrsize);
+    arrsize = idx_last_elem_not_nan + 1;
     if (arrsize > 1) {
-        int64_t nan_count = replace_nan_with_inf(arr, arrsize);
-        qsort_64bit_<zmm_vector<double>, double>(arr, 0, arrsize - 1,
+        qsort_64bit_<zmm_vector<double>, double>(arr, 0, idx_last_elem_not_nan,
                                                  2 * (int64_t)log2(arrsize));
-        replace_inf_with_nan(arr, arrsize, nan_count);
     }
 }
 #endif  // AVX512_QSORT_64BIT
diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
index d5c7953d51492..fb5b2f874ee88 100644
--- a/test/micro/org/openjdk/bench/java/util/ArraysSort.java
+++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
@@ -145,7 +145,7 @@ public void setup() throws UnsupportedEncodingException, ClassNotFoundException,
         }
     }
 
-    @Warmup(iterations = 3, time=20)
+    @Warmup(iterations = 3, time=40)
     @Measurement(iterations = 3, time=30)
     public static class Large extends ArraysSort {
         @Param({"50000", "100000"})

From 9153059a4df51a3661aaa719201e1044a80c30fc Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Tue, 22 Aug 2023 16:02:22 -0700
Subject: [PATCH 20/40] Decomposed DPQS using AVX512 partitioning and AVX512
 sort (for small arrays). Works for serial and parallel sort.

---
 src/hotspot/cpu/x86/stubGenerator_x86_64.cpp  |   14 +-
 src/hotspot/share/classfile/vmIntrinsics.hpp  |    7 +-
 .../gc/shenandoah/c2/shenandoahSupport.cpp    |    3 +
 src/hotspot/share/jvmci/vmStructs_jvmci.cpp   |    4 +
 src/hotspot/share/opto/c2compiler.cpp         |    1 +
 src/hotspot/share/opto/escape.cpp             |    1 +
 src/hotspot/share/opto/library_call.cpp       |   44 +-
 src/hotspot/share/opto/library_call.hpp       |    1 +
 src/hotspot/share/opto/runtime.cpp            |   21 +
 src/hotspot/share/opto/runtime.hpp            |    1 +
 src/hotspot/share/runtime/stubRoutines.cpp    |   24 +-
 src/hotspot/share/runtime/stubRoutines.hpp    |    5 +
 .../native/libx86_64/avx512-32bit-qsort.hpp   |   20 +-
 .../native/libx86_64/avx512-64bit-common.h    |    6 +
 .../native/libx86_64/avx512-64bit-qsort.hpp   |   15 +-
 .../native/libx86_64/avx512-common-qsort.h    |  135 +-
 .../native/libx86_64/avxsort_linux_x86.cpp    |   36 +-
 .../share/classes/java/util/Arrays.java       |   65 +-
 .../classes/java/util/DualPivotQuicksort.java | 1308 ++++++++++-------
 .../openjdk/bench/java/util/ArraysSort.java   |   73 +-
 20 files changed, 1150 insertions(+), 634 deletions(-)

diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
index c39d237a8f0af..11936ac764126 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -4130,7 +4130,7 @@ void StubGenerator::generate_compiler_stubs() {
       = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
   }
 
-  // Get avx512 sort stub routine addresses
+  // Get addresses for avx512 sort and partition routines
   void *libx86_64 = nullptr;
   char ebuf_x86_64[1024];
   char dll_name_avx512[JVM_MAXPATHLEN];
@@ -4153,6 +4153,18 @@ void StubGenerator::generate_compiler_stubs() {
 
       snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_double");
       StubRoutines::_arraysort_double = (address)os::dll_lookup(libx86_64, ebuf_x86_64);
+
+      snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_int");
+      StubRoutines::_array_partition_int = (address)os::dll_lookup(libx86_64, ebuf_x86_64);
+
+      snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_long");
+      StubRoutines::_array_partition_long = (address)os::dll_lookup(libx86_64, ebuf_x86_64);
+
+      snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_float");
+      StubRoutines::_array_partition_float = (address)os::dll_lookup(libx86_64, ebuf_x86_64);
+
+      snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_double");
+      StubRoutines::_array_partition_double = (address)os::dll_lookup(libx86_64, ebuf_x86_64);
     }
   }
 
diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp
index aa9f9660bbba4..9fce2446aea19 100644
--- a/src/hotspot/share/classfile/vmIntrinsics.hpp
+++ b/src/hotspot/share/classfile/vmIntrinsics.hpp
@@ -343,7 +343,12 @@ class methodHandle;
                                                                                                                         \
   do_intrinsic(_arraySort,                java_util_Arrays,       arraySort_name, arraySort_signature,           F_S)   \
    do_name(     arraySort_name,                                  "arraySort")                                           \
-   do_signature(arraySort_signature,                             "(Ljava/lang/Class;Ljava/lang/Object;JII)V")           \
+   do_signature(arraySort_signature,                             "(Ljava/lang/Class;Ljava/lang/Object;JIII)V")          \
+                                                                                                                        \
+  do_intrinsic(_arrayPartition, java_util_Arrays, arrayPartition_name, arrayPartition_signature, F_S)                   \
+   do_name(arrayPartition_name, "arrayPartition")                                                                       \
+  do_signature(arrayPartition_signature, "(Ljava/lang/Class;Ljava/lang/Object;JII[IJZ)V")                               \
+                                                                                                                        \
                                                                                                                         \
   do_intrinsic(_copyOfRange,              java_util_Arrays,       copyOfRange_name, copyOfRange_signature,       F_S)   \
    do_name(     copyOfRange_name,                                "copyOfRange")                                         \
diff --git a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp
index 98c27b337ac8e..9a98ec9cd529d 100644
--- a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp
+++ b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp
@@ -387,6 +387,9 @@ void ShenandoahBarrierC2Support::verify(RootNode* root) {
           verify_type t;
         } args[6];
       } calls[] = {
+        "array_partition_stub",
+        { { TypeFunc::Parms, ShenandoahStore }, { TypeFunc::Parms+3, ShenandoahStore }, { -1, ShenandoahNone },
+          { -1, ShenandoahNone }, { -1, ShenandoahNone } },
         "arraysort_stub",
         { { TypeFunc::Parms, ShenandoahStore }, { -1, ShenandoahNone }, { -1, ShenandoahNone } },
         "aescrypt_encryptBlock",
diff --git a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp
index 86753a501d2e7..d28ff16b13f17 100644
--- a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp
+++ b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp
@@ -331,6 +331,10 @@
   static_field(StubRoutines,                _arraysort_long,                                  address)                               \
   static_field(StubRoutines,                _arraysort_float,                                 address)                               \
   static_field(StubRoutines,                _arraysort_double,                                address)                               \
+  static_field(StubRoutines,                _array_partition_int,                             address)                               \
+  static_field(StubRoutines,                _array_partition_long,                            address)                               \
+  static_field(StubRoutines,                _array_partition_float,                           address)                               \
+  static_field(StubRoutines,                _array_partition_double,                          address)                               \
                                                                                                                                      \
   static_field(StubRoutines,                _aescrypt_encryptBlock,                           address)                               \
   static_field(StubRoutines,                _aescrypt_decryptBlock,                           address)                               \
diff --git a/src/hotspot/share/opto/c2compiler.cpp b/src/hotspot/share/opto/c2compiler.cpp
index ac2c27cb248dc..7f4e8ee769625 100644
--- a/src/hotspot/share/opto/c2compiler.cpp
+++ b/src/hotspot/share/opto/c2compiler.cpp
@@ -569,6 +569,7 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method) {
   case vmIntrinsics::_max_strict:
   case vmIntrinsics::_arraycopy:
   case vmIntrinsics::_arraySort:
+  case vmIntrinsics::_arrayPartition:
   case vmIntrinsics::_indexOfL:
   case vmIntrinsics::_indexOfU:
   case vmIntrinsics::_indexOfUL:
diff --git a/src/hotspot/share/opto/escape.cpp b/src/hotspot/share/opto/escape.cpp
index eed159ff6c8e7..6c165b5ee813e 100644
--- a/src/hotspot/share/opto/escape.cpp
+++ b/src/hotspot/share/opto/escape.cpp
@@ -1191,6 +1191,7 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
                   strcmp(call->as_CallLeaf()->_name, "bigIntegerLeftShiftWorker") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "vectorizedMismatch") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "arraysort_stub") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "array_partition_stub") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "get_class_id_intrinsic") == 0)
                  ))) {
             call->dump();
diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp
index 132835ba4aff8..10a8734bc1a8a 100644
--- a/src/hotspot/share/opto/library_call.cpp
+++ b/src/hotspot/share/opto/library_call.cpp
@@ -292,7 +292,8 @@ bool LibraryCallKit::try_to_inline(int predicate) {
 
   case vmIntrinsics::_arraycopy:                return inline_arraycopy();
 
-  case vmIntrinsics::_arraySort:               return inline_arraysort();
+  case vmIntrinsics::_arraySort:                return inline_arraysort();
+  case vmIntrinsics::_arrayPartition:           return inline_array_partition();
 
   case vmIntrinsics::_compareToL:               return inline_string_compareTo(StrIntrinsicNode::LL);
   case vmIntrinsics::_compareToU:               return inline_string_compareTo(StrIntrinsicNode::UU);
@@ -5194,6 +5195,47 @@ void LibraryCallKit::create_new_uncommon_trap(CallStaticJavaNode* uncommon_trap_
   uncommon_trap_call->set_req(0, top()); // not used anymore, kill it
 }
 
+//------------------------------inline_array_partition-----------------------
+// Intrinsic for Arrays.arrayPartition(): emits a leaf call to the partition
+// stub for the array's element type. Returns false (call is not intrinsified)
+// on any failed check; all such checks run before nodes are added to the graph.
+bool LibraryCallKit::inline_array_partition() {
+  const char* stubName = "array_partition_stub";
+
+  Node* elementType     = argument(0);
+  Node* obj             = argument(1);
+  Node* offset          = argument(2); // long, occupies slots 2 and 3
+  Node* fromIndex       = argument(4);
+  Node* toIndex         = argument(5);
+  Node* pivot_indices   = argument(6);
+  Node* pivot_offset    = argument(7); // long, occupies slots 7 and 8
+  Node* isDualPivot     = argument(9);
+
+  // The stub is picked at compile time: the class mirror must be a (non-null) constant.
+  const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr();
+  if (elem_klass == nullptr || elem_klass->const_oop() == nullptr) {
+    return false; // element type is not a compile-time constant
+  }
+  ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type();
+  address stubAddr = StubRoutines::select_array_partition_function(elem_type->basic_type());
+  if (stubAddr == nullptr) return false; // no partition stub for this element type
+
+  const TypeAryPtr* obj_t = _gvn.type(obj)->isa_aryptr();
+  if (obj_t == nullptr || obj_t->elem() == Type::BOTTOM) {
+    return false; // failed input validation
+  }
+
+  Node* obj_adr = make_unsafe_address(obj, offset);
+  pivot_indices = must_be_not_null(pivot_indices, true);
+  Node* pivot_indices_adr = make_unsafe_address(pivot_indices, pivot_offset); // offset into pivot_indices, not the array offset
+
+  // Call the stub.
+  make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_partition_Type(),
+                    stubAddr, stubName, TypePtr::BOTTOM,
+                    obj_adr, fromIndex, toIndex, pivot_indices_adr, isDualPivot);
+  return true;
+}
+
 //------------------------------inline_arraysort-----------------------
 bool LibraryCallKit::inline_arraysort() {
 
diff --git a/src/hotspot/share/opto/library_call.hpp b/src/hotspot/share/opto/library_call.hpp
index 53d697f6b2078..4cc9c56afca80 100644
--- a/src/hotspot/share/opto/library_call.hpp
+++ b/src/hotspot/share/opto/library_call.hpp
@@ -280,6 +280,7 @@ class LibraryCallKit : public GraphKit {
   void arraycopy_move_allocation_here(AllocateArrayNode* alloc, Node* dest, JVMState* saved_jvms_before_guards, int saved_reexecute_sp,
                                       uint new_idx);
   bool inline_arraysort();
+  bool inline_array_partition();
   typedef enum { LS_get_add, LS_get_set, LS_cmp_swap, LS_cmp_swap_weak, LS_cmp_exchange } LoadStoreKind;
   bool inline_unsafe_load_store(BasicType type,  LoadStoreKind kind, AccessKind access_kind);
   bool inline_unsafe_fence(vmIntrinsics::ID id);
diff --git a/src/hotspot/share/opto/runtime.cpp b/src/hotspot/share/opto/runtime.cpp
index 3555d0cf31af3..a0e383d95afd0 100644
--- a/src/hotspot/share/opto/runtime.cpp
+++ b/src/hotspot/share/opto/runtime.cpp
@@ -857,6 +857,27 @@ const TypeFunc* OptoRuntime::array_fill_Type() {
   return TypeFunc::make(domain, range);
 }
 
+const TypeFunc* OptoRuntime::array_partition_Type() {
+  // create input type (domain): the five arguments handed to the array partition stub
+  int num_args = 5;
+  int argcnt = num_args;
+  const Type** fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  fields[argp++] = TypePtr::NOTNULL;    // address of the array data to partition
+  fields[argp++] = TypeInt::INT;    // fromIndex (low)
+  fields[argp++] = TypeInt::INT;    // toIndex (end)
+  fields[argp++] = TypePtr::NOTNULL;    // address of pivot_indices (int array)
+  fields[argp++] = TypeInt::BOOL;       // isDualPivot
+  assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
+
+  // no result type needed: the range carries no value (void)
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms+0] = nullptr; // void
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
+  return TypeFunc::make(domain, range);
+}
+
 const TypeFunc* OptoRuntime::array_sort_Type() {
   // create input type (domain)
   int num_args      = 3;
diff --git a/src/hotspot/share/opto/runtime.hpp b/src/hotspot/share/opto/runtime.hpp
index e4d5f749d3efa..b85542423e848 100644
--- a/src/hotspot/share/opto/runtime.hpp
+++ b/src/hotspot/share/opto/runtime.hpp
@@ -269,6 +269,7 @@ class OptoRuntime : public AllStatic {
   static const TypeFunc* array_fill_Type();
 
   static const TypeFunc* array_sort_Type();
+  static const TypeFunc* array_partition_Type();
   static const TypeFunc* aescrypt_block_Type();
   static const TypeFunc* cipherBlockChaining_aescrypt_Type();
   static const TypeFunc* electronicCodeBook_aescrypt_Type();
diff --git a/src/hotspot/share/runtime/stubRoutines.cpp b/src/hotspot/share/runtime/stubRoutines.cpp
index 5a3b6168ab122..84e43c52ec3a2 100644
--- a/src/hotspot/share/runtime/stubRoutines.cpp
+++ b/src/hotspot/share/runtime/stubRoutines.cpp
@@ -176,10 +176,14 @@ address StubRoutines::_hf2f = nullptr;
 address StubRoutines::_vector_f_math[VectorSupport::NUM_VEC_SIZES][VectorSupport::NUM_SVML_OP] = {{nullptr}, {nullptr}};
 address StubRoutines::_vector_d_math[VectorSupport::NUM_VEC_SIZES][VectorSupport::NUM_SVML_OP] = {{nullptr}, {nullptr}};
 
-address StubRoutines::_arraysort_int        = nullptr;
-address StubRoutines::_arraysort_long       = nullptr;
-address StubRoutines::_arraysort_float      = nullptr;
-address StubRoutines::_arraysort_double     = nullptr;
+address StubRoutines::_arraysort_int = nullptr;
+address StubRoutines::_arraysort_long = nullptr;
+address StubRoutines::_arraysort_float = nullptr;
+address StubRoutines::_arraysort_double = nullptr;
+address StubRoutines::_array_partition_int  = nullptr;
+address StubRoutines::_array_partition_long = nullptr;
+address StubRoutines::_array_partition_float = nullptr;
+address StubRoutines::_array_partition_double = nullptr;
 
 address StubRoutines::_cont_thaw          = nullptr;
 address StubRoutines::_cont_returnBarrier = nullptr;
@@ -665,3 +669,15 @@ address StubRoutines::select_arraysort_function(BasicType t) {
     return nullptr;
   }
 }
+
+address StubRoutines::select_array_partition_function(BasicType t) {
+  switch(t) { // choose the partition stub entry for the given primitive element type
+    case T_INT:    return _array_partition_int;
+    case T_LONG:   return _array_partition_long;
+    case T_FLOAT:  return _array_partition_float;
+    case T_DOUBLE: return _array_partition_double;
+  default: // every other element type is treated as unreachable
+    ShouldNotReachHere();
+    return nullptr;
+  }
+}
diff --git a/src/hotspot/share/runtime/stubRoutines.hpp b/src/hotspot/share/runtime/stubRoutines.hpp
index b58287d6b5cd3..a835169a7d60a 100644
--- a/src/hotspot/share/runtime/stubRoutines.hpp
+++ b/src/hotspot/share/runtime/stubRoutines.hpp
@@ -157,6 +157,10 @@ class StubRoutines: AllStatic {
   static address _arraysort_long;
   static address _arraysort_float;
   static address _arraysort_double;
+  static address _array_partition_int;
+  static address _array_partition_long;
+  static address _array_partition_float;
+  static address _array_partition_double;
   // Leaf routines which implement arraycopy and their addresses
   // arraycopy operands aligned on element type boundary
   static address _jbyte_arraycopy;
@@ -378,6 +382,7 @@ class StubRoutines: AllStatic {
 
   static address generic_arraycopy()   { return _generic_arraycopy; }
   static address select_arraysort_function(BasicType t);
+  static address select_array_partition_function(BasicType t);
 
   static address jbyte_fill()          { return _jbyte_fill; }
   static address jshort_fill()         { return _jshort_fill; }
diff --git a/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp
index c07caf991ea8e..bc1258debd389 100644
--- a/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp
+++ b/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp
@@ -61,6 +61,9 @@ struct zmm_vector<int32_t> {
     static opmask_t ge(zmm_t x, zmm_t y) {
         return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_NLT);
     }
+    static opmask_t gt(zmm_t x, zmm_t y) { // per-lane mask of x > y (signed 32-bit compare)
+        return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_GT);
+    }
     template <int scale>
     static ymm_t i64gather(__m512i index, void const *base) {
         return _mm512_i64gather_epi32(index, base, scale);
@@ -117,6 +120,9 @@ struct zmm_vector<float> {
     static opmask_t ge(zmm_t x, zmm_t y) {
         return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ);
     }
+    static opmask_t gt(zmm_t x, zmm_t y) { // per-lane mask of x > y; ordered, quiet compare, so NaN lanes give 0
+        return _mm512_cmp_ps_mask(x, y, _CMP_GT_OQ);
+    }
     template <int scale>
     static ymm_t i64gather(__m512i index, void const *base) {
         return _mm512_i64gather_ps(index, base, scale);
@@ -431,7 +437,7 @@ static void qsort_32bit_(type_t *arr, int64_t left, int64_t right,
     type_t smallest = vtype::type_max();
     type_t biggest = vtype::type_min();
     int64_t pivot_index = partition_avx512_unrolled<vtype, 2>(
-        arr, left, right + 1, pivot, &smallest, &biggest);
+        arr, left, right + 1, pivot, &smallest, &biggest, false);
     if (pivot != smallest)
         qsort_32bit_<vtype>(arr, left, pivot_index - 1, max_iters - 1);
     if (pivot != biggest)
@@ -439,19 +445,19 @@ static void qsort_32bit_(type_t *arr, int64_t left, int64_t right,
 }
 
 template <>
-void avx512_qsort<int32_t>(int32_t *arr, int64_t arrsize) {
+inline void avx512_qsort<int32_t>(int32_t *arr, int64_t fromIndex, int64_t toIndex) {
+    int64_t arrsize = toIndex - fromIndex;
     if (arrsize > 1) {
-        qsort_32bit_<zmm_vector<int32_t>, int32_t>(arr, 0, arrsize - 1,
+        qsort_32bit_<zmm_vector<int32_t>, int32_t>(arr, fromIndex, toIndex - 1,
                                                    2 * (int64_t)log2(arrsize));
     }
 }
 
 template <>
-void avx512_qsort<float>(float *arr, int64_t arrsize) {
-    int64_t idx_last_elem_not_nan = move_nans_to_end_of_array(arr, arrsize);
-    arrsize = idx_last_elem_not_nan + 1;
+inline void avx512_qsort<float>(float *arr, int64_t fromIndex, int64_t toIndex) {
+    int64_t arrsize = toIndex - fromIndex;
     if (arrsize > 1) {
-        qsort_32bit_<zmm_vector<float>, float>(arr, 0, idx_last_elem_not_nan,
+        qsort_32bit_<zmm_vector<float>, float>(arr, fromIndex, toIndex - 1,
                                                2 * (int64_t)log2(arrsize));
     }
 }
diff --git a/src/java.base/linux/native/libx86_64/avx512-64bit-common.h b/src/java.base/linux/native/libx86_64/avx512-64bit-common.h
index 2d07cf1984859..2c3bfd97e1960 100644
--- a/src/java.base/linux/native/libx86_64/avx512-64bit-common.h
+++ b/src/java.base/linux/native/libx86_64/avx512-64bit-common.h
@@ -70,6 +70,9 @@ struct zmm_vector<int64_t> {
     static opmask_t ge(zmm_t x, zmm_t y) {
         return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_NLT);
     }
+    static opmask_t gt(zmm_t x, zmm_t y) { // per-lane mask of x > y (signed 64-bit compare)
+        return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_GT);
+    }
     static opmask_t eq(zmm_t x, zmm_t y) {
         return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_EQ);
     }
@@ -139,6 +142,9 @@ struct zmm_vector<double> {
     static opmask_t ge(zmm_t x, zmm_t y) {
         return _mm512_cmp_pd_mask(x, y, _CMP_GE_OQ);
     }
+    static opmask_t gt(zmm_t x, zmm_t y) { // per-lane mask of x > y; ordered, quiet compare, so NaN lanes give 0
+        return _mm512_cmp_pd_mask(x, y, _CMP_GT_OQ);
+    }
     static opmask_t eq(zmm_t x, zmm_t y) {
         return _mm512_cmp_pd_mask(x, y, _CMP_EQ_OQ);
     }
diff --git a/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp
index da07dc51427a4..61f618f657049 100644
--- a/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp
+++ b/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp
@@ -746,7 +746,7 @@ static void qsort_64bit_(type_t *arr, int64_t left, int64_t right,
     type_t smallest = vtype::type_max();
     type_t biggest = vtype::type_min();
     int64_t pivot_index = partition_avx512_unrolled<vtype, 8>(
-        arr, left, right + 1, pivot, &smallest, &biggest);
+        arr, left, right + 1, pivot, &smallest, &biggest, false);
     if (pivot != smallest)
         qsort_64bit_<vtype>(arr, left, pivot_index - 1, max_iters - 1);
     if (pivot != biggest)
@@ -754,20 +754,21 @@ static void qsort_64bit_(type_t *arr, int64_t left, int64_t right,
 }
 
 template <>
-void avx512_qsort<int64_t>(int64_t *arr, int64_t arrsize) {
+inline void avx512_qsort<int64_t>(int64_t *arr, int64_t fromIndex, int64_t toIndex) {
+    int64_t arrsize = toIndex - fromIndex;
     if (arrsize > 1) {
-        qsort_64bit_<zmm_vector<int64_t>, int64_t>(arr, 0, arrsize - 1,
+        qsort_64bit_<zmm_vector<int64_t>, int64_t>(arr, fromIndex, toIndex - 1,
                                                    2 * (int64_t)log2(arrsize));
     }
 }
 
 template <>
-void avx512_qsort<double>(double *arr, int64_t arrsize) {
-    int64_t idx_last_elem_not_nan = move_nans_to_end_of_array(arr, arrsize);
-    arrsize = idx_last_elem_not_nan + 1;
+inline void avx512_qsort<double>(double *arr, int64_t fromIndex, int64_t toIndex) {
+    int64_t arrsize = toIndex - fromIndex;
     if (arrsize > 1) {
-        qsort_64bit_<zmm_vector<double>, double>(arr, 0, idx_last_elem_not_nan,
+        qsort_64bit_<zmm_vector<double>, double>(arr, fromIndex, toIndex - 1,
                                                  2 * (int64_t)log2(arrsize));
     }
 }
+
 #endif  // AVX512_QSORT_64BIT
diff --git a/src/java.base/linux/native/libx86_64/avx512-common-qsort.h b/src/java.base/linux/native/libx86_64/avx512-common-qsort.h
index f2ad8b039070b..b1a53a054692f 100644
--- a/src/java.base/linux/native/libx86_64/avx512-common-qsort.h
+++ b/src/java.base/linux/native/libx86_64/avx512-common-qsort.h
@@ -26,7 +26,7 @@
  */
 
 // This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)
-
+#include <iostream>
 #ifndef AVX512_QSORT_COMMON
 #define AVX512_QSORT_COMMON
 
@@ -116,9 +116,18 @@ template <typename type>
 struct ymm_vector;
 
 // Regular quicksort routines:
+// Forward declarations of the partition routines; parameter lists match their definitions later in this header.
+template <typename T>
+void avx512_dual_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices);
+template <typename T>
+void avx512_single_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices);
+
 template <typename T>
 void avx512_qsort(T *arr, int64_t arrsize);
 
+template <typename T>
+void inline avx512_qsort(T *arr, int64_t from_index, int64_t to_index);
+
 template <typename T>
 bool is_a_nan(T elem) {
     return std::isnan(elem);
@@ -146,10 +155,15 @@ int64_t move_nans_to_end_of_array(T *arr, int64_t arrsize) {
 }
 
 template <typename vtype, typename T = typename vtype::type_t>
-bool comparison_func(const T &a, const T &b) {
+bool comparison_func_ge(const T &a, const T &b) {
     return a < b;
 }
 
+template <typename vtype, typename T = typename vtype::type_t>
+bool comparison_func_gt(const T &a, const T &b) {
+    return a <= b; // the partition loops negate this: !(a <= b) sends elements strictly greater than b to the right
+}
+
 /*
  * COEX == Compare and Exchange two registers by swapping min and max values
  */
@@ -173,13 +187,16 @@ static inline zmm_t cmp_merge(zmm_t in1, zmm_t in2, opmask_t mask) {
 template <typename vtype, typename type_t, typename zmm_t>
 static inline int32_t partition_vec(type_t *arr, int64_t left, int64_t right,
                                     const zmm_t curr_vec, const zmm_t pivot_vec,
-                                    zmm_t *smallest_vec, zmm_t *biggest_vec) {
+                                    zmm_t *smallest_vec, zmm_t *biggest_vec, bool use_gt) {
     /* which elements are larger than or equal to the pivot */
-    typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec);
-    int32_t amount_ge_pivot = _mm_popcnt_u32((int32_t)ge_mask);
-    vtype::mask_compressstoreu(arr + left, vtype::knot_opmask(ge_mask),
+    typename vtype::opmask_t mask;
+    if (use_gt) mask = vtype::gt(curr_vec, pivot_vec);
+    else mask = vtype::ge(curr_vec, pivot_vec);
+    // amount_ge_pivot below counts the lanes selected by whichever predicate was applied
+    int32_t amount_ge_pivot = _mm_popcnt_u32((int32_t)mask);
+    vtype::mask_compressstoreu(arr + left, vtype::knot_opmask(mask),
                                curr_vec);
-    vtype::mask_compressstoreu(arr + right - amount_ge_pivot, ge_mask,
+    vtype::mask_compressstoreu(arr + right - amount_ge_pivot, mask,
                                curr_vec);
     *smallest_vec = vtype::min(curr_vec, *smallest_vec);
     *biggest_vec = vtype::max(curr_vec, *biggest_vec);
@@ -192,12 +209,13 @@ static inline int32_t partition_vec(type_t *arr, int64_t left, int64_t right,
 template <typename vtype, typename type_t>
 static inline int64_t partition_avx512(type_t *arr, int64_t left, int64_t right,
                                        type_t pivot, type_t *smallest,
-                                       type_t *biggest) {
+                                       type_t *biggest, bool use_gt) {
+    auto comparison_func = use_gt ? comparison_func_gt<vtype> : comparison_func_ge<vtype>;
     /* make array length divisible by vtype::numlanes , shortening the array */
     for (int32_t i = (right - left) % vtype::numlanes; i > 0; --i) {
-        *smallest = std::min(*smallest, arr[left], comparison_func<vtype>);
-        *biggest = std::max(*biggest, arr[left], comparison_func<vtype>);
-        if (!comparison_func<vtype>(arr[left], pivot)) {
+        *smallest = std::min(*smallest, arr[left], comparison_func);
+        *biggest = std::max(*biggest, arr[left], comparison_func);
+        if (!comparison_func(arr[left], pivot)) {
             std::swap(arr[left], arr[--right]);
         } else {
             ++left;
@@ -216,7 +234,7 @@ static inline int64_t partition_avx512(type_t *arr, int64_t left, int64_t right,
         zmm_t vec = vtype::loadu(arr + left);
         int32_t amount_ge_pivot =
             partition_vec<vtype>(arr, left, left + vtype::numlanes, vec,
-                                 pivot_vec, &min_vec, &max_vec);
+                                 pivot_vec, &min_vec, &max_vec, use_gt);
         *smallest = vtype::reducemin(min_vec);
         *biggest = vtype::reducemax(max_vec);
         return left + (vtype::numlanes - amount_ge_pivot);
@@ -248,7 +266,7 @@ static inline int64_t partition_avx512(type_t *arr, int64_t left, int64_t right,
         // partition the current vector and save it on both sides of the array
         int32_t amount_ge_pivot =
             partition_vec<vtype>(arr, l_store, r_store + vtype::numlanes,
-                                 curr_vec, pivot_vec, &min_vec, &max_vec);
+                                 curr_vec, pivot_vec, &min_vec, &max_vec, use_gt);
         ;
         r_store -= amount_ge_pivot;
         l_store += (vtype::numlanes - amount_ge_pivot);
@@ -257,11 +275,11 @@ static inline int64_t partition_avx512(type_t *arr, int64_t left, int64_t right,
     /* partition and save vec_left and vec_right */
     int32_t amount_ge_pivot =
         partition_vec<vtype>(arr, l_store, r_store + vtype::numlanes, vec_left,
-                             pivot_vec, &min_vec, &max_vec);
+                             pivot_vec, &min_vec, &max_vec, use_gt);
     l_store += (vtype::numlanes - amount_ge_pivot);
     amount_ge_pivot =
         partition_vec<vtype>(arr, l_store, l_store + vtype::numlanes, vec_right,
-                             pivot_vec, &min_vec, &max_vec);
+                             pivot_vec, &min_vec, &max_vec, use_gt);
     l_store += (vtype::numlanes - amount_ge_pivot);
     *smallest = vtype::reducemin(min_vec);
     *biggest = vtype::reducemax(max_vec);
@@ -273,18 +291,20 @@ template <typename vtype, int num_unroll,
 static inline int64_t partition_avx512_unrolled(type_t *arr, int64_t left,
                                                 int64_t right, type_t pivot,
                                                 type_t *smallest,
-                                                type_t *biggest) {
+                                                type_t *biggest, bool use_gt) {
     if (right - left <= 2 * num_unroll * vtype::numlanes) {
         return partition_avx512<vtype>(arr, left, right, pivot, smallest,
-                                       biggest);
+                                       biggest, use_gt);
     }
+
+    auto comparison_func = use_gt ? comparison_func_gt<vtype> : comparison_func_ge<vtype>;
     /* make array length divisible by 8*vtype::numlanes , shortening the array
      */
     for (int32_t i = ((right - left) % (num_unroll * vtype::numlanes)); i > 0;
          --i) {
-        *smallest = std::min(*smallest, arr[left], comparison_func<vtype>);
-        *biggest = std::max(*biggest, arr[left], comparison_func<vtype>);
-        if (!comparison_func<vtype>(arr[left], pivot)) {
+        *smallest = std::min(*smallest, arr[left], comparison_func);
+        *biggest = std::max(*biggest, arr[left], comparison_func);
+        if (!comparison_func(arr[left], pivot)) {
             std::swap(arr[left], arr[--right]);
         } else {
             ++left;
@@ -339,7 +359,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr, int64_t left,
         for (int ii = 0; ii < num_unroll; ++ii) {
             int32_t amount_ge_pivot = partition_vec<vtype>(
                 arr, l_store, r_store + vtype::numlanes, curr_vec[ii],
-                pivot_vec, &min_vec, &max_vec);
+                pivot_vec, &min_vec, &max_vec, use_gt);
             l_store += (vtype::numlanes - amount_ge_pivot);
             r_store -= amount_ge_pivot;
         }
@@ -350,7 +370,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr, int64_t left,
     for (int ii = 0; ii < num_unroll; ++ii) {
         int32_t amount_ge_pivot =
             partition_vec<vtype>(arr, l_store, r_store + vtype::numlanes,
-                                 vec_left[ii], pivot_vec, &min_vec, &max_vec);
+                                 vec_left[ii], pivot_vec, &min_vec, &max_vec, use_gt);
         l_store += (vtype::numlanes - amount_ge_pivot);
         r_store -= amount_ge_pivot;
     }
@@ -358,7 +378,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr, int64_t left,
     for (int ii = 0; ii < num_unroll; ++ii) {
         int32_t amount_ge_pivot =
             partition_vec<vtype>(arr, l_store, r_store + vtype::numlanes,
-                                 vec_right[ii], pivot_vec, &min_vec, &max_vec);
+                                 vec_right[ii], pivot_vec, &min_vec, &max_vec, use_gt);
         l_store += (vtype::numlanes - amount_ge_pivot);
         r_store -= amount_ge_pivot;
     }
@@ -367,4 +387,73 @@ static inline int64_t partition_avx512_unrolled(type_t *arr, int64_t left,
     return l_store;
 }
 
+// Partition arr[left, right) around pivot (right = to_index, exclusive); returns the boundary index.
+template <typename vtype, typename type_t>
+static int64_t vectorized_partition(type_t *arr, int64_t left, int64_t right, type_t pivot, bool use_gt) {
+    // The partition routine needs min/max trackers; their final values are not used here.
+    type_t min_seen = vtype::type_max();
+    type_t max_seen = vtype::type_min();
+    return partition_avx512_unrolled<vtype, 2>(arr, left, right, pivot,
+                                               &min_seen, &max_seen, use_gt);
+}
+
+// Dual-pivot partition of arr[from_index, to_index). On entry pivot_indices[0..1] hold the two pivot
+// positions; on return they hold the final positions of pivot1 (lower) and pivot2 (upper).
+template <typename T>
+void avx512_dual_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices){
+    const int64_t pidx1 = pivot_indices[0];
+    const int64_t pidx2 = pivot_indices[1];
+    const T pivot1 = arr[pidx1];
+    const T pivot2 = arr[pidx2];
+
+    const int64_t low = from_index;
+    const int64_t start = low + 1;
+    const int64_t end = to_index - 1;
+
+    // Park pivot1 in the first slot and pivot2 in the last slot, outside the range scanned below.
+    std::swap(arr[pidx1], arr[low]);
+    // If pivot2 was sitting at `low`, the swap above relocated it to pidx1; follow it there.
+    std::swap(arr[pidx2 == low ? pidx1 : pidx2], arr[end]);
+
+    // Pass 1 (strictly-greater test): elements > pivot2 go to [upper, end); pivot2 lands at upper.
+    const int64_t upper = vectorized_partition<zmm_vector<T>, T>(arr, start, end, pivot2, true);
+    std::swap(arr[end], arr[upper]);
+
+    // Pass 2 (greater-or-equal test) on [start, upper): elements < pivot1 end up in front.
+    const int64_t lower = vectorized_partition<zmm_vector<T>, T>(arr, start, upper, pivot1, false) - 1;
+    std::swap(arr[low], arr[lower]);
+
+    pivot_indices[0] = lower;
+    pivot_indices[1] = upper;
+}
+
+// Single-pivot three-way partition of arr[from_index, to_index) around arr[pivot_indices[0]].
+template <typename T>
+void avx512_single_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices){
+    const T pivot = arr[pivot_indices[0]];
+
+    // Pass 1 (greater-or-equal test): elements < pivot are moved in front of `lower`.
+    const int64_t lower =
+        vectorized_partition<zmm_vector<T>, T>(arr, from_index, to_index, pivot, false);
+
+    // Pass 2 (strictly-greater test) on the remainder: elements > pivot are moved to
+    // `upper` and beyond, leaving the elements not greater than the pivot in [lower, upper).
+    const int64_t upper =
+        vectorized_partition<zmm_vector<T>, T>(arr, lower, to_index, pivot, true);
+
+    // Report both boundaries back through pivot_indices:
+    //   [from_index, lower) < pivot, [lower, upper) == pivot, [upper, to_index) > pivot.
+    pivot_indices[0] = lower;
+    pivot_indices[1] = upper;
+}
+
+
+// Dispatches to the dual-pivot or the single-pivot partition routine; pivot_indices is both input and output.
+template <typename T>
+inline void avx512_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pivot) {
+    if (is_dual_pivot) avx512_dual_pivot_partition<T>(arr, from_index, to_index, pivot_indices);
+    else avx512_single_pivot_partition<T>(arr, from_index, to_index, pivot_indices);
+}
+
+
 #endif  // AVX512_QSORT_COMMON
diff --git a/src/java.base/linux/native/libx86_64/avxsort_linux_x86.cpp b/src/java.base/linux/native/libx86_64/avxsort_linux_x86.cpp
index 67d6285cea552..aeea98006ce48 100644
--- a/src/java.base/linux/native/libx86_64/avxsort_linux_x86.cpp
+++ b/src/java.base/linux/native/libx86_64/avxsort_linux_x86.cpp
@@ -32,24 +32,36 @@
 
 extern "C" {
 
-    DLL_PUBLIC void avx512_sort_int(int32_t *array_fromIndex, int64_t fromIndex,
-                                    int64_t toIndex) {
-        avx512_qsort<int32_t>(array_fromIndex, toIndex - fromIndex);
+    DLL_PUBLIC void avx512_sort_int(int32_t *array, int64_t from_index, int64_t to_index) {
+        avx512_qsort<int32_t>(array, from_index, to_index);
     }
 
-    DLL_PUBLIC void avx512_sort_long(int64_t *array_fromIndex, int64_t fromIndex,
-                                    int64_t toIndex) {
-        avx512_qsort<int64_t>(array_fromIndex, toIndex - fromIndex);
+    DLL_PUBLIC void avx512_sort_long(int64_t *array, int64_t from_index, int64_t to_index) {
+        avx512_qsort<int64_t>(array, from_index, to_index);
     }
 
-    DLL_PUBLIC void avx512_sort_float(float *array_fromIndex, int64_t fromIndex,
-                                    int64_t toIndex) {
-        avx512_qsort<float>(array_fromIndex, toIndex - fromIndex);
+    DLL_PUBLIC void avx512_sort_float(float *array, int64_t from_index, int64_t to_index) {
+        avx512_qsort<float>(array, from_index, to_index);
     }
 
-    DLL_PUBLIC void avx512_sort_double(double *array_fromIndex, int64_t fromIndex,
-                                    int64_t toIndex) {
-        avx512_qsort<double>(array_fromIndex, toIndex - fromIndex);
+    DLL_PUBLIC void avx512_sort_double(double *array, int64_t from_index, int64_t to_index) {
+        avx512_qsort<double>(array, from_index, to_index);
+    }
+
+    DLL_PUBLIC void avx512_partition_int(int32_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pivot) {
+        avx512_partition<int32_t>(array, from_index, to_index, pivot_indices, is_dual_pivot);
+    }
+
+    DLL_PUBLIC void avx512_partition_long(int64_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pivot) {
+        avx512_partition<int64_t>(array, from_index, to_index, pivot_indices, is_dual_pivot);
+    }
+
+    DLL_PUBLIC void avx512_partition_float(float *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pivot) {
+        avx512_partition<float>(array, from_index, to_index, pivot_indices, is_dual_pivot);
+    }
+
+    DLL_PUBLIC void avx512_partition_double(double *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pivot) {
+        avx512_partition<double>(array, from_index, to_index, pivot_indices, is_dual_pivot);
     }
 
 }
diff --git a/src/java.base/share/classes/java/util/Arrays.java b/src/java.base/share/classes/java/util/Arrays.java
index d5ce85d336074..27f4d38f2e4e2 100644
--- a/src/java.base/share/classes/java/util/Arrays.java
+++ b/src/java.base/share/classes/java/util/Arrays.java
@@ -81,7 +81,9 @@ private Arrays() {}
 
     /**
      * Sorts the specified array into ascending numerical order.
-     *
+     * While the intrinsic is free to choose its own sorting algorithm, the
+     * fallback implementation uses either mixed insertion sort or simple
+     * insertion sort.
      *
      * @param elemType the class of the elements of the array to be sorted
      * @param array the array to be sorted
@@ -90,17 +92,36 @@ private Arrays() {}
      * address pointing to the first element to sort from.
      * @param fromIndex the index of the first element, inclusive, to be sorted
      * @param toIndex the index of the last element, exclusive, to be sorted
+     * @param end the index of the last element for simple insertion sort (in
+     * the case of mixed insertion sort). In the fallback implementation,
+     * insertion sort is used if {@code end < 0}; otherwise mixed insertion sort is used.
      */
     @IntrinsicCandidate
-    private static void arraySort(Class<?> elemType, Object array, long offset, int fromIndex, int toIndex) {
-        switch (array) {
-            case int[] arr -> DualPivotQuicksort.sort(arr, 0, fromIndex, toIndex);
-            case long[] arr -> DualPivotQuicksort.sort(arr, 0, fromIndex, toIndex);
-            case float[] arr -> DualPivotQuicksort.sort(arr, 0, fromIndex, toIndex);
-            case double[] arr -> DualPivotQuicksort.sort(arr, 0, fromIndex, toIndex);
-            default -> throw new UnsupportedOperationException(
-                    "arraySort intrinsic not supported for this type: " + elemType);
-        }
+    static void arraySort(Class<?> elemType, Object array, long offset, int fromIndex, int toIndex, int end) {
+        DualPivotQuicksort.smallArraySort(array, fromIndex, toIndex, end);
+    }
+
+    /**
+     * Partitions the specified array based on the pivot(s) provided.
+     *
+     * @param elemType the class of the elements of the array to be partitioned
+     * @param array the array to be partitioned
+     * @param offset the relative offset, in bytes, from the base address of
+     * the array to partition, otherwise if the array is {@code null}, an absolute
+     * address pointing to the first element to partition from.
+     * @param fromIndex the index of the first element, inclusive, to be partitioned
+     * @param toIndex the index of the last element, exclusive, to be partitioned
+     * @param pivotIndices the array containing the indices of the pivots. After
+     * partitioning, this array is updated with the new indices of the pivots.
+     * @param pivot_offset the offset in bytes pointing to the base address of
+     * the array used to store the indices of the pivots.
+     * @param isDualPivot a boolean value to choose between dual pivot
+     * partitioning and single pivot partitioning
+     */
+    @IntrinsicCandidate
+    static void arrayPartition(Class<?> elemType, Object array, long offset, int fromIndex, int toIndex, int[] pivotIndices, long pivot_offset, boolean isDualPivot) {
+        if (isDualPivot) DualPivotQuicksort.partitionDualPivot(array, fromIndex, toIndex, pivotIndices);
+        else DualPivotQuicksort.partitionSinglePivot(array, fromIndex, toIndex, pivotIndices);
     }
 
     /*
@@ -122,8 +143,7 @@ private static void arraySort(Class<?> elemType, Object array, long offset, int
      * @param a the array to be sorted
      */
     public static void sort(int[] a) {
-        int offset = Unsafe.ARRAY_INT_BASE_OFFSET;
-        arraySort(int.class, a, offset, 0, a.length);
+        DualPivotQuicksort.sort(a, 0, 0, a.length);
     }
 
     /**
@@ -147,8 +167,7 @@ public static void sort(int[] a) {
      */
     public static void sort(int[] a, int fromIndex, int toIndex) {
         rangeCheck(a.length, fromIndex, toIndex);
-        int offset = Unsafe.ARRAY_INT_BASE_OFFSET + (fromIndex << ArraysSupport.LOG2_ARRAY_INT_INDEX_SCALE);
-        arraySort(int.class, a, offset, fromIndex, toIndex);
+        DualPivotQuicksort.sort(a, 0, fromIndex, toIndex);
     }
 
     /**
@@ -162,8 +181,7 @@ public static void sort(int[] a, int fromIndex, int toIndex) {
      * @param a the array to be sorted
      */
     public static void sort(long[] a) {
-        int offset = Unsafe.ARRAY_LONG_BASE_OFFSET;
-        arraySort(long.class, a, offset, 0, a.length);
+        DualPivotQuicksort.sort(a, 0, 0, a.length);
     }
 
     /**
@@ -187,8 +205,7 @@ public static void sort(long[] a) {
      */
     public static void sort(long[] a, int fromIndex, int toIndex) {
         rangeCheck(a.length, fromIndex, toIndex);
-        int offset = Unsafe.ARRAY_LONG_BASE_OFFSET + (fromIndex << ArraysSupport.LOG2_ARRAY_LONG_INDEX_SCALE);
-        arraySort(long.class, a, offset, fromIndex, toIndex);
+        DualPivotQuicksort.sort(a, 0, fromIndex, toIndex);
     }
 
     /**
@@ -324,8 +341,7 @@ public static void sort(byte[] a, int fromIndex, int toIndex) {
      * @param a the array to be sorted
      */
     public static void sort(float[] a) {
-        int offset = Unsafe.ARRAY_FLOAT_BASE_OFFSET;
-        arraySort(float.class, a, offset, 0, a.length);
+        DualPivotQuicksort.sort(a, 0, 0, a.length);
     }
 
     /**
@@ -357,8 +373,7 @@ public static void sort(float[] a) {
      */
     public static void sort(float[] a, int fromIndex, int toIndex) {
         rangeCheck(a.length, fromIndex, toIndex);
-        int offset = Unsafe.ARRAY_FLOAT_BASE_OFFSET + (fromIndex << ArraysSupport.LOG2_ARRAY_FLOAT_INDEX_SCALE);
-        arraySort(float.class, a, offset, fromIndex, toIndex);
+        DualPivotQuicksort.sort(a, 0, fromIndex, toIndex);
     }
 
     /**
@@ -380,8 +395,7 @@ public static void sort(float[] a, int fromIndex, int toIndex) {
      * @param a the array to be sorted
      */
     public static void sort(double[] a) {
-        int offset = Unsafe.ARRAY_DOUBLE_BASE_OFFSET;
-        arraySort(double.class, a, offset, 0, a.length);
+         DualPivotQuicksort.sort(a, 0, 0, a.length);
     }
 
     /**
@@ -413,8 +427,7 @@ public static void sort(double[] a) {
      */
     public static void sort(double[] a, int fromIndex, int toIndex) {
         rangeCheck(a.length, fromIndex, toIndex);
-        int offset = Unsafe.ARRAY_DOUBLE_BASE_OFFSET + (fromIndex << ArraysSupport.LOG2_ARRAY_DOUBLE_INDEX_SCALE);
-        arraySort(double.class, a, offset, fromIndex, toIndex);
+        DualPivotQuicksort.sort(a, 0, fromIndex, toIndex);
     }
 
     /**
diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
index 3dcc7fee1f525..7a7a906176ffd 100644
--- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java
+++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
@@ -27,6 +27,9 @@
 
 import java.util.concurrent.CountedCompleter;
 import java.util.concurrent.RecursiveTask;
+import java.util.Arrays;
+import jdk.internal.misc.Unsafe;
+
 
 /**
  * This class implements powerful and fully optimized versions, both
@@ -137,6 +140,100 @@ private static int getDepth(int parallelism, int size) {
         return depth;
     }
 
+    /**
+     * Sorts the specified range of the array using either insertion sort
+     * or mixed insertion sort, depending on the value of {@code end}:
+     * insertion sort if {@code end < 0}, mixed insertion sort otherwise.
+     *
+     * @param array the array to be sorted
+     * @param low the index of the first element, inclusive, to be sorted
+     * @param high the index of the last element, exclusive, to be sorted
+     * @param end a negative value to request plain insertion sort; otherwise
+     * the index of the last element for simple insertion sort, passed on to
+     * mixed insertion sort as its {@code end} argument
+     */
+    static void smallArraySort(Object array, int low, int high, int end) {
+       if (end < 0) insertionSort(array, low, high);
+       else mixedInsertionSort(array, low, end, high);
+    }
+
+    /**
+     * Sorts the specified range of the array using insertion sort.
+     *
+     * @param array the array to be sorted; must be an int[], long[], float[] or double[]
+     * @param low the index of the first element, inclusive, to be sorted
+     * @param high the index of the last element, exclusive, to be sorted
+     * @throws UnsupportedOperationException if array is of any other type
+     */
+    static void insertionSort(Object array, int low, int high) {
+        switch (array) {
+            case int[] arr -> insertionSort(arr, low, high);
+            case long[] arr -> insertionSort(arr, low, high);
+            case float[] arr -> insertionSort(arr, low, high);
+            case double[] arr -> insertionSort(arr, low, high);
+            default -> throw new UnsupportedOperationException();
+        }
+    }
+
+    /**
+     * Sorts the specified range of the array using mixed insertion sort.
+     *
+     * @param array the array to be sorted; must be an int[], long[], float[] or double[]
+     * @param low the index of the first element, inclusive, to be sorted
+     * @param end the index of the last element for simple insertion sort
+     * @param high the index of the last element, exclusive, to be sorted
+     * @throws UnsupportedOperationException if array is of any other type
+     */
+    static void mixedInsertionSort(Object array, int low, int end, int high) {
+        switch (array) {
+            case int[] arr -> mixedInsertionSort(arr, low, end, high);
+            case long[] arr ->  mixedInsertionSort(arr, low, end, high);
+            case float[] arr ->  mixedInsertionSort(arr, low, end, high);
+            case double[] arr ->  mixedInsertionSort(arr, low, end, high);
+            default -> throw new UnsupportedOperationException();
+        }
+    }
+
+    /**
+     * Partitions the specified range of the array using the two pivots specified.
+     *
+     * @param array the array to be partitioned; must be an int[], long[], float[] or double[]
+     * @param low the index of the first element, inclusive, for partitioning
+     * @param high the index of the last element, exclusive, for partitioning
+     * @param pivotIndices an array containing the indices of the two pivots to be used.
+     * After partitioning, the indices of the pivots are updated as well.
+     * @throws UnsupportedOperationException if array is of any other type
+     */
+    static void partitionDualPivot(Object array, int low, int high, int[] pivotIndices) {
+        switch(array) {
+            case int[] arr -> partitionDualPivot(arr, low, high, pivotIndices);
+            case long[] arr -> partitionDualPivot(arr, low, high, pivotIndices);
+            case float[] arr -> partitionDualPivot(arr, low, high, pivotIndices);
+            case double[] arr -> partitionDualPivot(arr, low, high, pivotIndices);
+            default -> throw new UnsupportedOperationException();
+        }
+    }
+
+    /**
+     * Partitions the specified range of the array using a single pivot specified.
+     *
+     * @param array the array to be partitioned; must be an int[], long[], float[] or double[]
+     * @param low the index of the first element, inclusive, for partitioning
+     * @param high the index of the last element, exclusive, for partitioning
+     * @param pivotIndices an array containing the index of the pivot to be used.
+     * After partitioning, the indices in this array are updated as well.
+     * @throws UnsupportedOperationException if array is of any other type
+     */
+    static void partitionSinglePivot(Object array, int low, int high, int[] pivotIndices) {
+        switch(array) {
+            case int[] arr -> partitionSinglePivot(arr, low, high, pivotIndices);
+            case long[] arr -> partitionSinglePivot(arr, low, high, pivotIndices);
+            case float[] arr -> partitionSinglePivot(arr, low, high, pivotIndices);
+            case double[] arr -> partitionSinglePivot(arr, low, high, pivotIndices);
+            default -> throw new UnsupportedOperationException();
+        }
+    }
+
     /**
      * Sorts the specified range of the array using parallel merge
      * sort and/or Dual-Pivot Quicksort.
@@ -178,12 +275,14 @@ static void sort(int[] a, int parallelism, int low, int high) {
     static void sort(Sorter sorter, int[] a, int bits, int low, int high) {
         while (true) {
             int end = high - 1, size = high - low;
+            int[] pivotIndices;
+            int baseOffset = Unsafe.ARRAY_INT_BASE_OFFSET;
 
             /*
              * Run mixed insertion sort on small non-leftmost parts.
              */
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
-                mixedInsertionSort(a, low, high - 3 * ((size >> 5) << 3), high);
+                Arrays.arraySort(int.class, a, baseOffset, low, high, high - 3 * ((size >> 5) << 3));
                 return;
             }
 
@@ -191,7 +290,7 @@ static void sort(Sorter sorter, int[] a, int bits, int low, int high) {
              * Invoke insertion sort on small leftmost part.
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
-                insertionSort(a, low, high);
+                Arrays.arraySort(int.class, a, baseOffset, low, high, -1);
                 return;
             }
 
@@ -271,78 +370,19 @@ && tryMergeRuns(sorter, a, low, size)) {
             /*
              * Partitioning with 2 pivots in case of different elements.
              */
-            if (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) {
-
+            boolean isDualPivot = (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]);
+            if (isDualPivot) {
                 /*
                  * Use the first and fifth of the five sorted elements as
                  * the pivots. These values are inexpensive approximation
                  * of tertiles. Note, that pivot1 < pivot2.
                  */
-                int pivot1 = a[e1];
-                int pivot2 = a[e5];
-
-                /*
-                 * The first and the last elements to be sorted are moved
-                 * to the locations formerly occupied by the pivots. When
-                 * partitioning is completed, the pivots are swapped back
-                 * into their final positions, and excluded from the next
-                 * subsequent sorting.
-                 */
-                a[e1] = a[lower];
-                a[e5] = a[upper];
-
-                /*
-                 * Skip elements, which are less or greater than the pivots.
-                 */
-                while (a[++lower] < pivot1);
-                while (a[--upper] > pivot2);
-
-                /*
-                 * Backward 3-interval partitioning
-                 *
-                 *   left part                 central part          right part
-                 * +------------------------------------------------------------+
-                 * |  < pivot1  |   ?   |  pivot1 <= && <= pivot2  |  > pivot2  |
-                 * +------------------------------------------------------------+
-                 *             ^       ^                            ^
-                 *             |       |                            |
-                 *           lower     k                          upper
-                 *
-                 * Invariants:
-                 *
-                 *              all in (low, lower] < pivot1
-                 *    pivot1 <= all in (k, upper)  <= pivot2
-                 *              all in [upper, end) > pivot2
-                 *
-                 * Pointer k is the last index of ?-part
-                 */
-                for (int unused = --lower, k = ++upper; --k > lower; ) {
-                    int ak = a[k];
+                pivotIndices = new int[] {e1, e5};
+                Arrays.arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                lower = pivotIndices[0];
+                upper = pivotIndices[1];
 
-                    if (ak < pivot1) { // Move a[k] to the left side
-                        while (lower < k) {
-                            if (a[++lower] >= pivot1) {
-                                if (a[lower] > pivot2) {
-                                    a[k] = a[--upper];
-                                    a[upper] = a[lower];
-                                } else {
-                                    a[k] = a[lower];
-                                }
-                                a[lower] = ak;
-                                break;
-                            }
-                        }
-                    } else if (ak > pivot2) { // Move a[k] to the right side
-                        a[k] = a[--upper];
-                        a[upper] = ak;
-                    }
-                }
 
-                /*
-                 * Swap the pivots into their final positions.
-                 */
-                a[low] = a[lower]; a[lower] = pivot1;
-                a[end] = a[upper]; a[upper] = pivot2;
 
                 /*
                  * Sort non-left parts recursively (possibly in parallel),
@@ -362,73 +402,186 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * Use the third of the five sorted elements as the pivot.
                  * This value is inexpensive approximation of the median.
                  */
-                int pivot = a[e3];
-
+                pivotIndices = new int[] {e3, e3};
+                Arrays.arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                lower = pivotIndices[0];
+                upper = pivotIndices[1];
                 /*
-                 * The first element to be sorted is moved to the
-                 * location formerly occupied by the pivot. After
-                 * completion of partitioning the pivot is swapped
-                 * back into its final position, and excluded from
-                 * the next subsequent sorting.
+                 * Sort the right part (possibly in parallel), excluding
+                 * known pivot. All elements from the central part are
+                 * equal and therefore already sorted.
                  */
-                a[e3] = a[lower];
+                if (size > MIN_PARALLEL_SORT_SIZE && sorter != null) {
+                    sorter.forkSorter(bits | 1, upper, high);
+                } else {
+                    sort(sorter, a, bits | 1, upper, high);
+                }
+            }
+            high = lower; // Iterate along the left part
+        }
+    }
 
-                /*
-                 * Traditional 3-way (Dutch National Flag) partitioning
-                 *
-                 *   left part                 central part    right part
-                 * +------------------------------------------------------+
-                 * |   < pivot   |     ?     |   == pivot   |   > pivot   |
-                 * +------------------------------------------------------+
-                 *              ^           ^                ^
-                 *              |           |                |
-                 *            lower         k              upper
-                 *
-                 * Invariants:
-                 *
-                 *   all in (low, lower] < pivot
-                 *   all in (k, upper)  == pivot
-                 *   all in [upper, end] > pivot
-                 *
-                 * Pointer k is the last index of ?-part
-                 */
-                for (int k = ++upper; --k > lower; ) {
-                    int ak = a[k];
+    /**
+     * Partitions the specified range of the array using the two pivots specified.
+     *
+     * @param a the array to be partitioned
+     * @param low the index of the first element, inclusive, for partitioning
+     * @param high the index of the last element, exclusive, for partitioning
+     * @param pivotIndices an array containing the indices of the two pivots to be used.
+     * After partitioning, this array is updated with the final indices of the pivots.
+     *
+     */
+    private static void partitionDualPivot(int[] a, int low, int high, int[] pivotIndices) {
+        int end = high - 1;
+        int lower = low;
+        int upper = end;
 
-                    if (ak != pivot) {
-                        a[k] = pivot;
+        int e1 = pivotIndices[0];
+        int e5 = pivotIndices[1];
+        int pivot1 = a[e1];
+        int pivot2 = a[e5];
 
-                        if (ak < pivot) { // Move a[k] to the left side
-                            while (a[++lower] < pivot);
+        /*
+        * The first and the last elements to be sorted are moved
+        * to the locations formerly occupied by the pivots. When
+        * partitioning is completed, the pivots are swapped back
+        * into their final positions, and excluded from the next
+        * subsequent sorting.
+        */
+        a[e1] = a[lower];
+        a[e5] = a[upper];
 
-                            if (a[lower] > pivot) {
-                                a[--upper] = a[lower];
-                            }
-                            a[lower] = ak;
-                        } else { // ak > pivot - Move a[k] to the right side
-                            a[--upper] = ak;
+        /*
+        * Skip elements, which are less or greater than the pivots.
+        */
+        while (a[++lower] < pivot1);
+        while (a[--upper] > pivot2);
+
+        /*
+        * Backward 3-interval partitioning
+        *
+        *   left part                 central part          right part
+        * +------------------------------------------------------------+
+        * |  < pivot1  |   ?   |  pivot1 <= && <= pivot2  |  > pivot2  |
+        * +------------------------------------------------------------+
+        *             ^       ^                            ^
+        *             |       |                            |
+        *           lower     k                          upper
+        *
+        * Invariants:
+        *
+        *              all in (low, lower] < pivot1
+        *    pivot1 <= all in (k, upper)  <= pivot2
+        *              all in [upper, end) > pivot2
+        *
+        * Pointer k is the last index of ?-part
+        */
+        for (int unused = --lower, k = ++upper; --k > lower; ) {
+            int ak = a[k];
+
+            if (ak < pivot1) { // Move a[k] to the left side
+                while (lower < k) {
+                    if (a[++lower] >= pivot1) {
+                        if (a[lower] > pivot2) {
+                            a[k] = a[--upper];
+                            a[upper] = a[lower];
+                        } else {
+                            a[k] = a[lower];
                         }
+                        a[lower] = ak;
+                        break;
                     }
                 }
+            } else if (ak > pivot2) { // Move a[k] to the right side
+                a[k] = a[--upper];
+                a[upper] = ak;
+            }
+        }
 
-                /*
-                 * Swap the pivot into its final position.
-                 */
-                a[low] = a[lower]; a[lower] = pivot;
+        /*
+         * Swap the pivots into their final positions.
+         */
+        a[low] = a[lower]; a[lower] = pivot1;
+        a[end] = a[upper]; a[upper] = pivot2;
 
-                /*
-                 * Sort the right part (possibly in parallel), excluding
-                 * known pivot. All elements from the central part are
-                 * equal and therefore already sorted.
-                 */
-                if (size > MIN_PARALLEL_SORT_SIZE && sorter != null) {
-                    sorter.forkSorter(bits | 1, upper, high);
-                } else {
-                    sort(sorter, a, bits | 1, upper, high);
+        pivotIndices[0] = lower;
+        pivotIndices[1] = upper;
+    }
+
+
+
+    /**
+     * Partitions the specified range of the array using a single pivot specified.
+     *
+     * @param a the array to be partitioned
+     * @param low the index of the first element, inclusive, for partitioning
+     * @param high the index of the last element, exclusive, for partitioning
+     * @param pivotIndices an array whose first element is the index of the pivot to be used.
+     * After partitioning, it holds the final index of the pivot followed by the
+     * index of the first element of the right part.
+     */
+    private static void partitionSinglePivot(int[] a, int low, int high, int[] pivotIndices) {
+        int end = high - 1;
+        int lower = low;
+        int upper = end;
+
+
+        int e3 = pivotIndices[0];
+        int pivot = a[e3];
+
+        /*
+        * The first element to be sorted is moved to the
+        * location formerly occupied by the pivot. After
+        * completion of partitioning the pivot is swapped
+        * back into its final position, and excluded from
+        * the next subsequent sorting.
+        */
+        a[e3] = a[lower];
+
+        /*
+        * Traditional 3-way (Dutch National Flag) partitioning
+        *
+        *   left part                 central part    right part
+        * +------------------------------------------------------+
+        * |   < pivot   |     ?     |   == pivot   |   > pivot   |
+        * +------------------------------------------------------+
+        *              ^           ^                ^
+        *              |           |                |
+        *            lower         k              upper
+        *
+        * Invariants:
+        *
+        *   all in (low, lower] < pivot
+        *   all in (k, upper)  == pivot
+        *   all in [upper, end] > pivot
+        *
+        * Pointer k is the last index of ?-part
+        */
+        for (int k = ++upper; --k > lower; ) {
+            int ak = a[k];
+
+            if (ak != pivot) {
+                a[k] = pivot;
+
+                if (ak < pivot) { // Move a[k] to the left side
+                    while (a[++lower] < pivot);
+
+                    if (a[lower] > pivot) {
+                        a[--upper] = a[lower];
+                    }
+                    a[lower] = ak;
+                } else { // ak > pivot - Move a[k] to the right side
+                    a[--upper] = ak;
                 }
             }
-            high = lower; // Iterate along the left part
         }
+
+        /*
+        * Swap the pivot into its final position.
+        */
+        a[low] = a[lower]; a[lower] = pivot;
+        pivotIndices[0] = lower;
+        pivotIndices[1] = upper;
     }
 
     /**
@@ -932,12 +1085,14 @@ static void sort(long[] a, int parallelism, int low, int high) {
     static void sort(Sorter sorter, long[] a, int bits, int low, int high) {
         while (true) {
             int end = high - 1, size = high - low;
+            int[] pivotIndices;
+            int baseOffset = Unsafe.ARRAY_LONG_BASE_OFFSET;
 
             /*
              * Run mixed insertion sort on small non-leftmost parts.
              */
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
-                mixedInsertionSort(a, low, high - 3 * ((size >> 5) << 3), high);
+                Arrays.arraySort(long.class, a, baseOffset, low, high, high - 3 * ((size >> 5) << 3));
                 return;
             }
 
@@ -945,7 +1100,7 @@ static void sort(Sorter sorter, long[] a, int bits, int low, int high) {
              * Invoke insertion sort on small leftmost part.
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
-                insertionSort(a, low, high);
+                Arrays.arraySort(long.class, a, baseOffset, low, high, -1);
                 return;
             }
 
@@ -1025,164 +1180,214 @@ && tryMergeRuns(sorter, a, low, size)) {
             /*
              * Partitioning with 2 pivots in case of different elements.
              */
-            if (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) {
+            boolean isDualPivot = (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]);
+            if(isDualPivot)  {
 
                 /*
                  * Use the first and fifth of the five sorted elements as
                  * the pivots. These values are inexpensive approximation
                  * of tertiles. Note, that pivot1 < pivot2.
                  */
-                long pivot1 = a[e1];
-                long pivot2 = a[e5];
-
+                pivotIndices = new int[] {e1, e5};
+                Arrays.arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                lower = pivotIndices[0];
+                upper = pivotIndices[1];
                 /*
-                 * The first and the last elements to be sorted are moved
-                 * to the locations formerly occupied by the pivots. When
-                 * partitioning is completed, the pivots are swapped back
-                 * into their final positions, and excluded from the next
-                 * subsequent sorting.
+                 * Sort non-left parts recursively (possibly in parallel),
+                 * excluding known pivots.
                  */
-                a[e1] = a[lower];
-                a[e5] = a[upper];
+                if (size > MIN_PARALLEL_SORT_SIZE && sorter != null) {
+                    sorter.forkSorter(bits | 1, lower + 1, upper);
+                    sorter.forkSorter(bits | 1, upper + 1, high);
+                } else {
+                    sort(sorter, a, bits | 1, lower + 1, upper);
+                    sort(sorter, a, bits | 1, upper + 1, high);
+                }
+
+            } else { // Use single pivot in case of many equal elements
 
                 /*
-                 * Skip elements, which are less or greater than the pivots.
+                 * Use the third of the five sorted elements as the pivot.
+                 * This value is inexpensive approximation of the median.
                  */
-                while (a[++lower] < pivot1);
-                while (a[--upper] > pivot2);
-
+                pivotIndices = new int[] {e3, e3};
+                Arrays.arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                lower = pivotIndices[0];
+                upper = pivotIndices[1];
                 /*
-                 * Backward 3-interval partitioning
-                 *
-                 *   left part                 central part          right part
-                 * +------------------------------------------------------------+
-                 * |  < pivot1  |   ?   |  pivot1 <= && <= pivot2  |  > pivot2  |
-                 * +------------------------------------------------------------+
-                 *             ^       ^                            ^
-                 *             |       |                            |
-                 *           lower     k                          upper
-                 *
-                 * Invariants:
-                 *
-                 *              all in (low, lower] < pivot1
-                 *    pivot1 <= all in (k, upper)  <= pivot2
-                 *              all in [upper, end) > pivot2
-                 *
-                 * Pointer k is the last index of ?-part
+                 * Sort the right part (possibly in parallel), excluding
+                 * known pivot. All elements from the central part are
+                 * equal and therefore already sorted.
                  */
-                for (int unused = --lower, k = ++upper; --k > lower; ) {
-                    long ak = a[k];
+                if (size > MIN_PARALLEL_SORT_SIZE && sorter != null) {
+                    sorter.forkSorter(bits | 1, upper, high);
+                } else {
+                    sort(sorter, a, bits | 1, upper, high);
+                }
+            }
+            high = lower; // Iterate along the left part
+        }
+    }
 
-                    if (ak < pivot1) { // Move a[k] to the left side
-                        while (lower < k) {
-                            if (a[++lower] >= pivot1) {
-                                if (a[lower] > pivot2) {
-                                    a[k] = a[--upper];
-                                    a[upper] = a[lower];
-                                } else {
-                                    a[k] = a[lower];
-                                }
-                                a[lower] = ak;
-                                break;
-                            }
-                        }
-                    } else if (ak > pivot2) { // Move a[k] to the right side
-                        a[k] = a[--upper];
-                        a[upper] = ak;
-                    }
-                }
+    /**
+     * Partitions the specified range of the array using the two pivots specified.
+     *
+     * @param a the array to be partitioned
+     * @param low the index of the first element, inclusive, for partitioning
+     * @param high the index of the last element, exclusive, for partitioning
+     * @param pivotIndices an array containing the indices of the two pivots to be used.
+     * After partitioning, this array is updated with the final indices of the pivots.
+     *
+     */
+    private static void partitionDualPivot(long[] a, int low, int high, int[] pivotIndices) {
+        int end = high - 1;
+        int lower = low;
+        int upper = end;
 
-                /*
-                 * Swap the pivots into their final positions.
-                 */
-                a[low] = a[lower]; a[lower] = pivot1;
-                a[end] = a[upper]; a[upper] = pivot2;
+        int e1 = pivotIndices[0];
+        int e5 = pivotIndices[1];
+        long pivot1 = a[e1];
+        long pivot2 = a[e5];
 
-                /*
-                 * Sort non-left parts recursively (possibly in parallel),
-                 * excluding known pivots.
-                 */
-                if (size > MIN_PARALLEL_SORT_SIZE && sorter != null) {
-                    sorter.forkSorter(bits | 1, lower + 1, upper);
-                    sorter.forkSorter(bits | 1, upper + 1, high);
-                } else {
-                    sort(sorter, a, bits | 1, lower + 1, upper);
-                    sort(sorter, a, bits | 1, upper + 1, high);
-                }
+        /*
+        * The first and the last elements to be sorted are moved
+        * to the locations formerly occupied by the pivots. When
+        * partitioning is completed, the pivots are swapped back
+        * into their final positions, and excluded from the next
+        * subsequent sorting.
+        */
+        a[e1] = a[lower];
+        a[e5] = a[upper];
 
-            } else { // Use single pivot in case of many equal elements
+        /*
+        * Skip elements, which are less or greater than the pivots.
+        */
+        while (a[++lower] < pivot1);
+        while (a[--upper] > pivot2);
 
-                /*
-                 * Use the third of the five sorted elements as the pivot.
-                 * This value is inexpensive approximation of the median.
-                 */
-                long pivot = a[e3];
+        /*
+        * Backward 3-interval partitioning
+        *
+        *   left part                 central part          right part
+        * +------------------------------------------------------------+
+        * |  < pivot1  |   ?   |  pivot1 <= && <= pivot2  |  > pivot2  |
+        * +------------------------------------------------------------+
+        *             ^       ^                            ^
+        *             |       |                            |
+        *           lower     k                          upper
+        *
+        * Invariants:
+        *
+        *              all in (low, lower] < pivot1
+        *    pivot1 <= all in (k, upper)  <= pivot2
+        *              all in [upper, end) > pivot2
+        *
+        * Pointer k is the last index of ?-part
+        */
+        for (int unused = --lower, k = ++upper; --k > lower; ) {
+            long ak = a[k];
+
+            if (ak < pivot1) { // Move a[k] to the left side
+                while (lower < k) {
+                    if (a[++lower] >= pivot1) {
+                        if (a[lower] > pivot2) {
+                            a[k] = a[--upper];
+                            a[upper] = a[lower];
+                        } else {
+                            a[k] = a[lower];
+                        }
+                        a[lower] = ak;
+                        break;
+                    }
+                }
+            } else if (ak > pivot2) { // Move a[k] to the right side
+                a[k] = a[--upper];
+                a[upper] = ak;
+            }
+        }
 
-                /*
-                 * The first element to be sorted is moved to the
-                 * location formerly occupied by the pivot. After
-                 * completion of partitioning the pivot is swapped
-                 * back into its final position, and excluded from
-                 * the next subsequent sorting.
-                 */
-                a[e3] = a[lower];
+        /*
+         * Swap the pivots into their final positions.
+         */
+        a[low] = a[lower]; a[lower] = pivot1;
+        a[end] = a[upper]; a[upper] = pivot2;
 
-                /*
-                 * Traditional 3-way (Dutch National Flag) partitioning
-                 *
-                 *   left part                 central part    right part
-                 * +------------------------------------------------------+
-                 * |   < pivot   |     ?     |   == pivot   |   > pivot   |
-                 * +------------------------------------------------------+
-                 *              ^           ^                ^
-                 *              |           |                |
-                 *            lower         k              upper
-                 *
-                 * Invariants:
-                 *
-                 *   all in (low, lower] < pivot
-                 *   all in (k, upper)  == pivot
-                 *   all in [upper, end] > pivot
-                 *
-                 * Pointer k is the last index of ?-part
-                 */
-                for (int k = ++upper; --k > lower; ) {
-                    long ak = a[k];
+        pivotIndices[0] = lower;
+        pivotIndices[1] = upper;
+    }
 
-                    if (ak != pivot) {
-                        a[k] = pivot;
 
-                        if (ak < pivot) { // Move a[k] to the left side
-                            while (a[++lower] < pivot);
+    /**
+     * Partitions the specified range of the array using a single pivot specified.
+     *
+     * @param a the array to be partitioned
+     * @param low the index of the first element, inclusive, for partitioning
+     * @param high the index of the last element, exclusive, for partitioning
+     * @param pivotIndices an array whose first element is the index of the pivot to be used.
+     * After partitioning, it holds the final index of the pivot followed by the
+     * index of the first element of the right part.
+     */
+    private static void partitionSinglePivot(long[] a, int low, int high, int[] pivotIndices) {
+        int end = high - 1;
+        int lower = low;
+        int upper = end;
 
-                            if (a[lower] > pivot) {
-                                a[--upper] = a[lower];
-                            }
-                            a[lower] = ak;
-                        } else { // ak > pivot - Move a[k] to the right side
-                            a[--upper] = ak;
-                        }
-                    }
-                }
+        int e3 = pivotIndices[0];
+        long pivot = a[e3];
 
-                /*
-                 * Swap the pivot into its final position.
-                 */
-                a[low] = a[lower]; a[lower] = pivot;
+        /*
+        * The first element to be sorted is moved to the
+        * location formerly occupied by the pivot. After
+        * completion of partitioning the pivot is swapped
+        * back into its final position, and excluded from
+        * the next subsequent sorting.
+        */
+        a[e3] = a[lower];
 
-                /*
-                 * Sort the right part (possibly in parallel), excluding
-                 * known pivot. All elements from the central part are
-                 * equal and therefore already sorted.
-                 */
-                if (size > MIN_PARALLEL_SORT_SIZE && sorter != null) {
-                    sorter.forkSorter(bits | 1, upper, high);
-                } else {
-                    sort(sorter, a, bits | 1, upper, high);
+        /*
+        * Traditional 3-way (Dutch National Flag) partitioning
+        *
+        *   left part                 central part    right part
+        * +------------------------------------------------------+
+        * |   < pivot   |     ?     |   == pivot   |   > pivot   |
+        * +------------------------------------------------------+
+        *              ^           ^                ^
+        *              |           |                |
+        *            lower         k              upper
+        *
+        * Invariants:
+        *
+        *   all in (low, lower] < pivot
+        *   all in (k, upper)  == pivot
+        *   all in [upper, end] > pivot
+        *
+        * Pointer k is the last index of ?-part
+        */
+        for (int k = ++upper; --k > lower; ) {
+            long ak = a[k];
+
+            if (ak != pivot) {
+                a[k] = pivot;
+
+                if (ak < pivot) { // Move a[k] to the left side
+                    while (a[++lower] < pivot);
+
+                    if (a[lower] > pivot) {
+                        a[--upper] = a[lower];
+                    }
+                    a[lower] = ak;
+                } else { // ak > pivot - Move a[k] to the right side
+                    a[--upper] = ak;
                 }
             }
-            high = lower; // Iterate along the left part
         }
+
+        /*
+         * Swap the pivot into its final position.
+         */
+        a[low] = a[lower]; a[lower] = pivot;
+        pivotIndices[0] = lower;
+        pivotIndices[1] = upper;
     }
 
     /**
@@ -2473,12 +2678,14 @@ static void sort(float[] a, int parallelism, int low, int high) {
     static void sort(Sorter sorter, float[] a, int bits, int low, int high) {
         while (true) {
             int end = high - 1, size = high - low;
+            int[] pivotIndices;
+            int baseOffset = Unsafe.ARRAY_FLOAT_BASE_OFFSET;
 
             /*
              * Run mixed insertion sort on small non-leftmost parts.
              */
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
-                mixedInsertionSort(a, low, high - 3 * ((size >> 5) << 3), high);
+                Arrays.arraySort(float.class, a, baseOffset, low, high, high - 3 * ((size >> 5) << 3));
                 return;
             }
 
@@ -2486,7 +2693,7 @@ static void sort(Sorter sorter, float[] a, int bits, int low, int high) {
              * Invoke insertion sort on small leftmost part.
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
-                insertionSort(a, low, high);
+                Arrays.arraySort(float.class, a, baseOffset, low, high, -1);
                 return;
             }
 
@@ -2566,79 +2773,18 @@ && tryMergeRuns(sorter, a, low, size)) {
             /*
              * Partitioning with 2 pivots in case of different elements.
              */
-            if (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) {
+            boolean isDualPivot = (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]);
+            if(isDualPivot)  {
 
                 /*
                  * Use the first and fifth of the five sorted elements as
                  * the pivots. These values are inexpensive approximation
                  * of tertiles. Note, that pivot1 < pivot2.
                  */
-                float pivot1 = a[e1];
-                float pivot2 = a[e5];
-
-                /*
-                 * The first and the last elements to be sorted are moved
-                 * to the locations formerly occupied by the pivots. When
-                 * partitioning is completed, the pivots are swapped back
-                 * into their final positions, and excluded from the next
-                 * subsequent sorting.
-                 */
-                a[e1] = a[lower];
-                a[e5] = a[upper];
-
-                /*
-                 * Skip elements, which are less or greater than the pivots.
-                 */
-                while (a[++lower] < pivot1);
-                while (a[--upper] > pivot2);
-
-                /*
-                 * Backward 3-interval partitioning
-                 *
-                 *   left part                 central part          right part
-                 * +------------------------------------------------------------+
-                 * |  < pivot1  |   ?   |  pivot1 <= && <= pivot2  |  > pivot2  |
-                 * +------------------------------------------------------------+
-                 *             ^       ^                            ^
-                 *             |       |                            |
-                 *           lower     k                          upper
-                 *
-                 * Invariants:
-                 *
-                 *              all in (low, lower] < pivot1
-                 *    pivot1 <= all in (k, upper)  <= pivot2
-                 *              all in [upper, end) > pivot2
-                 *
-                 * Pointer k is the last index of ?-part
-                 */
-                for (int unused = --lower, k = ++upper; --k > lower; ) {
-                    float ak = a[k];
-
-                    if (ak < pivot1) { // Move a[k] to the left side
-                        while (lower < k) {
-                            if (a[++lower] >= pivot1) {
-                                if (a[lower] > pivot2) {
-                                    a[k] = a[--upper];
-                                    a[upper] = a[lower];
-                                } else {
-                                    a[k] = a[lower];
-                                }
-                                a[lower] = ak;
-                                break;
-                            }
-                        }
-                    } else if (ak > pivot2) { // Move a[k] to the right side
-                        a[k] = a[--upper];
-                        a[upper] = ak;
-                    }
-                }
-
-                /*
-                 * Swap the pivots into their final positions.
-                 */
-                a[low] = a[lower]; a[lower] = pivot1;
-                a[end] = a[upper]; a[upper] = pivot2;
-
+                pivotIndices = new int[] {e1, e5};
+                Arrays.arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                lower = pivotIndices[0];
+                upper = pivotIndices[1];
                 /*
                  * Sort non-left parts recursively (possibly in parallel),
                  * excluding known pivots.
@@ -2657,73 +2803,184 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * Use the third of the five sorted elements as the pivot.
                  * This value is inexpensive approximation of the median.
                  */
-                float pivot = a[e3];
-
+                pivotIndices = new int[] {e3, e3};
+                Arrays.arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                lower = pivotIndices[0];
+                upper = pivotIndices[1];
                 /*
-                 * The first element to be sorted is moved to the
-                 * location formerly occupied by the pivot. After
-                 * completion of partitioning the pivot is swapped
-                 * back into its final position, and excluded from
-                 * the next subsequent sorting.
+                 * Sort the right part (possibly in parallel), excluding
+                 * known pivot. All elements from the central part are
+                 * equal and therefore already sorted.
                  */
-                a[e3] = a[lower];
+                if (size > MIN_PARALLEL_SORT_SIZE && sorter != null) {
+                    sorter.forkSorter(bits | 1, upper, high);
+                } else {
+                    sort(sorter, a, bits | 1, upper, high);
+                }
+            }
+            high = lower; // Iterate along the left part
+        }
+    }
 
-                /*
-                 * Traditional 3-way (Dutch National Flag) partitioning
-                 *
-                 *   left part                 central part    right part
-                 * +------------------------------------------------------+
-                 * |   < pivot   |     ?     |   == pivot   |   > pivot   |
-                 * +------------------------------------------------------+
-                 *              ^           ^                ^
-                 *              |           |                |
-                 *            lower         k              upper
-                 *
-                 * Invariants:
-                 *
-                 *   all in (low, lower] < pivot
-                 *   all in (k, upper)  == pivot
-                 *   all in [upper, end] > pivot
-                 *
-                 * Pointer k is the last index of ?-part
-                 */
-                for (int k = ++upper; --k > lower; ) {
-                    float ak = a[k];
+    /**
+     * Partitions the specified range of the array using the two pivots specified.
+     *
+     * @param a the array to be partitioned
+     * @param low the index of the first element, inclusive, for partitioning
+     * @param high the index of the last element, exclusive, for partitioning
+     * @param pivotIndices an array containing the indices of the two pivots to be used.
+     * After partitioning, this array is updated with the final indices of the two pivots.
+     *
+     */
+    private static void partitionDualPivot(float[] a, int low, int high, int[] pivotIndices) {
+        int end = high - 1;
+        int lower = low;
+        int upper = end;
 
-                    if (ak != pivot) {
-                        a[k] = pivot;
+        int e1 = pivotIndices[0];
+        int e5 = pivotIndices[1];
+        float pivot1 = a[e1];
+        float pivot2 = a[e5];
 
-                        if (ak < pivot) { // Move a[k] to the left side
-                            while (a[++lower] < pivot);
+        /*
+        * The first and the last elements to be sorted are moved
+        * to the locations formerly occupied by the pivots. When
+        * partitioning is completed, the pivots are swapped back
+        * into their final positions, and excluded from the next
+        * subsequent sorting.
+        */
+        a[e1] = a[lower];
+        a[e5] = a[upper];
 
-                            if (a[lower] > pivot) {
-                                a[--upper] = a[lower];
-                            }
-                            a[lower] = ak;
-                        } else { // ak > pivot - Move a[k] to the right side
-                            a[--upper] = ak;
+        /*
+        * Skip elements, which are less or greater than the pivots.
+        */
+                while (a[++lower] < pivot1);
+                while (a[--upper] > pivot2);
+
+        /*
+        * Backward 3-interval partitioning
+        *
+        *   left part                 central part          right part
+        * +------------------------------------------------------------+
+        * |  < pivot1  |   ?   |  pivot1 <= && <= pivot2  |  > pivot2  |
+        * +------------------------------------------------------------+
+        *             ^       ^                            ^
+        *             |       |                            |
+        *           lower     k                          upper
+        *
+        * Invariants:
+        *
+        *              all in (low, lower] < pivot1
+        *    pivot1 <= all in (k, upper)  <= pivot2
+        *              all in [upper, end) > pivot2
+        *
+        * Pointer k is the last index of ?-part
+        */
+        for (int unused = --lower, k = ++upper; --k > lower; ) {
+            float ak = a[k];
+
+            if (ak < pivot1) { // Move a[k] to the left side
+                while (lower < k) {
+                    if (a[++lower] >= pivot1) {
+                        if (a[lower] > pivot2) {
+                            a[k] = a[--upper];
+                            a[upper] = a[lower];
+                        } else {
+                            a[k] = a[lower];
                         }
+                        a[lower] = ak;
+                        break;
                     }
                 }
+            } else if (ak > pivot2) { // Move a[k] to the right side
+                a[k] = a[--upper];
+                a[upper] = ak;
+            }
+        }
 
-                /*
-                 * Swap the pivot into its final position.
-                 */
-                a[low] = a[lower]; a[lower] = pivot;
+        /*
+         * Swap the pivots into their final positions.
+         */
+        a[low] = a[lower]; a[lower] = pivot1;
+        a[end] = a[upper]; a[upper] = pivot2;
 
-                /*
-                 * Sort the right part (possibly in parallel), excluding
-                 * known pivot. All elements from the central part are
-                 * equal and therefore already sorted.
-                 */
-                if (size > MIN_PARALLEL_SORT_SIZE && sorter != null) {
-                    sorter.forkSorter(bits | 1, upper, high);
-                } else {
-                    sort(sorter, a, bits | 1, upper, high);
+        pivotIndices[0] = lower;
+        pivotIndices[1] = upper;
+    }
+
+
+    /**
+     * Partitions the specified range of the array using a single pivot specified.
+     *
+     * @param a the array to be partitioned
+     * @param low the index of the first element, inclusive, for partitioning
+     * @param high the index of the last element, exclusive, for partitioning
+     * @param pivotIndices an array whose first element is the index of the pivot to be used.
+     * After partitioning, it holds the final pivot index and the first index of the right part.
+     *
+     */
+    private static void partitionSinglePivot(float[] a, int low, int high, int[] pivotIndices) {
+        int end = high - 1;
+        int lower = low;
+        int upper = end;
+
+        int e3 = pivotIndices[0];
+        float pivot = a[e3];
+
+        /*
+        * The first element to be sorted is moved to the
+        * location formerly occupied by the pivot. After
+        * completion of partitioning the pivot is swapped
+        * back into its final position, and excluded from
+        * the next subsequent sorting.
+        */
+        a[e3] = a[lower];
+
+        /*
+        * Traditional 3-way (Dutch National Flag) partitioning
+        *
+        *   left part                 central part    right part
+        * +------------------------------------------------------+
+        * |   < pivot   |     ?     |   == pivot   |   > pivot   |
+        * +------------------------------------------------------+
+        *              ^           ^                ^
+        *              |           |                |
+        *            lower         k              upper
+        *
+        * Invariants:
+        *
+        *   all in (low, lower] < pivot
+        *   all in (k, upper)  == pivot
+        *   all in [upper, end] > pivot
+        *
+        * Pointer k is the last index of ?-part
+        */
+        for (int k = ++upper; --k > lower; ) {
+            float ak = a[k];
+
+            if (ak != pivot) {
+                a[k] = pivot;
+
+                if (ak < pivot) { // Move a[k] to the left side
+                    while (a[++lower] < pivot);
+
+                    if (a[lower] > pivot) {
+                        a[--upper] = a[lower];
+                    }
+                    a[lower] = ak;
+                } else { // ak > pivot - Move a[k] to the right side
+                    a[--upper] = ak;
                 }
             }
-            high = lower; // Iterate along the left part
         }
+
+        /*
+            * Swap the pivot into its final position.
+            */
+        a[low] = a[lower]; a[lower] = pivot;
+        pivotIndices[0] = lower;
+        pivotIndices[1] = upper;
     }
 
     /**
@@ -3279,12 +3536,14 @@ static void sort(double[] a, int parallelism, int low, int high) {
     static void sort(Sorter sorter, double[] a, int bits, int low, int high) {
         while (true) {
             int end = high - 1, size = high - low;
+            int[] pivotIndices;
+            int baseOffset = Unsafe.ARRAY_DOUBLE_BASE_OFFSET;
 
             /*
              * Run mixed insertion sort on small non-leftmost parts.
              */
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
-                mixedInsertionSort(a, low, high - 3 * ((size >> 5) << 3), high);
+                Arrays.arraySort(double.class, a, baseOffset, low, high, high - 3 * ((size >> 5) << 3));
                 return;
             }
 
@@ -3292,7 +3551,7 @@ static void sort(Sorter sorter, double[] a, int bits, int low, int high) {
              * Invoke insertion sort on small leftmost part.
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
-                insertionSort(a, low, high);
+                Arrays.arraySort(double.class, a, baseOffset, low, high, -1);
                 return;
             }
 
@@ -3372,79 +3631,18 @@ && tryMergeRuns(sorter, a, low, size)) {
             /*
              * Partitioning with 2 pivots in case of different elements.
              */
-            if (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) {
-
-                /*
-                 * Use the first and fifth of the five sorted elements as
-                 * the pivots. These values are inexpensive approximation
-                 * of tertiles. Note, that pivot1 < pivot2.
-                 */
-                double pivot1 = a[e1];
-                double pivot2 = a[e5];
-
-                /*
-                 * The first and the last elements to be sorted are moved
-                 * to the locations formerly occupied by the pivots. When
-                 * partitioning is completed, the pivots are swapped back
-                 * into their final positions, and excluded from the next
-                 * subsequent sorting.
-                 */
-                a[e1] = a[lower];
-                a[e5] = a[upper];
-
-                /*
-                 * Skip elements, which are less or greater than the pivots.
-                 */
-                while (a[++lower] < pivot1);
-                while (a[--upper] > pivot2);
-
-                /*
-                 * Backward 3-interval partitioning
-                 *
-                 *   left part                 central part          right part
-                 * +------------------------------------------------------------+
-                 * |  < pivot1  |   ?   |  pivot1 <= && <= pivot2  |  > pivot2  |
-                 * +------------------------------------------------------------+
-                 *             ^       ^                            ^
-                 *             |       |                            |
-                 *           lower     k                          upper
-                 *
-                 * Invariants:
-                 *
-                 *              all in (low, lower] < pivot1
-                 *    pivot1 <= all in (k, upper)  <= pivot2
-                 *              all in [upper, end) > pivot2
-                 *
-                 * Pointer k is the last index of ?-part
-                 */
-                for (int unused = --lower, k = ++upper; --k > lower; ) {
-                    double ak = a[k];
-
-                    if (ak < pivot1) { // Move a[k] to the left side
-                        while (lower < k) {
-                            if (a[++lower] >= pivot1) {
-                                if (a[lower] > pivot2) {
-                                    a[k] = a[--upper];
-                                    a[upper] = a[lower];
-                                } else {
-                                    a[k] = a[lower];
-                                }
-                                a[lower] = ak;
-                                break;
-                            }
-                        }
-                    } else if (ak > pivot2) { // Move a[k] to the right side
-                        a[k] = a[--upper];
-                        a[upper] = ak;
-                    }
-                }
+            boolean isDualPivot = (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]);
+            if(isDualPivot)  {
 
                 /*
-                 * Swap the pivots into their final positions.
-                 */
-                a[low] = a[lower]; a[lower] = pivot1;
-                a[end] = a[upper]; a[upper] = pivot2;
-
+                * Use the first and fifth of the five sorted elements as
+                * the pivots. These values are inexpensive approximation
+                * of tertiles. Note, that pivot1 < pivot2.
+                */
+                pivotIndices = new int[] {e1, e5};
+                Arrays.arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                lower = pivotIndices[0];
+                upper = pivotIndices[1];
                 /*
                  * Sort non-left parts recursively (possibly in parallel),
                  * excluding known pivots.
@@ -3463,73 +3661,185 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * Use the third of the five sorted elements as the pivot.
                  * This value is inexpensive approximation of the median.
                  */
-                double pivot = a[e3];
+                pivotIndices = new int[] {e3, e3};
+                Arrays.arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                lower = pivotIndices[0];
+                upper = pivotIndices[1];
 
                 /*
-                 * The first element to be sorted is moved to the
-                 * location formerly occupied by the pivot. After
-                 * completion of partitioning the pivot is swapped
-                 * back into its final position, and excluded from
-                 * the next subsequent sorting.
+                 * Sort the right part (possibly in parallel), excluding
+                 * known pivot. All elements from the central part are
+                 * equal and therefore already sorted.
                  */
-                a[e3] = a[lower];
+                if (size > MIN_PARALLEL_SORT_SIZE && sorter != null) {
+                    sorter.forkSorter(bits | 1, upper, high);
+                } else {
+                    sort(sorter, a, bits | 1, upper, high);
+                }
+            }
+            high = lower; // Iterate along the left part
+        }
+    }
 
-                /*
-                 * Traditional 3-way (Dutch National Flag) partitioning
-                 *
-                 *   left part                 central part    right part
-                 * +------------------------------------------------------+
-                 * |   < pivot   |     ?     |   == pivot   |   > pivot   |
-                 * +------------------------------------------------------+
-                 *              ^           ^                ^
-                 *              |           |                |
-                 *            lower         k              upper
-                 *
-                 * Invariants:
-                 *
-                 *   all in (low, lower] < pivot
-                 *   all in (k, upper)  == pivot
-                 *   all in [upper, end] > pivot
-                 *
-                 * Pointer k is the last index of ?-part
-                 */
-                for (int k = ++upper; --k > lower; ) {
-                    double ak = a[k];
+    /**
+     * Partitions the specified range of the array using the two pivots specified.
+     *
+     * @param a the array to be partitioned
+     * @param low the index of the first element, inclusive, for partitioning
+     * @param high the index of the last element, exclusive, for partitioning
+     * @param pivotIndices an array containing the indices of the two pivots to be used.
+     * After partitioning, this array is updated with the final indices of the two pivots.
+     *
+     */
+    private static void partitionDualPivot(double[] a, int low, int high, int[] pivotIndices) {
+        int end = high - 1;
+        int lower = low;
+        int upper = end;
 
-                    if (ak != pivot) {
-                        a[k] = pivot;
+        int e1 = pivotIndices[0];
+        int e5 = pivotIndices[1];
+        double pivot1 = a[e1];
+        double pivot2 = a[e5];
 
-                        if (ak < pivot) { // Move a[k] to the left side
-                            while (a[++lower] < pivot);
+        /*
+        * The first and the last elements to be sorted are moved
+        * to the locations formerly occupied by the pivots. When
+        * partitioning is completed, the pivots are swapped back
+        * into their final positions, and excluded from the next
+        * subsequent sorting.
+        */
+        a[e1] = a[lower];
+        a[e5] = a[upper];
 
-                            if (a[lower] > pivot) {
-                                a[--upper] = a[lower];
-                            }
-                            a[lower] = ak;
-                        } else { // ak > pivot - Move a[k] to the right side
-                            a[--upper] = ak;
+        /*
+        * Skip elements, which are less or greater than the pivots.
+        */
+                while (a[++lower] < pivot1);
+                while (a[--upper] > pivot2);
+
+        /*
+        * Backward 3-interval partitioning
+        *
+        *   left part                 central part          right part
+        * +------------------------------------------------------------+
+        * |  < pivot1  |   ?   |  pivot1 <= && <= pivot2  |  > pivot2  |
+        * +------------------------------------------------------------+
+        *             ^       ^                            ^
+        *             |       |                            |
+        *           lower     k                          upper
+        *
+        * Invariants:
+        *
+        *              all in (low, lower] < pivot1
+        *    pivot1 <= all in (k, upper)  <= pivot2
+        *              all in [upper, end) > pivot2
+        *
+        * Pointer k is the last index of ?-part
+        */
+        for (int unused = --lower, k = ++upper; --k > lower; ) {
+            double ak = a[k];
+
+            if (ak < pivot1) { // Move a[k] to the left side
+                while (lower < k) {
+                    if (a[++lower] >= pivot1) {
+                        if (a[lower] > pivot2) {
+                            a[k] = a[--upper];
+                            a[upper] = a[lower];
+                        } else {
+                            a[k] = a[lower];
                         }
+                        a[lower] = ak;
+                        break;
                     }
                 }
+            } else if (ak > pivot2) { // Move a[k] to the right side
+                a[k] = a[--upper];
+                a[upper] = ak;
+            }
+        }
 
-                /*
-                 * Swap the pivot into its final position.
-                 */
-                a[low] = a[lower]; a[lower] = pivot;
+        /*
+         * Swap the pivots into their final positions.
+         */
+        a[low] = a[lower]; a[lower] = pivot1;
+        a[end] = a[upper]; a[upper] = pivot2;
 
-                /*
-                 * Sort the right part (possibly in parallel), excluding
-                 * known pivot. All elements from the central part are
-                 * equal and therefore already sorted.
-                 */
-                if (size > MIN_PARALLEL_SORT_SIZE && sorter != null) {
-                    sorter.forkSorter(bits | 1, upper, high);
-                } else {
-                    sort(sorter, a, bits | 1, upper, high);
+        pivotIndices[0] = lower;
+        pivotIndices[1] = upper;
+    }
+
+
+
+    /**
+     * Partitions the specified range of the array using a single pivot specified.
+     *
+     * @param a the array to be partitioned
+     * @param low the index of the first element, inclusive, for partitioning
+     * @param high the index of the last element, exclusive, for partitioning
+     * @param pivotIndices an array whose first element is the index of the pivot to be used.
+     * After partitioning, it holds the final pivot index and the first index of the right part.
+     */
+    private static void partitionSinglePivot(double[] a, int low, int high, int[] pivotIndices) {
+        int end = high - 1;
+        int lower = low;
+        int upper = end;
+
+        int e3 = pivotIndices[0];
+        double pivot = a[e3];
+
+        /*
+        * The first element to be sorted is moved to the
+        * location formerly occupied by the pivot. After
+        * completion of partitioning the pivot is swapped
+        * back into its final position, and excluded from
+        * the next subsequent sorting.
+        */
+        a[e3] = a[lower];
+
+        /*
+        * Traditional 3-way (Dutch National Flag) partitioning
+        *
+        *   left part                 central part    right part
+        * +------------------------------------------------------+
+        * |   < pivot   |     ?     |   == pivot   |   > pivot   |
+        * +------------------------------------------------------+
+        *              ^           ^                ^
+        *              |           |                |
+        *            lower         k              upper
+        *
+        * Invariants:
+        *
+        *   all in (low, lower] < pivot
+        *   all in (k, upper)  == pivot
+        *   all in [upper, end] > pivot
+        *
+        * Pointer k is the last index of ?-part
+        */
+        for (int k = ++upper; --k > lower; ) {
+            double ak = a[k];
+
+            if (ak != pivot) {
+                a[k] = pivot;
+
+                if (ak < pivot) { // Move a[k] to the left side
+                    while (a[++lower] < pivot);
+
+                    if (a[lower] > pivot) {
+                        a[--upper] = a[lower];
+                    }
+                    a[lower] = ak;
+                } else { // ak > pivot - Move a[k] to the right side
+                    a[--upper] = ak;
                 }
             }
-            high = lower; // Iterate along the left part
         }
+
+        /*
+            * Swap the pivot into its final position.
+            */
+        a[low] = a[lower]; a[lower] = pivot;
+        pivotIndices[0] = lower;
+        pivotIndices[1] = upper;
     }
 
     /**
diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
index fb5b2f874ee88..be1634fa1f2ca 100644
--- a/test/micro/org/openjdk/bench/java/util/ArraysSort.java
+++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
@@ -47,12 +47,17 @@
 /**
  * Performance test of Arrays.sort() methods
  */
+@Fork(value=1, jvmArgsAppend={"-XX:CompileThreshold=1", "-XX:-TieredCompilation"})
 @BenchmarkMode(Mode.AverageTime)
 @OutputTimeUnit(TimeUnit.MICROSECONDS)
 @State(Scope.Thread)
-@Fork(value = 1)
+@Warmup(iterations = 3, time=5)
+@Measurement(iterations = 3, time=3)
 public class ArraysSort {
 
+    @Param({"10","25","50","75","100", "1000", "10000", "100000", "1000000"})
+    private int size;
+
     private int[] ints_unsorted;
     private long[] longs_unsorted;
     private float[] floats_unsorted;
@@ -64,7 +69,7 @@ public class ArraysSort {
     private double[] doubles_sorted;
 
 
-    public void initialize(int size) {
+    public void initialize() {
         Random rnd = new Random(42);
 
         ints_unsorted = new int[size];
@@ -72,6 +77,8 @@ public void initialize(int size) {
         floats_unsorted = new float[size];
         doubles_unsorted = new double[size];
 
+        int[] intSpecialCases = {Integer.MIN_VALUE, Integer.MAX_VALUE};
+        long[] longSpecialCases = {Long.MIN_VALUE, Long.MAX_VALUE};
         float[] floatSpecialCases = {+0.0f, -0.0f, Float.POSITIVE_INFINITY, Float.NEGATIVE_INFINITY, Float.NaN};
         double[] doubleSpecialCases = {+0.0, -0.0, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, Double.NaN};
 
@@ -79,16 +86,24 @@ public void initialize(int size) {
             ints_unsorted[i] = rnd.nextInt();
             longs_unsorted[i] = rnd.nextLong();
             if (i % 10 != 0) {
+                ints_unsorted[i] = rnd.nextInt();
+                longs_unsorted[i] = rnd.nextLong();
                 floats_unsorted[i] = rnd.nextFloat();
                 doubles_unsorted[i] = rnd.nextDouble();
             } else {
-                int rndIdx = rnd.nextInt(doubleSpecialCases.length);
-                floats_unsorted[i] = floatSpecialCases[rndIdx];
-                doubles_unsorted[i] = doubleSpecialCases[rndIdx];
+                ints_unsorted[i] = intSpecialCases[rnd.nextInt(intSpecialCases.length)];
+                longs_unsorted[i] = longSpecialCases[rnd.nextInt(longSpecialCases.length)];
+                floats_unsorted[i] = floatSpecialCases[rnd.nextInt(floatSpecialCases.length)];
+                doubles_unsorted[i] = doubleSpecialCases[rnd.nextInt(doubleSpecialCases.length)];
             }
         }
     }
 
+    @Setup
+    public void setup() throws UnsupportedEncodingException, ClassNotFoundException, NoSuchMethodException, Throwable {
+        initialize();
+    }
+
     @Setup(Level.Invocation)
     public void clear() {
         ints_sorted = ints_unsorted.clone();
@@ -121,52 +136,4 @@ public double[] doubleSort() throws Throwable {
         return doubles_sorted;
     }
 
-    @Warmup(iterations = 3, time=2)
-    @Measurement(iterations = 3, time=5)
-    public static class Small extends ArraysSort {
-        @Param({"10","25","50","75","100"})
-        private int size;
-
-        @Setup
-        public void setup() throws UnsupportedEncodingException, ClassNotFoundException, NoSuchMethodException, Throwable {
-            initialize(size);
-        }
-    }
-
-    @Warmup(iterations = 3, time=2)
-    @Measurement(iterations = 3, time=5)
-    public static class Medium extends ArraysSort {
-        @Param({"1000", "10000"})
-        private int size;
-
-        @Setup
-        public void setup() throws UnsupportedEncodingException, ClassNotFoundException, NoSuchMethodException, Throwable {
-            initialize(size);
-        }
-    }
-
-    @Warmup(iterations = 3, time=40)
-    @Measurement(iterations = 3, time=30)
-    public static class Large extends ArraysSort {
-        @Param({"50000", "100000"})
-        private int size;
-
-        @Setup
-        public void setup() throws UnsupportedEncodingException, ClassNotFoundException, NoSuchMethodException, Throwable {
-            initialize(size);
-        }
-    }
-
-    @Warmup(iterations = 3, time=120)
-    @Measurement(iterations = 3, time=30)
-    public static class VeryLarge extends ArraysSort {
-        @Param({"1000000"})
-        private int size;
-
-        @Setup
-        public void setup() throws UnsupportedEncodingException, ClassNotFoundException, NoSuchMethodException, Throwable {
-            initialize(size);
-        }
-    }
-
 }

From 8b80b80bddabe32865596520ec519700bb95710b Mon Sep 17 00:00:00 2001
From: Srinivas Vamsi Parasa <srinivas.vamsi.parasa@intel.com>
Date: Wed, 23 Aug 2023 05:51:43 -0700
Subject: [PATCH 21/40] Remove <iostream> include from avx512-common-qsort.h

---
 src/java.base/linux/native/libx86_64/avx512-common-qsort.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/java.base/linux/native/libx86_64/avx512-common-qsort.h b/src/java.base/linux/native/libx86_64/avx512-common-qsort.h
index b1a53a054692f..2a3f608a6f4f8 100644
--- a/src/java.base/linux/native/libx86_64/avx512-common-qsort.h
+++ b/src/java.base/linux/native/libx86_64/avx512-common-qsort.h
@@ -26,7 +26,6 @@
  */
 
 // This implementation is based on x86-simd-sort(https://github.com/intel/x86-simd-sort)
-#include <iostream>
 #ifndef AVX512_QSORT_COMMON
 #define AVX512_QSORT_COMMON
 

From 96cdd190e5cc8c7cfff98bf3f46d0180ecbeb0e2 Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Wed, 23 Aug 2023 15:57:22 -0700
Subject: [PATCH 22/40] Update copyright for DPQS.java; replace avx512 pivot
 calculation with scalar version

---
 .../native/libx86_64/avx512-32bit-qsort.hpp   | 24 +------------
 .../native/libx86_64/avx512-64bit-common.h    | 14 --------
 .../native/libx86_64/avx512-64bit-qsort.hpp   |  2 +-
 .../native/libx86_64/avx512-common-qsort.h    | 27 +++++---------
 .../classes/java/util/DualPivotQuicksort.java | 35 ++++++++++++++-----
 5 files changed, 37 insertions(+), 65 deletions(-)

diff --git a/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp
index bc1258debd389..7abc3a5454266 100644
--- a/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp
+++ b/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp
@@ -392,28 +392,6 @@ X86_SIMD_SORT_INLINE void sort_128_32bit(type_t *arr, int32_t N) {
     vtype::mask_storeu(arr + 112, load_mask4, zmm[7]);
 }
 
-template <typename vtype, typename type_t>
-X86_SIMD_SORT_INLINE type_t get_pivot_32bit(type_t *arr, const int64_t left,
-                                            const int64_t right) {
-    // median of 16
-    int64_t size = (right - left) / 16;
-    using zmm_t = typename vtype::zmm_t;
-    using ymm_t = typename vtype::ymm_t;
-    __m512i rand_index1 = _mm512_set_epi64(
-        left + size, left + 2 * size, left + 3 * size, left + 4 * size,
-        left + 5 * size, left + 6 * size, left + 7 * size, left + 8 * size);
-    __m512i rand_index2 = _mm512_set_epi64(
-        left + 9 * size, left + 10 * size, left + 11 * size, left + 12 * size,
-        left + 13 * size, left + 14 * size, left + 15 * size, left + 16 * size);
-    ymm_t rand_vec1 =
-        vtype::template i64gather<sizeof(type_t)>(rand_index1, arr);
-    ymm_t rand_vec2 =
-        vtype::template i64gather<sizeof(type_t)>(rand_index2, arr);
-    zmm_t rand_vec = vtype::merge(rand_vec1, rand_vec2);
-    zmm_t sort = sort_zmm_32bit<vtype>(rand_vec);
-    // pivot will never be a nan, since there are no nan's!
-    return ((type_t *)&sort)[8];
-}
 
 template <typename vtype, typename type_t>
 static void qsort_32bit_(type_t *arr, int64_t left, int64_t right,
@@ -433,7 +411,7 @@ static void qsort_32bit_(type_t *arr, int64_t left, int64_t right,
         return;
     }
 
-    type_t pivot = get_pivot_32bit<vtype>(arr, left, right);
+    type_t pivot = get_pivot_scalar<type_t>(arr, left, right);
     type_t smallest = vtype::type_max();
     type_t biggest = vtype::type_min();
     int64_t pivot_index = partition_avx512_unrolled<vtype, 2>(
diff --git a/src/java.base/linux/native/libx86_64/avx512-64bit-common.h b/src/java.base/linux/native/libx86_64/avx512-64bit-common.h
index 2c3bfd97e1960..bb7553229eacb 100644
--- a/src/java.base/linux/native/libx86_64/avx512-64bit-common.h
+++ b/src/java.base/linux/native/libx86_64/avx512-64bit-common.h
@@ -210,19 +210,5 @@ X86_SIMD_SORT_INLINE zmm_t sort_zmm_64bit(zmm_t zmm) {
     return zmm;
 }
 
-template <typename vtype, typename type_t>
-X86_SIMD_SORT_INLINE type_t get_pivot_64bit(type_t *arr, const int64_t left,
-                                            const int64_t right) {
-    // median of 8
-    int64_t size = (right - left) / 8;
-    using zmm_t = typename vtype::zmm_t;
-    __m512i rand_index = _mm512_set_epi64(
-        left + size, left + 2 * size, left + 3 * size, left + 4 * size,
-        left + 5 * size, left + 6 * size, left + 7 * size, left + 8 * size);
-    zmm_t rand_vec = vtype::template i64gather<sizeof(type_t)>(rand_index, arr);
-    // pivot will never be a nan, since there are no nan's!
-    zmm_t sort = sort_zmm_64bit<vtype>(rand_vec);
-    return ((type_t *)&sort)[4];
-}
 
 #endif
diff --git a/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp
index 61f618f657049..422f385d052e2 100644
--- a/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp
+++ b/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp
@@ -742,7 +742,7 @@ static void qsort_64bit_(type_t *arr, int64_t left, int64_t right,
         return;
     }
 
-    type_t pivot = get_pivot_64bit<vtype>(arr, left, right);
+    type_t pivot = get_pivot_scalar<type_t>(arr, left, right);
     type_t smallest = vtype::type_max();
     type_t biggest = vtype::type_min();
     int64_t pivot_index = partition_avx512_unrolled<vtype, 8>(
diff --git a/src/java.base/linux/native/libx86_64/avx512-common-qsort.h b/src/java.base/linux/native/libx86_64/avx512-common-qsort.h
index 2a3f608a6f4f8..ae6af54f572fa 100644
--- a/src/java.base/linux/native/libx86_64/avx512-common-qsort.h
+++ b/src/java.base/linux/native/libx86_64/avx512-common-qsort.h
@@ -132,25 +132,16 @@ bool is_a_nan(T elem) {
     return std::isnan(elem);
 }
 
-/*
- * Sort all the NAN's to end of the array and return the index of the last elem
- * in the array which is not a nan
- */
 template <typename T>
-int64_t move_nans_to_end_of_array(T *arr, int64_t arrsize) {
-    int64_t jj = arrsize - 1;
-    int64_t ii = 0;
-    int64_t count = 0;
-    while (ii <= jj) {
-        if (is_a_nan(arr[ii])) {
-            std::swap(arr[ii], arr[jj]);
-            jj -= 1;
-            count++;
-        } else {
-            ii += 1;
-        }
-    }
-    return arrsize - count - 1;
+X86_SIMD_SORT_INLINE T get_pivot_scalar(T *arr, const int64_t left, const int64_t right) {
+    // median of 8 equally spaced elements
+    int64_t NUM_ELEMENTS = 8;
+    int64_t MID = NUM_ELEMENTS / 2;
+    int64_t size = (right - left) / NUM_ELEMENTS;
+    T temp[NUM_ELEMENTS];
+    for (int64_t i = 0; i < NUM_ELEMENTS; i++) temp[i] = arr[left + (i * size)];
+    std::sort(temp, temp + NUM_ELEMENTS);
+    return temp[MID];
 }
 
 template <typename vtype, typename T = typename vtype::type_t>
diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
index 7a7a906176ffd..f02fec26f39a7 100644
--- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java
+++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2009, 2021, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2009, 2023, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -123,6 +123,11 @@ private DualPivotQuicksort() {}
      */
     private static final int MAX_RECURSION_DEPTH = 64 * DELTA;
 
+    /**
+     * Min array size to call fast small array sort.
+     */
+    private static final int MIN_FAST_SMALL_ARRAY_SORT_SIZE = 16;
+
     /**
      * Calculates the double depth of parallel merging.
      * Depth is negative, if tasks split before sorting.
@@ -282,7 +287,9 @@ static void sort(Sorter sorter, int[] a, int bits, int low, int high) {
              * Run mixed insertion sort on small non-leftmost parts.
              */
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
-                Arrays.arraySort(int.class, a, baseOffset, low, high, high - 3 * ((size >> 5) << 3));
+                int last  = high - 3 * ((size >> 5) << 3);
+                if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high);
+                else Arrays.arraySort(int.class, a, baseOffset, low, high, last);
                 return;
             }
 
@@ -290,7 +297,8 @@ static void sort(Sorter sorter, int[] a, int bits, int low, int high) {
              * Invoke insertion sort on small leftmost part.
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
-                Arrays.arraySort(int.class, a, baseOffset, low, high, -1);
+                if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high);
+                else Arrays.arraySort(int.class, a, baseOffset, low, high, -1);
                 return;
             }
 
@@ -1092,7 +1100,9 @@ static void sort(Sorter sorter, long[] a, int bits, int low, int high) {
              * Run mixed insertion sort on small non-leftmost parts.
              */
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
-                Arrays.arraySort(long.class, a, baseOffset, low, high, high - 3 * ((size >> 5) << 3));
+                int last  = high - 3 * ((size >> 5) << 3);
+                if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high);
+                else Arrays.arraySort(long.class, a, baseOffset, low, high, last);
                 return;
             }
 
@@ -1100,7 +1110,8 @@ static void sort(Sorter sorter, long[] a, int bits, int low, int high) {
              * Invoke insertion sort on small leftmost part.
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
-                Arrays.arraySort(long.class, a, baseOffset, low, high, -1);
+                if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high);
+                else Arrays.arraySort(long.class, a, baseOffset, low, high, -1);
                 return;
             }
 
@@ -2685,7 +2696,9 @@ static void sort(Sorter sorter, float[] a, int bits, int low, int high) {
              * Run mixed insertion sort on small non-leftmost parts.
              */
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
-                Arrays.arraySort(float.class, a, baseOffset, low, high, high - 3 * ((size >> 5) << 3));
+                int last  = high - 3 * ((size >> 5) << 3);
+                if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high);
+                else Arrays.arraySort(float.class, a, baseOffset, low, high, last);
                 return;
             }
 
@@ -2693,7 +2706,8 @@ static void sort(Sorter sorter, float[] a, int bits, int low, int high) {
              * Invoke insertion sort on small leftmost part.
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
-                Arrays.arraySort(float.class, a, baseOffset, low, high, -1);
+                if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high);
+                else Arrays.arraySort(float.class, a, baseOffset, low, high, -1);
                 return;
             }
 
@@ -3543,7 +3557,9 @@ static void sort(Sorter sorter, double[] a, int bits, int low, int high) {
              * Run mixed insertion sort on small non-leftmost parts.
              */
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
-                Arrays.arraySort(double.class, a, baseOffset, low, high, high - 3 * ((size >> 5) << 3));
+                int last  = high - 3 * ((size >> 5) << 3);
+                if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high);
+                else Arrays.arraySort(double.class, a, baseOffset, low, high, last);
                 return;
             }
 
@@ -3551,7 +3567,8 @@ static void sort(Sorter sorter, double[] a, int bits, int low, int high) {
              * Invoke insertion sort on small leftmost part.
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
-                Arrays.arraySort(double.class, a, baseOffset, low, high, -1);
+                if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high);
+                else Arrays.arraySort(double.class, a, baseOffset, low, high, -1);
                 return;
             }
 

From 5173849175e8c5cfe51c28a2333e470bc83efaa3 Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Wed, 23 Aug 2023 16:24:19 -0700
Subject: [PATCH 23/40] Add parallelSort benchmarks to ArraysSort

---
 .../openjdk/bench/java/util/ArraysSort.java   | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
index be1634fa1f2ca..059a3626a0d3d 100644
--- a/test/micro/org/openjdk/bench/java/util/ArraysSort.java
+++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
@@ -118,22 +118,46 @@ public int[] intSort() throws Throwable {
         return ints_sorted;
     }
 
+    @Benchmark
+    public int[] intParallelSort() throws Throwable {
+        Arrays.parallelSort(ints_sorted);
+        return ints_sorted;
+    }
+
     @Benchmark
     public long[] longSort() throws Throwable {
         Arrays.sort(longs_sorted);
         return longs_sorted;
     }
 
+    @Benchmark
+    public long[] longParallelSort() throws Throwable {
+        Arrays.parallelSort(longs_sorted);
+        return longs_sorted;
+    }
+
     @Benchmark
     public float[] floatSort() throws Throwable {
         Arrays.sort(floats_sorted);
         return floats_sorted;
     }
 
+    @Benchmark
+    public float[] floatParallelSort() throws Throwable {
+        Arrays.parallelSort(floats_sorted);
+        return floats_sorted;
+    }
+
     @Benchmark
     public double[] doubleSort() throws Throwable {
         Arrays.sort(doubles_sorted);
         return doubles_sorted;
     }
 
+    @Benchmark
+    public double[] doubleParallelSort() throws Throwable {
+        Arrays.parallelSort(doubles_sorted);
+        return doubles_sorted;
+    }
+
 }

From df17b3e24e2995eece6405eaaa79b9089e1e469e Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Thu, 24 Aug 2023 16:25:07 -0700
Subject: [PATCH 24/40] Fix unused assignment in DPQS.java and space in
 Arrays.java

---
 .../share/classes/java/util/Arrays.java          |  2 +-
 .../classes/java/util/DualPivotQuicksort.java    | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/java.base/share/classes/java/util/Arrays.java b/src/java.base/share/classes/java/util/Arrays.java
index 27f4d38f2e4e2..3547c5eece569 100644
--- a/src/java.base/share/classes/java/util/Arrays.java
+++ b/src/java.base/share/classes/java/util/Arrays.java
@@ -395,7 +395,7 @@ public static void sort(float[] a, int fromIndex, int toIndex) {
      * @param a the array to be sorted
      */
     public static void sort(double[] a) {
-         DualPivotQuicksort.sort(a, 0, 0, a.length);
+        DualPivotQuicksort.sort(a, 0, 0, a.length);
     }
 
     /**
diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
index f02fec26f39a7..0c5f9b48e1e68 100644
--- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java
+++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
@@ -372,8 +372,8 @@ && tryMergeRuns(sorter, a, low, size)) {
             }
 
             // Pointers
-            int lower = low; // The index of the last element of the left part
-            int upper = end; // The index of the first element of the right part
+            int lower; // The index of the last element of the left part
+            int upper; // The index of the first element of the right part
 
             /*
              * Partitioning with 2 pivots in case of different elements.
@@ -1185,8 +1185,8 @@ && tryMergeRuns(sorter, a, low, size)) {
             }
 
             // Pointers
-            int lower = low; // The index of the last element of the left part
-            int upper = end; // The index of the first element of the right part
+            int lower; // The index of the last element of the left part
+            int upper; // The index of the first element of the right part
 
             /*
              * Partitioning with 2 pivots in case of different elements.
@@ -2781,8 +2781,8 @@ && tryMergeRuns(sorter, a, low, size)) {
             }
 
             // Pointers
-            int lower = low; // The index of the last element of the left part
-            int upper = end; // The index of the first element of the right part
+            int lower; // The index of the last element of the left part
+            int upper; // The index of the first element of the right part
 
             /*
              * Partitioning with 2 pivots in case of different elements.
@@ -3642,8 +3642,8 @@ && tryMergeRuns(sorter, a, low, size)) {
             }
 
             // Pointers
-            int lower = low; // The index of the last element of the left part
-            int upper = end; // The index of the first element of the right part
+            int lower; // The index of the last element of the left part
+            int upper; // The index of the first element of the right part
 
             /*
              * Partitioning with 2 pivots in case of different elements.

From f3b5fcf5df8c68458e59ddf7f0bbd33ed255f688 Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Thu, 24 Aug 2023 18:43:04 -0700
Subject: [PATCH 25/40] Move sort and partition intrinsics from Arrays.java to
 DPQS.java

---
 src/hotspot/share/classfile/vmIntrinsics.hpp  | 10 +-
 src/hotspot/share/classfile/vmSymbols.hpp     |  1 +
 .../share/classes/java/util/Arrays.java       | 46 ---------
 .../classes/java/util/DualPivotQuicksort.java | 96 ++++++++++++-------
 4 files changed, 69 insertions(+), 84 deletions(-)

diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp
index 9fce2446aea19..d5936373202ad 100644
--- a/src/hotspot/share/classfile/vmIntrinsics.hpp
+++ b/src/hotspot/share/classfile/vmIntrinsics.hpp
@@ -341,13 +341,13 @@ class methodHandle;
    do_name(     copyOf_name,                                     "copyOf")                                              \
    do_signature(copyOf_signature,             "([Ljava/lang/Object;ILjava/lang/Class;)[Ljava/lang/Object;")             \
                                                                                                                         \
-  do_intrinsic(_arraySort,                java_util_Arrays,       arraySort_name, arraySort_signature,           F_S)   \
+  do_intrinsic(_arraySort,                java_util_DualPivotQuicksort,       arraySort_name, arraySort_signature, F_S) \
    do_name(     arraySort_name,                                  "arraySort")                                           \
-   do_signature(arraySort_signature,                             "(Ljava/lang/Class;Ljava/lang/Object;JIII)V")          \
+   do_signature(arraySort_signature,          "(Ljava/lang/Class;Ljava/lang/Object;JIII)V")                             \
                                                                                                                         \
-  do_intrinsic(_arrayPartition, java_util_Arrays, arrayPartition_name, arrayPartition_signature, F_S)                   \
-   do_name(arrayPartition_name, "arrayPartition")                                                                       \
-  do_signature(arrayPartition_signature, "(Ljava/lang/Class;Ljava/lang/Object;JII[IJZ)V")                               \
+  do_intrinsic(_arrayPartition, java_util_DualPivotQuicksort, arrayPartition_name, arrayPartition_signature, F_S)       \
+   do_name(     arrayPartition_name,                             "arrayPartition")                                      \
+  do_signature(arrayPartition_signature,      "(Ljava/lang/Class;Ljava/lang/Object;JII[IJZ)V")                          \
                                                                                                                         \
                                                                                                                         \
   do_intrinsic(_copyOfRange,              java_util_Arrays,       copyOfRange_name, copyOfRange_signature,       F_S)   \
diff --git a/src/hotspot/share/classfile/vmSymbols.hpp b/src/hotspot/share/classfile/vmSymbols.hpp
index 1b406550ef78c..38bfa7c978844 100644
--- a/src/hotspot/share/classfile/vmSymbols.hpp
+++ b/src/hotspot/share/classfile/vmSymbols.hpp
@@ -143,6 +143,7 @@
   template(java_util_Vector,                          "java/util/Vector")                         \
   template(java_util_AbstractList,                    "java/util/AbstractList")                   \
   template(java_util_Hashtable,                       "java/util/Hashtable")                      \
+  template(java_util_DualPivotQuicksort,              "java/util/DualPivotQuicksort")             \
   template(java_lang_Compiler,                        "java/lang/Compiler")                       \
   template(jdk_internal_misc_Signal,                  "jdk/internal/misc/Signal")                 \
   template(jdk_internal_util_Preconditions,           "jdk/internal/util/Preconditions")          \
diff --git a/src/java.base/share/classes/java/util/Arrays.java b/src/java.base/share/classes/java/util/Arrays.java
index 3547c5eece569..419adef0239ff 100644
--- a/src/java.base/share/classes/java/util/Arrays.java
+++ b/src/java.base/share/classes/java/util/Arrays.java
@@ -47,7 +47,6 @@
 import java.util.stream.LongStream;
 import java.util.stream.Stream;
 import java.util.stream.StreamSupport;
-import jdk.internal.misc.Unsafe;
 
 /**
  * This class contains various methods for manipulating arrays (such as
@@ -79,51 +78,6 @@ public final class Arrays {
     // Suppresses default constructor, ensuring non-instantiability.
     private Arrays() {}
 
-    /**
-     * Sorts the specified array into ascending numerical order.
-     * While the intrinsic is free to choose its own sorting algorithm, the
-     * fallback implementation uses either mixed insertion sort or simple
-     * insertion sort.
-     *
-     * @param elemType the class of the elements of the array to be sorted
-     * @param array the array to be sorted
-     * @param offset the relative offset, in bytes, from the base address of
-     * the array to sort, otherwise if the array is {@code null},an absolute
-     * address pointing to the first element to sort from.
-     * @param fromIndex the index of the first element, inclusive, to be sorted
-     * @param toIndex the index of the last element, exclusive, to be sorted
-     * @param end the index of the last element for simple insertion sort (in
-     * the case of mixed insertion sort). In the fallback implementation,
-     * if end < 0, we use insertion sort else we use mixed insertion sort.
-     */
-    @IntrinsicCandidate
-    static void arraySort(Class<?> elemType, Object array, long offset, int fromIndex, int toIndex, int end) {
-        DualPivotQuicksort.smallArraySort(array, fromIndex, toIndex, end);
-    }
-
-    /**
-     * Partitions the specified array based on the pivot(s) provided.
-     *
-     * @param elemType the class of the array to be sorted
-     * @param array the array to be sorted
-     * @param offset the relative offset, in bytes, from the base address of
-     * the array to partition, otherwise if the array is {@code null},an absolute
-     * address pointing to the first element to partition from.
-     * @param fromIndex the index of the first element, inclusive, to be sorted
-     * @param toIndex the index of the last element, exclusive, to be sorted
-     * @param pivotIndices the array containing the indices of the pivots. After
-     * partitioning, this array is updated with the new indices of the pivots.
-     * @param pivot_offset the offset in bytes pointing to the base address of
-     * the array used to store the indices of the pivots.
-     * @param isDualPivot a boolean value to choose between dual pivot
-     * partitioning and single pivot partitioning
-     */
-    @IntrinsicCandidate
-    static void arrayPartition(Class<?> elemType, Object array, long offset, int fromIndex, int toIndex, int[] pivotIndices, long pivot_offset, boolean isDualPivot) {
-        if (isDualPivot) DualPivotQuicksort.partitionDualPivot(array, fromIndex, toIndex, pivotIndices);
-        else DualPivotQuicksort.partitionSinglePivot(array, fromIndex, toIndex, pivotIndices);
-    }
-
     /*
      * Sorting methods. Note that all public "sort" methods take the
      * same form: performing argument checks if necessary, and then
diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
index 0c5f9b48e1e68..deb2850d30eae 100644
--- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java
+++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
@@ -29,6 +29,7 @@
 import java.util.concurrent.RecursiveTask;
 import java.util.Arrays;
 import jdk.internal.misc.Unsafe;
+import jdk.internal.vm.annotation.IntrinsicCandidate;
 
 
 /**
@@ -128,6 +129,52 @@ private DualPivotQuicksort() {}
      */
     private static final int MIN_FAST_SMALL_ARRAY_SORT_SIZE = 16;
 
+    /**
+     * Sorts the specified array into ascending numerical order.
+     * While the intrinsic is free to choose its own sorting algorithm, the
+     * fallback implementation uses either mixed insertion sort or simple
+     * insertion sort.
+     *
+     * @param elemType the class of the elements of the array to be sorted
+     * @param array the array to be sorted
+     * @param offset the relative offset, in bytes, from the base address of
+     * the array to sort, otherwise if the array is {@code null},an absolute
+     * address pointing to the first element to sort from.
+     * @param low the index of the first element, inclusive, to be sorted
+     * @param high the index of the last element, exclusive, to be sorted
+     * @param end the index of the last element for simple insertion sort (in
+     * the case of mixed insertion sort). In the fallback implementation,
+     * if end < 0, we use insertion sort else we use mixed insertion sort.
+     */
+    @IntrinsicCandidate
+    static void arraySort(Class<?> elemType, Object array, long offset, int low, int high, int end) {
+       if (end < 0) insertionSort(array, low, high);
+       else mixedInsertionSort(array, low, end, high);
+    }
+
+    /**
+     * Partitions the specified array based on the pivot(s) provided.
+     *
+     * @param elemType the class of the array to be sorted
+     * @param array the array to be sorted
+     * @param offset the relative offset, in bytes, from the base address of
+     * the array to partition, otherwise if the array is {@code null},an absolute
+     * address pointing to the first element to partition from.
+     * @param low the index of the first element, inclusive, to be sorted
+     * @param high the index of the last element, exclusive, to be sorted
+     * @param pivotIndices the array containing the indices of the pivots. After
+     * partitioning, this array is updated with the new indices of the pivots.
+     * @param pivot_offset the offset in bytes pointing to the base address of
+     * the array used to store the indices of the pivots.
+     * @param isDualPivot a boolean value to choose between dual pivot
+     * partitioning and single pivot partitioning
+     */
+    @IntrinsicCandidate
+    static void arrayPartition(Class<?> elemType, Object array, long offset, int low, int high, int[] pivotIndices, long pivot_offset, boolean isDualPivot) {
+        if (isDualPivot) partitionDualPivot(array, low, high, pivotIndices);
+        else partitionSinglePivot(array, low, high, pivotIndices);
+    }
+
     /**
      * Calculates the double depth of parallel merging.
      * Depth is negative, if tasks split before sorting.
@@ -145,23 +192,6 @@ private static int getDepth(int parallelism, int size) {
         return depth;
     }
 
-    /**
-     * Sorts the specified range of the array using either insertion sort
-     * or mixed insertion sort depending on the value of end. if end < 0,
-     * we use insertion sort else we use mixed insertion sort.
-     *
-     * @param array the array to be sorted
-     * @param low the index of the first element, inclusive, to be sorted
-     * @param high the index of the last element, exclusive, to be sorted
-     * @param end the index of the last element for simple insertion sort (in
-     * the case of mixed insertion sort). If end < 0, we use insertion sort
-     * else we use mixed insertion sort.
-     */
-    static void smallArraySort(Object array, int low, int high, int end) {
-       if (end < 0) insertionSort(array, low, high);
-       else mixedInsertionSort(array, low, end, high);
-    }
-
     /**
      * Sorts the specified range of the array using insertion sort
      *
@@ -289,7 +319,7 @@ static void sort(Sorter sorter, int[] a, int bits, int low, int high) {
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
                 int last  = high - 3 * ((size >> 5) << 3);
                 if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high);
-                else Arrays.arraySort(int.class, a, baseOffset, low, high, last);
+                else arraySort(int.class, a, baseOffset, low, high, last);
                 return;
             }
 
@@ -298,7 +328,7 @@ static void sort(Sorter sorter, int[] a, int bits, int low, int high) {
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
                 if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high);
-                else Arrays.arraySort(int.class, a, baseOffset, low, high, -1);
+                else arraySort(int.class, a, baseOffset, low, high, -1);
                 return;
             }
 
@@ -386,7 +416,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * of tertiles. Note, that pivot1 < pivot2.
                  */
                 pivotIndices = new int[] {e1, e5};
-                Arrays.arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
 
@@ -411,7 +441,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * This value is inexpensive approximation of the median.
                  */
                 pivotIndices = new int[] {e3, e3};
-                Arrays.arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -1102,7 +1132,7 @@ static void sort(Sorter sorter, long[] a, int bits, int low, int high) {
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
                 int last  = high - 3 * ((size >> 5) << 3);
                 if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high);
-                else Arrays.arraySort(long.class, a, baseOffset, low, high, last);
+                else arraySort(long.class, a, baseOffset, low, high, last);
                 return;
             }
 
@@ -1111,7 +1141,7 @@ static void sort(Sorter sorter, long[] a, int bits, int low, int high) {
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
                 if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high);
-                else Arrays.arraySort(long.class, a, baseOffset, low, high, -1);
+                else arraySort(long.class, a, baseOffset, low, high, -1);
                 return;
             }
 
@@ -1200,7 +1230,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * of tertiles. Note, that pivot1 < pivot2.
                  */
                 pivotIndices = new int[] {e1, e5};
-                Arrays.arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -1222,7 +1252,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * This value is inexpensive approximation of the median.
                  */
                 pivotIndices = new int[] {e3, e3};
-                Arrays.arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -2698,7 +2728,7 @@ static void sort(Sorter sorter, float[] a, int bits, int low, int high) {
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
                 int last  = high - 3 * ((size >> 5) << 3);
                 if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high);
-                else Arrays.arraySort(float.class, a, baseOffset, low, high, last);
+                else arraySort(float.class, a, baseOffset, low, high, last);
                 return;
             }
 
@@ -2707,7 +2737,7 @@ static void sort(Sorter sorter, float[] a, int bits, int low, int high) {
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
                 if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high);
-                else Arrays.arraySort(float.class, a, baseOffset, low, high, -1);
+                else arraySort(float.class, a, baseOffset, low, high, -1);
                 return;
             }
 
@@ -2796,7 +2826,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * of tertiles. Note, that pivot1 < pivot2.
                  */
                 pivotIndices = new int[] {e1, e5};
-                Arrays.arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -2818,7 +2848,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * This value is inexpensive approximation of the median.
                  */
                 pivotIndices = new int[] {e3, e3};
-                Arrays.arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -3559,7 +3589,7 @@ static void sort(Sorter sorter, double[] a, int bits, int low, int high) {
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
                 int last  = high - 3 * ((size >> 5) << 3);
                 if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high);
-                else Arrays.arraySort(double.class, a, baseOffset, low, high, last);
+                else arraySort(double.class, a, baseOffset, low, high, last);
                 return;
             }
 
@@ -3568,7 +3598,7 @@ static void sort(Sorter sorter, double[] a, int bits, int low, int high) {
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
                 if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high);
-                else Arrays.arraySort(double.class, a, baseOffset, low, high, -1);
+                else arraySort(double.class, a, baseOffset, low, high, -1);
                 return;
             }
 
@@ -3657,7 +3687,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                 * of tertiles. Note, that pivot1 < pivot2.
                 */
                 pivotIndices = new int[] {e1, e5};
-                Arrays.arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -3679,7 +3709,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * This value is inexpensive approximation of the median.
                  */
                 pivotIndices = new int[] {e3, e3};
-                Arrays.arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
 

From e44f11a6b69133b8061bb6fda08b65dfe421bd88 Mon Sep 17 00:00:00 2001
From: Srinivas Vamsi Parasa <srinivas.vamsi.parasa@intel.com>
Date: Thu, 24 Aug 2023 18:52:02 -0700
Subject: [PATCH 26/40] Remove unnecessary import in Arrays.java

---
 src/java.base/share/classes/java/util/Arrays.java | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/java.base/share/classes/java/util/Arrays.java b/src/java.base/share/classes/java/util/Arrays.java
index 419adef0239ff..85c514da3083c 100644
--- a/src/java.base/share/classes/java/util/Arrays.java
+++ b/src/java.base/share/classes/java/util/Arrays.java
@@ -30,7 +30,6 @@
 
 import java.io.Serializable;
 import java.lang.reflect.Array;
-import java.util.Arrays.NaturalOrder;
 import java.util.concurrent.ForkJoinPool;
 import java.util.function.BinaryOperator;
 import java.util.function.Consumer;

From 9642d852cce5a9cf8270b850c124ef38fc158c6d Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Mon, 28 Aug 2023 14:15:36 -0700
Subject: [PATCH 27/40] Clean up parameters passed to arrayPartition; update
 the check to load library

---
 src/hotspot/cpu/x86/stubGenerator_x86_64.cpp  | 21 ++++++------
 src/hotspot/share/classfile/vmIntrinsics.hpp  |  2 +-
 src/hotspot/share/opto/library_call.cpp       | 10 +++---
 src/hotspot/share/opto/runtime.cpp            |  2 +-
 .../native/libx86_64/avx512-common-qsort.h    |  6 ----
 .../classes/java/util/DualPivotQuicksort.java | 33 ++++++++++---------
 .../openjdk/bench/java/util/ArraysSort.java   |  2 +-
 7 files changed, 37 insertions(+), 39 deletions(-)

diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
index 11936ac764126..640d88f270fe9 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -4130,18 +4130,19 @@ void StubGenerator::generate_compiler_stubs() {
       = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
   }
 
-  // Get addresses for avx512 sort and partition routines
-  void *libx86_64 = nullptr;
-  char ebuf_x86_64[1024];
-  char dll_name_avx512[JVM_MAXPATHLEN];
-  if (os::dll_locate_lib(dll_name_avx512, sizeof(dll_name_avx512), Arguments::get_dll_dir(), "x86_64")) {
-    libx86_64 = os::dll_load(dll_name_avx512, ebuf_x86_64, sizeof ebuf_x86_64);
-  }
-  if (libx86_64 != nullptr) {
-    log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(libx86_64));
-
+  // Load x86_64 library on supported hardware to enable avx512 sort and partition intrinsics
     if (UseAVX > 2 && VM_Version::supports_avx512dq()) {
 
+      void *libx86_64 = nullptr;
+      char ebuf_x86_64[1024];
+      char dll_name_avx512[JVM_MAXPATHLEN];
+      if (os::dll_locate_lib(dll_name_avx512, sizeof(dll_name_avx512), Arguments::get_dll_dir(), "x86_64")) {
+        libx86_64 = os::dll_load(dll_name_avx512, ebuf_x86_64, sizeof ebuf_x86_64);
+      }
+    // Get addresses for avx512 sort and partition routines
+    if (libx86_64 != nullptr) {
+      log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(libx86_64));
+
       snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_int");
       StubRoutines::_arraysort_int = (address)os::dll_lookup(libx86_64, ebuf_x86_64);
 
diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp
index d5936373202ad..8c5ca344ae304 100644
--- a/src/hotspot/share/classfile/vmIntrinsics.hpp
+++ b/src/hotspot/share/classfile/vmIntrinsics.hpp
@@ -347,7 +347,7 @@ class methodHandle;
                                                                                                                         \
   do_intrinsic(_arrayPartition, java_util_DualPivotQuicksort, arrayPartition_name, arrayPartition_signature, F_S)       \
    do_name(     arrayPartition_name,                             "arrayPartition")                                      \
-  do_signature(arrayPartition_signature,      "(Ljava/lang/Class;Ljava/lang/Object;JII[IJZ)V")                          \
+  do_signature(arrayPartition_signature,      "(Ljava/lang/Class;Ljava/lang/Object;JII[IZ)V")                           \
                                                                                                                         \
                                                                                                                         \
   do_intrinsic(_copyOfRange,              java_util_Arrays,       copyOfRange_name, copyOfRange_signature,       F_S)   \
diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp
index 10a8734bc1a8a..b9a39ac61babf 100644
--- a/src/hotspot/share/opto/library_call.cpp
+++ b/src/hotspot/share/opto/library_call.cpp
@@ -5208,8 +5208,7 @@ bool LibraryCallKit::inline_array_partition() {
   Node* fromIndex       = argument(4);
   Node* toIndex         = argument(5);
   Node* pivot_indices   = argument(6);
-  Node* pivot_offset    = argument(7);
-  Node* isDualPivot     = argument(9);
+  Node* isDualPivot     = argument(7);
 
   const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr();
   ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type();
@@ -5221,11 +5220,14 @@ bool LibraryCallKit::inline_array_partition() {
   if (obj_t == nullptr || obj_t->elem() == Type::BOTTOM ) {
     return false; // failed input validation
   }
-
   Node* obj_adr = make_unsafe_address(obj, offset);
 
   pivot_indices = must_be_not_null(pivot_indices, true);
-  Node* pivot_indices_adr = make_unsafe_address(pivot_indices, pivot_offset); //this offset is not same as array offset
+  const TypeAryPtr* pivot_indices_type = pivot_indices->Value(&_gvn)->isa_aryptr();
+  if (pivot_indices_type == nullptr || pivot_indices_type->elem() == Type::BOTTOM ) {
+    return false; // failed input validation
+  }
+  Node* pivot_indices_adr = array_element_address(pivot_indices, intcon(0), T_INT);
 
   // Call the stub.
   make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_partition_Type(),
diff --git a/src/hotspot/share/opto/runtime.cpp b/src/hotspot/share/opto/runtime.cpp
index a0e383d95afd0..52dd29b8fa793 100644
--- a/src/hotspot/share/opto/runtime.cpp
+++ b/src/hotspot/share/opto/runtime.cpp
@@ -884,7 +884,7 @@ const TypeFunc* OptoRuntime::array_sort_Type() {
   int argcnt = num_args;
   const Type** fields = TypeTuple::fields(argcnt);
   int argp = TypeFunc::Parms;
-  fields[argp++] = TypePtr::NOTNULL;    // array(fromIndex)
+  fields[argp++] = TypePtr::NOTNULL;    // array
   fields[argp++] = TypeInt::INT;    // fromIndex
   fields[argp++] = TypeInt::INT;    // toIndex
   assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
diff --git a/src/java.base/linux/native/libx86_64/avx512-common-qsort.h b/src/java.base/linux/native/libx86_64/avx512-common-qsort.h
index ae6af54f572fa..c56990f921eae 100644
--- a/src/java.base/linux/native/libx86_64/avx512-common-qsort.h
+++ b/src/java.base/linux/native/libx86_64/avx512-common-qsort.h
@@ -115,12 +115,6 @@ template <typename type>
 struct ymm_vector;
 
 // Regular quicksort routines:
-template <typename T>
-void avx512_dual_pivot_partition(T *arr, int64_t low, int64_t high, int32_t *pivot_indices, bool isDualPivot);
-
-template <typename T>
-void avx512_single_pivot_partition(T *arr, int64_t low, int64_t high, int32_t *pivot_indices, bool isDualPivot);
-
 template <typename T>
 void avx512_qsort(T *arr, int64_t arrsize);
 
diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
index deb2850d30eae..b3959b1048e75 100644
--- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java
+++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
@@ -30,6 +30,7 @@
 import java.util.Arrays;
 import jdk.internal.misc.Unsafe;
 import jdk.internal.vm.annotation.IntrinsicCandidate;
+import jdk.internal.vm.annotation.ForceInline;
 
 
 /**
@@ -147,7 +148,8 @@ private DualPivotQuicksort() {}
      * if end < 0, we use insertion sort else we use mixed insertion sort.
      */
     @IntrinsicCandidate
-    static void arraySort(Class<?> elemType, Object array, long offset, int low, int high, int end) {
+    @ForceInline
+    private static void arraySort(Class<?> elemType, Object array, long offset, int low, int high, int end) {
        if (end < 0) insertionSort(array, low, high);
        else mixedInsertionSort(array, low, end, high);
     }
@@ -164,13 +166,12 @@ static void arraySort(Class<?> elemType, Object array, long offset, int low, int
      * @param high the index of the last element, exclusive, to be sorted
      * @param pivotIndices the array containing the indices of the pivots. After
      * partitioning, this array is updated with the new indices of the pivots.
-     * @param pivot_offset the offset in bytes pointing to the base address of
-     * the array used to store the indices of the pivots.
      * @param isDualPivot a boolean value to choose between dual pivot
      * partitioning and single pivot partitioning
      */
     @IntrinsicCandidate
-    static void arrayPartition(Class<?> elemType, Object array, long offset, int low, int high, int[] pivotIndices, long pivot_offset, boolean isDualPivot) {
+    @ForceInline
+    private static void arrayPartition(Class<?> elemType, Object array, long offset, int low, int high, int[] pivotIndices, boolean isDualPivot) {
         if (isDualPivot) partitionDualPivot(array, low, high, pivotIndices);
         else partitionSinglePivot(array, low, high, pivotIndices);
     }
@@ -200,7 +201,7 @@ private static int getDepth(int parallelism, int size) {
      * @param high the index of the last element, exclusive, to be sorted
      *
      */
-    static void insertionSort(Object array, int low, int high) {
+    private static void insertionSort(Object array, int low, int high) {
         switch (array) {
             case int[] arr -> insertionSort(arr, low, high);
             case long[] arr -> insertionSort(arr, low, high);
@@ -219,7 +220,7 @@ static void insertionSort(Object array, int low, int high) {
      * @param end the index of the last element for simple insertion sort
      *
      */
-    static void mixedInsertionSort(Object array, int low, int end, int high) {
+    private static void mixedInsertionSort(Object array, int low, int end, int high) {
         switch (array) {
             case int[] arr -> mixedInsertionSort(arr, low, end, high);
             case long[] arr ->  mixedInsertionSort(arr, low, end, high);
@@ -239,7 +240,7 @@ static void mixedInsertionSort(Object array, int low, int end, int high) {
      * After partitioning, the indices of the pivots is updated as well.
      *
      */
-    static void partitionDualPivot(Object array, int low, int high, int[] pivotIndices) {
+    private static void partitionDualPivot(Object array, int low, int high, int[] pivotIndices) {
         switch(array) {
             case int[] arr -> partitionDualPivot(arr, low, high, pivotIndices);
             case long[] arr -> partitionDualPivot(arr, low, high, pivotIndices);
@@ -259,7 +260,7 @@ static void partitionDualPivot(Object array, int low, int high, int[] pivotIndic
      * After partitioning, the indices of the pivots is updated as well.
      *
      */
-    static void partitionSinglePivot(Object array, int low, int high, int[] pivotIndices) {
+    private static void partitionSinglePivot(Object array, int low, int high, int[] pivotIndices) {
         switch(array) {
             case int[] arr -> partitionSinglePivot(arr, low, high, pivotIndices);
             case long[] arr -> partitionSinglePivot(arr, low, high, pivotIndices);
@@ -416,7 +417,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * of tertiles. Note, that pivot1 < pivot2.
                  */
                 pivotIndices = new int[] {e1, e5};
-                arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, isDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
 
@@ -441,7 +442,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * This value is inexpensive approximation of the median.
                  */
                 pivotIndices = new int[] {e3, e3};
-                arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, isDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -1230,7 +1231,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * of tertiles. Note, that pivot1 < pivot2.
                  */
                 pivotIndices = new int[] {e1, e5};
-                arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, isDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -1252,7 +1253,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * This value is inexpensive approximation of the median.
                  */
                 pivotIndices = new int[] {e3, e3};
-                arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, isDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -2826,7 +2827,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * of tertiles. Note, that pivot1 < pivot2.
                  */
                 pivotIndices = new int[] {e1, e5};
-                arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, isDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -2848,7 +2849,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * This value is inexpensive approximation of the median.
                  */
                 pivotIndices = new int[] {e3, e3};
-                arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, isDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -3687,7 +3688,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                 * of tertiles. Note, that pivot1 < pivot2.
                 */
                 pivotIndices = new int[] {e1, e5};
-                arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, isDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -3709,7 +3710,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * This value is inexpensive approximation of the median.
                  */
                 pivotIndices = new int[] {e3, e3};
-                arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, Unsafe.ARRAY_INT_BASE_OFFSET, isDualPivot);
+                arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, isDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
 
diff --git a/test/micro/org/openjdk/bench/java/util/ArraysSort.java b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
index 059a3626a0d3d..4cd45d79412c1 100644
--- a/test/micro/org/openjdk/bench/java/util/ArraysSort.java
+++ b/test/micro/org/openjdk/bench/java/util/ArraysSort.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it

From 1746eeddeb6b6ca6313434d1a7626cfabd7068cc Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Thu, 31 Aug 2023 11:38:08 -0700
Subject: [PATCH 28/40] update build script

---
 make/modules/java.base/Lib.gmk | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk
index 85a86372dbf1f..2dca1e9f6d20d 100644
--- a/make/modules/java.base/Lib.gmk
+++ b/make/modules/java.base/Lib.gmk
@@ -237,19 +237,21 @@ endif
 ################################################################################
 
 ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2), true+true+true)
-  $(eval $(call SetupJdkLibrary, BUILD_LIB_X86_64, \
-      NAME := x86_64, \
-      OPTIMIZATION := HIGH, \
-      CFLAGS := $(CFLAGS_JDKLIB), \
-      CXXFLAGS := $(CXXFLAGS_JDKLIB), \
-      LDFLAGS := $(LDFLAGS_JDKLIB) \
-          $(call SET_SHARED_LIBRARY_ORIGIN), \
-      LDFLAGS_linux := -Wl$(COMMA)--no-as-needed, \
-      LIBS := $(LIBCXX), \
-      LIBS_linux := -lc -lm -ldl, \
-  ))
-
-  TARGETS += $(BUILD_LIB_X86_64)
+    ifeq ($(TOOLCHAIN_TYPE), gcc)
+        $(eval $(call SetupJdkLibrary, BUILD_LIB_X86_64, \
+            NAME := x86_64, \
+            TOOLCHAIN := TOOLCHAIN_LINK_CXX, \
+            OPTIMIZATION := HIGH, \
+            CFLAGS := $(CFLAGS_JDKLIB), \
+            CXXFLAGS := $(CXXFLAGS_JDKLIB), \
+            LDFLAGS := $(LDFLAGS_JDKLIB) \
+                $(call SET_SHARED_LIBRARY_ORIGIN), \
+            LIBS := $(LIBCXX), \
+            LIBS_linux := -lc -lm -ldl, \
+        ))
+
+        TARGETS += $(BUILD_LIB_X86_64)
+  endif
 endif
 
 ################################################################################

From a0f006b63bd7de1e318cc5922b29faf1ec33fae7 Mon Sep 17 00:00:00 2001
From: Srinivas Vamsi Parasa <srinivas.vamsi.parasa@intel.com>
Date: Thu, 31 Aug 2023 13:29:41 -0700
Subject: [PATCH 29/40] Update make/modules/java.base/Lib.gmk

Co-authored-by: Erik Joelsson <37597443+erikj79@users.noreply.github.com>
---
 make/modules/java.base/Lib.gmk | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk
index 2dca1e9f6d20d..b339c9f549bed 100644
--- a/make/modules/java.base/Lib.gmk
+++ b/make/modules/java.base/Lib.gmk
@@ -237,20 +237,20 @@ endif
 ################################################################################
 
 ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2), true+true+true)
-    ifeq ($(TOOLCHAIN_TYPE), gcc)
-        $(eval $(call SetupJdkLibrary, BUILD_LIB_X86_64, \
-            NAME := x86_64, \
-            TOOLCHAIN := TOOLCHAIN_LINK_CXX, \
-            OPTIMIZATION := HIGH, \
-            CFLAGS := $(CFLAGS_JDKLIB), \
-            CXXFLAGS := $(CXXFLAGS_JDKLIB), \
-            LDFLAGS := $(LDFLAGS_JDKLIB) \
-                $(call SET_SHARED_LIBRARY_ORIGIN), \
-            LIBS := $(LIBCXX), \
-            LIBS_linux := -lc -lm -ldl, \
-        ))
-
-        TARGETS += $(BUILD_LIB_X86_64)
+  ifeq ($(TOOLCHAIN_TYPE), gcc)
+    $(eval $(call SetupJdkLibrary, BUILD_LIB_X86_64, \
+        NAME := x86_64, \
+        TOOLCHAIN := TOOLCHAIN_LINK_CXX, \
+        OPTIMIZATION := HIGH, \
+        CFLAGS := $(CFLAGS_JDKLIB), \
+        CXXFLAGS := $(CXXFLAGS_JDKLIB), \
+        LDFLAGS := $(LDFLAGS_JDKLIB) \
+            $(call SET_SHARED_LIBRARY_ORIGIN), \
+        LIBS := $(LIBCXX), \
+        LIBS_linux := -lc -lm -ldl, \
+    ))
+
+    TARGETS += $(BUILD_LIB_X86_64)
   endif
 endif
 

From 0ec5f52d26dd9738fabe9203b95b886e5ccca7f7 Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Thu, 31 Aug 2023 14:25:02 -0700
Subject: [PATCH 30/40] Change name of the avxsort library to libx86_64_sort

---
 make/modules/java.base/Lib.gmk                |  6 ++--
 src/hotspot/cpu/x86/stubGenerator_x86_64.cpp  | 30 +++++++++----------
 .../avx512-32bit-qsort.hpp                    |  0
 .../avx512-64bit-common.h                     |  0
 .../avx512-64bit-qsort.hpp                    |  0
 .../avx512-common-qsort.h                     |  0
 .../avxsort_linux_x86.cpp                     |  0
 7 files changed, 18 insertions(+), 18 deletions(-)
 rename src/java.base/linux/native/{libx86_64 => libx86_64_sort}/avx512-32bit-qsort.hpp (100%)
 rename src/java.base/linux/native/{libx86_64 => libx86_64_sort}/avx512-64bit-common.h (100%)
 rename src/java.base/linux/native/{libx86_64 => libx86_64_sort}/avx512-64bit-qsort.hpp (100%)
 rename src/java.base/linux/native/{libx86_64 => libx86_64_sort}/avx512-common-qsort.h (100%)
 rename src/java.base/linux/native/{libx86_64 => libx86_64_sort}/avxsort_linux_x86.cpp (100%)

diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk
index b339c9f549bed..e8f7e200f0d70 100644
--- a/make/modules/java.base/Lib.gmk
+++ b/make/modules/java.base/Lib.gmk
@@ -238,8 +238,8 @@ endif
 
 ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2), true+true+true)
   ifeq ($(TOOLCHAIN_TYPE), gcc)
-    $(eval $(call SetupJdkLibrary, BUILD_LIB_X86_64, \
-        NAME := x86_64, \
+    $(eval $(call SetupJdkLibrary, BUILD_LIB_X86_64_SORT, \
+        NAME := x86_64_sort, \
         TOOLCHAIN := TOOLCHAIN_LINK_CXX, \
         OPTIMIZATION := HIGH, \
         CFLAGS := $(CFLAGS_JDKLIB), \
@@ -250,7 +250,7 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2)
         LIBS_linux := -lc -lm -ldl, \
     ))
 
-    TARGETS += $(BUILD_LIB_X86_64)
+    TARGETS += $(BUILD_LIB_X86_64_SORT)
   endif
 endif
 
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
index 1b3fbd1aabfc5..10dfa8673f9d5 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -4179,42 +4179,42 @@ void StubGenerator::generate_compiler_stubs() {
       = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
   }
 
-  // Load x86_64 library on supported hardware to enable avx512 sort and partition intrinsics
+  // Load x86_64_sort library on supported hardware to enable avx512 sort and partition intrinsics
     if (UseAVX > 2 && VM_Version::supports_avx512dq()) {
 
-      void *libx86_64 = nullptr;
+      void *libx86_64_sort = nullptr;
       char ebuf_x86_64[1024];
-      char dll_name_avx512[JVM_MAXPATHLEN];
-      if (os::dll_locate_lib(dll_name_avx512, sizeof(dll_name_avx512), Arguments::get_dll_dir(), "x86_64")) {
-        libx86_64 = os::dll_load(dll_name_avx512, ebuf_x86_64, sizeof ebuf_x86_64);
+      char dll_name_avx512_sort[JVM_MAXPATHLEN];
+      if (os::dll_locate_lib(dll_name_avx512_sort, sizeof(dll_name_avx512_sort), Arguments::get_dll_dir(), "x86_64_sort")) {
+        libx86_64_sort = os::dll_load(dll_name_avx512_sort, ebuf_x86_64, sizeof ebuf_x86_64);
       }
     // Get addresses for avx512 sort and partition routines
-    if (libx86_64 != nullptr) {
-      log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(libx86_64));
+    if (libx86_64_sort != nullptr) {
+      log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(libx86_64_sort));
 
       snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_int");
-      StubRoutines::_arraysort_int = (address)os::dll_lookup(libx86_64, ebuf_x86_64);
+      StubRoutines::_arraysort_int = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64);
 
       snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_long");
-      StubRoutines::_arraysort_long = (address)os::dll_lookup(libx86_64, ebuf_x86_64);
+      StubRoutines::_arraysort_long = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64);
 
       snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_float");
-      StubRoutines::_arraysort_float = (address)os::dll_lookup(libx86_64, ebuf_x86_64);
+      StubRoutines::_arraysort_float = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64);
 
       snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_double");
-      StubRoutines::_arraysort_double = (address)os::dll_lookup(libx86_64, ebuf_x86_64);
+      StubRoutines::_arraysort_double = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64);
 
       snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_int");
-      StubRoutines::_array_partition_int = (address)os::dll_lookup(libx86_64, ebuf_x86_64);
+      StubRoutines::_array_partition_int = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64);
 
       snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_long");
-      StubRoutines::_array_partition_long = (address)os::dll_lookup(libx86_64, ebuf_x86_64);
+      StubRoutines::_array_partition_long = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64);
 
       snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_float");
-      StubRoutines::_array_partition_float = (address)os::dll_lookup(libx86_64, ebuf_x86_64);
+      StubRoutines::_array_partition_float = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64);
 
       snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_double");
-      StubRoutines::_array_partition_double = (address)os::dll_lookup(libx86_64, ebuf_x86_64);
+      StubRoutines::_array_partition_double = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64);
     }
   }
 
diff --git a/src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libx86_64_sort/avx512-32bit-qsort.hpp
similarity index 100%
rename from src/java.base/linux/native/libx86_64/avx512-32bit-qsort.hpp
rename to src/java.base/linux/native/libx86_64_sort/avx512-32bit-qsort.hpp
diff --git a/src/java.base/linux/native/libx86_64/avx512-64bit-common.h b/src/java.base/linux/native/libx86_64_sort/avx512-64bit-common.h
similarity index 100%
rename from src/java.base/linux/native/libx86_64/avx512-64bit-common.h
rename to src/java.base/linux/native/libx86_64_sort/avx512-64bit-common.h
diff --git a/src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libx86_64_sort/avx512-64bit-qsort.hpp
similarity index 100%
rename from src/java.base/linux/native/libx86_64/avx512-64bit-qsort.hpp
rename to src/java.base/linux/native/libx86_64_sort/avx512-64bit-qsort.hpp
diff --git a/src/java.base/linux/native/libx86_64/avx512-common-qsort.h b/src/java.base/linux/native/libx86_64_sort/avx512-common-qsort.h
similarity index 100%
rename from src/java.base/linux/native/libx86_64/avx512-common-qsort.h
rename to src/java.base/linux/native/libx86_64_sort/avx512-common-qsort.h
diff --git a/src/java.base/linux/native/libx86_64/avxsort_linux_x86.cpp b/src/java.base/linux/native/libx86_64_sort/avxsort_linux_x86.cpp
similarity index 100%
rename from src/java.base/linux/native/libx86_64/avxsort_linux_x86.cpp
rename to src/java.base/linux/native/libx86_64_sort/avxsort_linux_x86.cpp

From c096ff62e63fd1a374f7f180e11a7578798b06c7 Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Fri, 8 Sep 2023 11:04:15 -0700
Subject: [PATCH 31/40] Fix regression when intrinsics are disabled; enable
 insertion sort in intrinsic, change library name to libsimdsort

---
 make/modules/java.base/Lib.gmk                |   6 +-
 src/hotspot/cpu/x86/stubGenerator_x86_64.cpp  |  58 +--
 src/hotspot/share/classfile/vmIntrinsics.hpp  |  20 +-
 .../gc/shenandoah/c2/shenandoahSupport.cpp    |   7 +-
 src/hotspot/share/jvmci/vmStructs_jvmci.cpp   |  12 +-
 src/hotspot/share/opto/c2compiler.cpp         |   6 +-
 src/hotspot/share/opto/library_call.cpp       |  55 ++-
 src/hotspot/share/opto/library_call.hpp       |   2 +-
 src/hotspot/share/opto/runtime.cpp            |  17 +-
 src/hotspot/share/opto/runtime.hpp            |   2 +-
 src/hotspot/share/runtime/stubRoutines.cpp    |  22 +-
 src/hotspot/share/runtime/stubRoutines.hpp    |  14 +-
 .../avx512-32bit-qsort.hpp                    |   4 +-
 .../avx512-64bit-common.h                     |   0
 .../avx512-64bit-qsort.hpp                    |   4 +-
 .../avx512-common-qsort.h                     |  59 ++-
 .../native/libsimdsort/avxsort_linux_x86.cpp  |  85 +++++
 .../libx86_64_sort/avxsort_linux_x86.cpp      |  67 ----
 .../classes/java/util/DualPivotQuicksort.java | 353 +++++++-----------
 test/jdk/java/util/Arrays/Sorting.java        |   7 +-
 20 files changed, 419 insertions(+), 381 deletions(-)
 rename src/java.base/linux/native/{libx86_64_sort => libsimdsort}/avx512-32bit-qsort.hpp (99%)
 rename src/java.base/linux/native/{libx86_64_sort => libsimdsort}/avx512-64bit-common.h (100%)
 rename src/java.base/linux/native/{libx86_64_sort => libsimdsort}/avx512-64bit-qsort.hpp (99%)
 rename src/java.base/linux/native/{libx86_64_sort => libsimdsort}/avx512-common-qsort.h (91%)
 create mode 100644 src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp
 delete mode 100644 src/java.base/linux/native/libx86_64_sort/avxsort_linux_x86.cpp

diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk
index e8f7e200f0d70..976f5e8e75582 100644
--- a/make/modules/java.base/Lib.gmk
+++ b/make/modules/java.base/Lib.gmk
@@ -238,8 +238,8 @@ endif
 
 ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2), true+true+true)
   ifeq ($(TOOLCHAIN_TYPE), gcc)
-    $(eval $(call SetupJdkLibrary, BUILD_LIB_X86_64_SORT, \
-        NAME := x86_64_sort, \
+    $(eval $(call SetupJdkLibrary, BUILD_LIB_SIMD_SORT, \
+        NAME := simdsort, \
         TOOLCHAIN := TOOLCHAIN_LINK_CXX, \
         OPTIMIZATION := HIGH, \
         CFLAGS := $(CFLAGS_JDKLIB), \
@@ -250,7 +250,7 @@ ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2)
         LIBS_linux := -lc -lm -ldl, \
     ))
 
-    TARGETS += $(BUILD_LIB_X86_64_SORT)
+    TARGETS += $(BUILD_LIB_SIMD_SORT)
   endif
 endif
 
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
index 10dfa8673f9d5..1ac5f566434e4 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -4182,39 +4182,51 @@ void StubGenerator::generate_compiler_stubs() {
   // Load x86_64_sort library on supported hardware to enable avx512 sort and partition intrinsics
     if (UseAVX > 2 && VM_Version::supports_avx512dq()) {
 
-      void *libx86_64_sort = nullptr;
-      char ebuf_x86_64[1024];
-      char dll_name_avx512_sort[JVM_MAXPATHLEN];
-      if (os::dll_locate_lib(dll_name_avx512_sort, sizeof(dll_name_avx512_sort), Arguments::get_dll_dir(), "x86_64_sort")) {
-        libx86_64_sort = os::dll_load(dll_name_avx512_sort, ebuf_x86_64, sizeof ebuf_x86_64);
+      void *libsimdsort = nullptr;
+      char ebuf_[1024];
+      char dll_name_simd_sort[JVM_MAXPATHLEN];
+      if (os::dll_locate_lib(dll_name_simd_sort, sizeof(dll_name_simd_sort), Arguments::get_dll_dir(), "simdsort")) {
+        libsimdsort = os::dll_load(dll_name_simd_sort, ebuf_, sizeof ebuf_);
       }
     // Get addresses for avx512 sort and partition routines
-    if (libx86_64_sort != nullptr) {
-      log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(libx86_64_sort));
+    if (libsimdsort != nullptr) {
+      log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(libsimdsort));
 
-      snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_int");
-      StubRoutines::_arraysort_int = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64);
+      snprintf(ebuf_, sizeof(ebuf_), "avx512_sort_int");
+      StubRoutines::_arraysort_int = (address)os::dll_lookup(libsimdsort, ebuf_);
 
-      snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_long");
-      StubRoutines::_arraysort_long = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64);
+      snprintf(ebuf_, sizeof(ebuf_), "avx512_sort_long");
+      StubRoutines::_arraysort_long = (address)os::dll_lookup(libsimdsort, ebuf_);
 
-      snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_float");
-      StubRoutines::_arraysort_float = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64);
+      snprintf(ebuf_, sizeof(ebuf_), "avx512_sort_float");
+      StubRoutines::_arraysort_float = (address)os::dll_lookup(libsimdsort, ebuf_);
 
-      snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_sort_double");
-      StubRoutines::_arraysort_double = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64);
+      snprintf(ebuf_, sizeof(ebuf_), "avx512_sort_double");
+      StubRoutines::_arraysort_double = (address)os::dll_lookup(libsimdsort, ebuf_);
 
-      snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_int");
-      StubRoutines::_array_partition_int = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64);
+      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_single_int");
+      StubRoutines::_array_partition_single_int = (address)os::dll_lookup(libsimdsort, ebuf_);
 
-      snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_long");
-      StubRoutines::_array_partition_long = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64);
+      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_dual_int");
+      StubRoutines::_array_partition_dual_int = (address)os::dll_lookup(libsimdsort, ebuf_);
 
-      snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_float");
-      StubRoutines::_array_partition_float = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64);
+      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_single_long");
+      StubRoutines::_array_partition_single_long = (address)os::dll_lookup(libsimdsort, ebuf_);
 
-      snprintf(ebuf_x86_64, sizeof(ebuf_x86_64), "avx512_partition_double");
-      StubRoutines::_array_partition_double = (address)os::dll_lookup(libx86_64_sort, ebuf_x86_64);
+      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_dual_long");
+      StubRoutines::_array_partition_dual_long = (address)os::dll_lookup(libsimdsort, ebuf_);
+
+      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_single_float");
+      StubRoutines::_array_partition_single_float = (address)os::dll_lookup(libsimdsort, ebuf_);
+
+      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_dual_float");
+      StubRoutines::_array_partition_dual_float = (address)os::dll_lookup(libsimdsort, ebuf_);
+
+      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_single_double");
+      StubRoutines::_array_partition_single_double = (address)os::dll_lookup(libsimdsort, ebuf_);
+
+      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_dual_double");
+      StubRoutines::_array_partition_dual_double = (address)os::dll_lookup(libsimdsort, ebuf_);
     }
   }
 
diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp
index 4f307cfa8a388..bba728b694f8e 100644
--- a/src/hotspot/share/classfile/vmIntrinsics.hpp
+++ b/src/hotspot/share/classfile/vmIntrinsics.hpp
@@ -341,13 +341,19 @@ class methodHandle;
    do_name(     copyOf_name,                                     "copyOf")                                              \
    do_signature(copyOf_signature,             "([Ljava/lang/Object;ILjava/lang/Class;)[Ljava/lang/Object;")             \
                                                                                                                         \
-  do_intrinsic(_arraySort,                java_util_DualPivotQuicksort,       arraySort_name, arraySort_signature, F_S) \
-   do_name(     arraySort_name,                                  "arraySort")                                           \
-   do_signature(arraySort_signature,          "(Ljava/lang/Class;Ljava/lang/Object;JIII)V")                             \
-                                                                                                                        \
-  do_intrinsic(_arrayPartition, java_util_DualPivotQuicksort, arrayPartition_name, arrayPartition_signature, F_S)       \
-   do_name(     arrayPartition_name,                             "arrayPartition")                                      \
-  do_signature(arrayPartition_signature,      "(Ljava/lang/Class;Ljava/lang/Object;JII[IZ)V")                           \
+  do_intrinsic(_arraySortMI, java_util_DualPivotQuicksort, arraySortMI_name, arraySortMI_signature, F_S)                \
+   do_name(     arraySortMI_name,                                  "mixedInsertionSort")                                \
+   do_signature(arraySortMI_signature,          "(Ljava/lang/Class;Ljava/lang/Object;JIII)V")                            \
+  do_intrinsic(_arraySortI, java_util_DualPivotQuicksort, arraySortI_name, arraySortI_signature, F_S)                   \
+   do_name(     arraySortI_name,                                   "insertionSort")                                     \
+   do_signature(arraySortI_signature,          "(Ljava/lang/Class;Ljava/lang/Object;JII)V")                             \
+                                                                                                                        \
+  do_intrinsic(_arrayPartitionSP, java_util_DualPivotQuicksort, arrayPartitionSP_name, arrayPartitionSP_signature, F_S) \
+   do_name(     arrayPartitionSP_name,                             "partitionSinglePivot")                              \
+  do_signature(arrayPartitionSP_signature,      "(Ljava/lang/Class;Ljava/lang/Object;JIII)[I")                          \
+  do_intrinsic(_arrayPartitionDP, java_util_DualPivotQuicksort, arrayPartitionDP_name, arrayPartitionDP_signature, F_S) \
+   do_name(     arrayPartitionDP_name,                             "partitionDualPivot")                                \
+  do_signature(arrayPartitionDP_signature,      "(Ljava/lang/Class;Ljava/lang/Object;JIIII)[I")                         \
                                                                                                                         \
                                                                                                                         \
   do_intrinsic(_copyOfRange,              java_util_Arrays,       copyOfRange_name, copyOfRange_signature,       F_S)   \
diff --git a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp
index 9a98ec9cd529d..0384ec1942b3d 100644
--- a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp
+++ b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp
@@ -388,10 +388,11 @@ void ShenandoahBarrierC2Support::verify(RootNode* root) {
         } args[6];
       } calls[] = {
         "array_partition_stub",
-        { { TypeFunc::Parms, ShenandoahStore }, { TypeFunc::Parms+3, ShenandoahStore }, { -1, ShenandoahNone },
-          { -1, ShenandoahNone }, { -1, ShenandoahNone } },
+        { { TypeFunc::Parms, ShenandoahStore }, { TypeFunc::Parms+3, ShenandoahStore },   { -1, ShenandoahNone },
+          { -1, ShenandoahNone },                { -1, ShenandoahNone },                  { -1, ShenandoahNone } },
         "arraysort_stub",
-        { { TypeFunc::Parms, ShenandoahStore }, { -1, ShenandoahNone }, { -1, ShenandoahNone } },
+        { { TypeFunc::Parms, ShenandoahStore },  { -1, ShenandoahNone },                  { -1, ShenandoahNone },
+          { -1,  ShenandoahNone},                 { -1,  ShenandoahNone},                 { -1,  ShenandoahNone} },
         "aescrypt_encryptBlock",
         { { TypeFunc::Parms, ShenandoahLoad },   { TypeFunc::Parms+1, ShenandoahStore },  { TypeFunc::Parms+2, ShenandoahLoad },
           { -1,  ShenandoahNone},                 { -1,  ShenandoahNone},                 { -1,  ShenandoahNone} },
diff --git a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp
index 9107b53fc2b84..e74afd3a7759d 100644
--- a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp
+++ b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp
@@ -331,10 +331,14 @@
   static_field(StubRoutines,                _arraysort_long,                                  address)                               \
   static_field(StubRoutines,                _arraysort_float,                                 address)                               \
   static_field(StubRoutines,                _arraysort_double,                                address)                               \
-  static_field(StubRoutines,                _array_partition_int,                             address)                               \
-  static_field(StubRoutines,                _array_partition_long,                            address)                               \
-  static_field(StubRoutines,                _array_partition_float,                           address)                               \
-  static_field(StubRoutines,                _array_partition_double,                          address)                               \
+  static_field(StubRoutines,                _array_partition_single_int,                      address)                               \
+  static_field(StubRoutines,                _array_partition_dual_int,                        address)                               \
+  static_field(StubRoutines,                _array_partition_single_long,                     address)                               \
+  static_field(StubRoutines,                _array_partition_dual_long,                       address)                               \
+  static_field(StubRoutines,                _array_partition_single_float,                    address)                               \
+  static_field(StubRoutines,                _array_partition_dual_float,                      address)                               \
+  static_field(StubRoutines,                _array_partition_single_double,                   address)                               \
+  static_field(StubRoutines,                _array_partition_dual_double,                     address)                               \
                                                                                                                                      \
   static_field(StubRoutines,                _aescrypt_encryptBlock,                           address)                               \
   static_field(StubRoutines,                _aescrypt_decryptBlock,                           address)                               \
diff --git a/src/hotspot/share/opto/c2compiler.cpp b/src/hotspot/share/opto/c2compiler.cpp
index 5efac02178865..39f56c002e41e 100644
--- a/src/hotspot/share/opto/c2compiler.cpp
+++ b/src/hotspot/share/opto/c2compiler.cpp
@@ -597,8 +597,10 @@ bool C2Compiler::is_intrinsic_supported(vmIntrinsics::ID id) {
   case vmIntrinsics::_min_strict:
   case vmIntrinsics::_max_strict:
   case vmIntrinsics::_arraycopy:
-  case vmIntrinsics::_arraySort:
-  case vmIntrinsics::_arrayPartition:
+  case vmIntrinsics::_arraySortMI:
+  case vmIntrinsics::_arraySortI:
+  case vmIntrinsics::_arrayPartitionSP:
+  case vmIntrinsics::_arrayPartitionDP:
   case vmIntrinsics::_indexOfL:
   case vmIntrinsics::_indexOfU:
   case vmIntrinsics::_indexOfUL:
diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp
index 477d1ff40558b..2ab21eb1b5355 100644
--- a/src/hotspot/share/opto/library_call.cpp
+++ b/src/hotspot/share/opto/library_call.cpp
@@ -293,8 +293,11 @@ bool LibraryCallKit::try_to_inline(int predicate) {
 
   case vmIntrinsics::_arraycopy:                return inline_arraycopy();
 
-  case vmIntrinsics::_arraySort:                return inline_arraysort();
-  case vmIntrinsics::_arrayPartition:           return inline_array_partition();
+  case vmIntrinsics::_arraySortMI:
+  case vmIntrinsics::_arraySortI:               return inline_arraysort();
+
+  case vmIntrinsics::_arrayPartitionSP:         return inline_array_partition(false /* single pivot*/);
+  case vmIntrinsics::_arrayPartitionDP:         return inline_array_partition(true /* dual pivot*/);
 
   case vmIntrinsics::_compareToL:               return inline_string_compareTo(StrIntrinsicNode::LL);
   case vmIntrinsics::_compareToU:               return inline_string_compareTo(StrIntrinsicNode::UU);
@@ -5367,7 +5370,7 @@ void LibraryCallKit::create_new_uncommon_trap(CallStaticJavaNode* uncommon_trap_
 }
 
 //------------------------------inline_array_partition-----------------------
-bool LibraryCallKit::inline_array_partition() {
+bool LibraryCallKit::inline_array_partition(bool is_dual_pivot) {
 
   address stubAddr = nullptr;
   const char *stubName;
@@ -5378,32 +5381,41 @@ bool LibraryCallKit::inline_array_partition() {
   Node* offset          = argument(2);
   Node* fromIndex       = argument(4);
   Node* toIndex         = argument(5);
-  Node* pivot_indices   = argument(6);
-  Node* isDualPivot     = argument(7);
+  Node* indexPivot1     = argument(6);
+  Node* indexPivot2     = is_dual_pivot? argument(7) : nullptr;
 
   const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr();
   ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type();
   BasicType bt = elem_type->basic_type();
-  stubAddr = StubRoutines::select_array_partition_function(bt);
-  if (stubAddr == nullptr) return false;
-
+  stubAddr = StubRoutines::select_array_partition_function(bt, is_dual_pivot);
+  // stub not loaded
+  if (stubAddr == nullptr) {
+    return false;
+  }
+  // get the address of the array
   const TypeAryPtr* obj_t = _gvn.type(obj)->isa_aryptr();
   if (obj_t == nullptr || obj_t->elem() == Type::BOTTOM ) {
     return false; // failed input validation
   }
   Node* obj_adr = make_unsafe_address(obj, offset);
 
-  pivot_indices = must_be_not_null(pivot_indices, true);
-  const TypeAryPtr* pivot_indices_type = pivot_indices->Value(&_gvn)->isa_aryptr();
-  if (pivot_indices_type == nullptr || pivot_indices_type->elem() == Type::BOTTOM ) {
-    return false; // failed input validation
-  }
-  Node* pivot_indices_adr = array_element_address(pivot_indices, intcon(0), T_INT);
-
-  // Call the stub.
-  make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_partition_Type(),
+  // create the pivotIndices array of type int and size = 2
+  Node* pivotIndices = nullptr;
+  Node* size = intcon(2);
+  Node* klass_node = makecon(TypeKlassPtr::make(ciTypeArrayKlass::make(T_INT)));
+  pivotIndices = new_array(klass_node, size, 0);  // no arguments to push
+  AllocateArrayNode* alloc = tightly_coupled_allocation(pivotIndices);
+  guarantee(alloc != nullptr, "created above");
+  Node* pivotIndices_adr = basic_plus_adr(pivotIndices, arrayOopDesc::base_offset_in_bytes(T_INT));
+
+  // Call the stub
+  make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_partition_Type(is_dual_pivot),
                     stubAddr, stubName, TypePtr::BOTTOM,
-                    obj_adr, fromIndex, toIndex, pivot_indices_adr, isDualPivot);
+                    obj_adr, fromIndex, toIndex, pivotIndices_adr, indexPivot1, indexPivot2);
+
+  if (!stopped()) {
+    set_result(pivotIndices);
+  }
 
   return true;
 }
@@ -5426,13 +5438,18 @@ bool LibraryCallKit::inline_arraysort() {
   ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type();
   BasicType bt = elem_type->basic_type();
   stubAddr = StubRoutines::select_arraysort_function(bt);
-  if (stubAddr == nullptr) return false;
+  //stub not loaded
+  if (stubAddr == nullptr) {
+    return false;
+  }
 
+  // get address of the array
   const TypeAryPtr* obj_t = _gvn.type(obj)->isa_aryptr();
   if (obj_t == nullptr || obj_t->elem() == Type::BOTTOM ) {
     return false; // failed input validation
   }
   Node* obj_adr = make_unsafe_address(obj, offset);
+
   // Call the stub.
   make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_sort_Type(),
                     stubAddr, stubName, TypePtr::BOTTOM,
diff --git a/src/hotspot/share/opto/library_call.hpp b/src/hotspot/share/opto/library_call.hpp
index 79258f3575d31..d33c1c8ee0538 100644
--- a/src/hotspot/share/opto/library_call.hpp
+++ b/src/hotspot/share/opto/library_call.hpp
@@ -278,7 +278,7 @@ class LibraryCallKit : public GraphKit {
   void arraycopy_move_allocation_here(AllocateArrayNode* alloc, Node* dest, JVMState* saved_jvms_before_guards, int saved_reexecute_sp,
                                       uint new_idx);
   bool inline_arraysort();
-  bool inline_array_partition();
+  bool inline_array_partition(bool is_dual_pivot);
   typedef enum { LS_get_add, LS_get_set, LS_cmp_swap, LS_cmp_swap_weak, LS_cmp_exchange } LoadStoreKind;
   bool inline_unsafe_load_store(BasicType type,  LoadStoreKind kind, AccessKind access_kind);
   bool inline_unsafe_fence(vmIntrinsics::ID id);
diff --git a/src/hotspot/share/opto/runtime.cpp b/src/hotspot/share/opto/runtime.cpp
index 52dd29b8fa793..e6d8c956a5e63 100644
--- a/src/hotspot/share/opto/runtime.cpp
+++ b/src/hotspot/share/opto/runtime.cpp
@@ -857,17 +857,20 @@ const TypeFunc* OptoRuntime::array_fill_Type() {
   return TypeFunc::make(domain, range);
 }
 
-const TypeFunc* OptoRuntime::array_partition_Type() {
+const TypeFunc* OptoRuntime::array_partition_Type(bool is_dual_pivot) {
   // create input type (domain)
-  int num_args = 5;
+  int num_args = is_dual_pivot ? 6 : 5;
   int argcnt = num_args;
   const Type** fields = TypeTuple::fields(argcnt);
   int argp = TypeFunc::Parms;
-  fields[argp++] = TypePtr::NOTNULL;    // array
-  fields[argp++] = TypeInt::INT;    // low
-  fields[argp++] = TypeInt::INT;    // end
-  fields[argp++] = TypePtr::NOTNULL;    // pivot_indices (int array)
-  fields[argp++] = TypeInt::BOOL;       // isDualPivot
+  fields[argp++] = TypePtr::NOTNULL;  // array
+  fields[argp++] = TypeInt::INT;      // low
+  fields[argp++] = TypeInt::INT;      // end
+  fields[argp++] = TypePtr::NOTNULL;  // pivot_indices (int array)
+  fields[argp++] = TypeInt::INT;      // indexPivot1
+  if (is_dual_pivot) {
+    fields[argp++] = TypeInt::INT;    // indexPivot2
+  }
   assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
   const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
 
diff --git a/src/hotspot/share/opto/runtime.hpp b/src/hotspot/share/opto/runtime.hpp
index b85542423e848..4017f70d36296 100644
--- a/src/hotspot/share/opto/runtime.hpp
+++ b/src/hotspot/share/opto/runtime.hpp
@@ -269,7 +269,7 @@ class OptoRuntime : public AllStatic {
   static const TypeFunc* array_fill_Type();
 
   static const TypeFunc* array_sort_Type();
-  static const TypeFunc* array_partition_Type();
+  static const TypeFunc* array_partition_Type(bool is_dual_pivot);
   static const TypeFunc* aescrypt_block_Type();
   static const TypeFunc* cipherBlockChaining_aescrypt_Type();
   static const TypeFunc* electronicCodeBook_aescrypt_Type();
diff --git a/src/hotspot/share/runtime/stubRoutines.cpp b/src/hotspot/share/runtime/stubRoutines.cpp
index 40bf177994e19..dce5740cecfda 100644
--- a/src/hotspot/share/runtime/stubRoutines.cpp
+++ b/src/hotspot/share/runtime/stubRoutines.cpp
@@ -180,10 +180,14 @@ address StubRoutines::_arraysort_int = nullptr;
 address StubRoutines::_arraysort_long = nullptr;
 address StubRoutines::_arraysort_float = nullptr;
 address StubRoutines::_arraysort_double = nullptr;
-address StubRoutines::_array_partition_int  = nullptr;
-address StubRoutines::_array_partition_long = nullptr;
-address StubRoutines::_array_partition_float = nullptr;
-address StubRoutines::_array_partition_double = nullptr;
+address StubRoutines::_array_partition_single_int  = nullptr;
+address StubRoutines::_array_partition_dual_int  = nullptr;
+address StubRoutines::_array_partition_single_long = nullptr;
+address StubRoutines::_array_partition_dual_long = nullptr;
+address StubRoutines::_array_partition_single_float = nullptr;
+address StubRoutines::_array_partition_dual_float = nullptr;
+address StubRoutines::_array_partition_single_double = nullptr;
+address StubRoutines::_array_partition_dual_double = nullptr;
 
 address StubRoutines::_cont_thaw          = nullptr;
 address StubRoutines::_cont_returnBarrier = nullptr;
@@ -553,12 +557,12 @@ address StubRoutines::select_arraysort_function(BasicType t) {
   }
 }
 
-address StubRoutines::select_array_partition_function(BasicType t) {
+address StubRoutines::select_array_partition_function(BasicType t, bool is_dual_pivot) {
   switch(t) {
-    case T_INT:    return _array_partition_int;
-    case T_LONG:   return _array_partition_long;
-    case T_FLOAT:  return _array_partition_float;
-    case T_DOUBLE: return _array_partition_double;
+    case T_INT:    return is_dual_pivot ? _array_partition_dual_int : _array_partition_single_int;
+    case T_LONG:   return is_dual_pivot ? _array_partition_dual_long : _array_partition_single_long;
+    case T_FLOAT:  return is_dual_pivot ? _array_partition_dual_float : _array_partition_single_float;
+    case T_DOUBLE: return is_dual_pivot ? _array_partition_dual_double : _array_partition_single_double;
   default:
     ShouldNotReachHere();
     return nullptr;
diff --git a/src/hotspot/share/runtime/stubRoutines.hpp b/src/hotspot/share/runtime/stubRoutines.hpp
index cc582bf24cc64..ee87450285c2e 100644
--- a/src/hotspot/share/runtime/stubRoutines.hpp
+++ b/src/hotspot/share/runtime/stubRoutines.hpp
@@ -157,10 +157,14 @@ class StubRoutines: AllStatic {
   static address _arraysort_long;
   static address _arraysort_float;
   static address _arraysort_double;
-  static address _array_partition_int;
-  static address _array_partition_long;
-  static address _array_partition_float;
-  static address _array_partition_double;
+  static address _array_partition_single_int;
+  static address _array_partition_dual_int;
+  static address _array_partition_single_long;
+  static address _array_partition_dual_long;
+  static address _array_partition_single_float;
+  static address _array_partition_dual_float;
+  static address _array_partition_single_double;
+  static address _array_partition_dual_double;
   // Leaf routines which implement arraycopy and their addresses
   // arraycopy operands aligned on element type boundary
   static address _jbyte_arraycopy;
@@ -384,7 +388,7 @@ class StubRoutines: AllStatic {
 
   static address generic_arraycopy()   { return _generic_arraycopy; }
   static address select_arraysort_function(BasicType t);
-  static address select_array_partition_function(BasicType t);
+  static address select_array_partition_function(BasicType t, bool is_dual_pivot);
 
   static address jbyte_fill()          { return _jbyte_fill; }
   static address jshort_fill()         { return _jshort_fill; }
diff --git a/src/java.base/linux/native/libx86_64_sort/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libsimdsort/avx512-32bit-qsort.hpp
similarity index 99%
rename from src/java.base/linux/native/libx86_64_sort/avx512-32bit-qsort.hpp
rename to src/java.base/linux/native/libsimdsort/avx512-32bit-qsort.hpp
index 7abc3a5454266..15e406a822900 100644
--- a/src/java.base/linux/native/libx86_64_sort/avx512-32bit-qsort.hpp
+++ b/src/java.base/linux/native/libsimdsort/avx512-32bit-qsort.hpp
@@ -423,7 +423,7 @@ static void qsort_32bit_(type_t *arr, int64_t left, int64_t right,
 }
 
 template <>
-inline void avx512_qsort<int32_t>(int32_t *arr, int64_t fromIndex, int64_t toIndex) {
+void inline avx512_qsort<int32_t>(int32_t *arr, int64_t fromIndex, int64_t toIndex) {
     int64_t arrsize = toIndex - fromIndex;
     if (arrsize > 1) {
         qsort_32bit_<zmm_vector<int32_t>, int32_t>(arr, fromIndex, toIndex - 1,
@@ -432,7 +432,7 @@ inline void avx512_qsort<int32_t>(int32_t *arr, int64_t fromIndex, int64_t toInd
 }
 
 template <>
-inline void avx512_qsort<float>(float *arr, int64_t fromIndex, int64_t toIndex) {
+void inline avx512_qsort<float>(float *arr, int64_t fromIndex, int64_t toIndex) {
     int64_t arrsize = toIndex - fromIndex;
     if (arrsize > 1) {
         qsort_32bit_<zmm_vector<float>, float>(arr, fromIndex, toIndex - 1,
diff --git a/src/java.base/linux/native/libx86_64_sort/avx512-64bit-common.h b/src/java.base/linux/native/libsimdsort/avx512-64bit-common.h
similarity index 100%
rename from src/java.base/linux/native/libx86_64_sort/avx512-64bit-common.h
rename to src/java.base/linux/native/libsimdsort/avx512-64bit-common.h
diff --git a/src/java.base/linux/native/libx86_64_sort/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libsimdsort/avx512-64bit-qsort.hpp
similarity index 99%
rename from src/java.base/linux/native/libx86_64_sort/avx512-64bit-qsort.hpp
rename to src/java.base/linux/native/libsimdsort/avx512-64bit-qsort.hpp
index 422f385d052e2..3028f45a79407 100644
--- a/src/java.base/linux/native/libx86_64_sort/avx512-64bit-qsort.hpp
+++ b/src/java.base/linux/native/libsimdsort/avx512-64bit-qsort.hpp
@@ -754,7 +754,7 @@ static void qsort_64bit_(type_t *arr, int64_t left, int64_t right,
 }
 
 template <>
-inline void avx512_qsort<int64_t>(int64_t *arr, int64_t fromIndex, int64_t toIndex) {
+void inline avx512_qsort<int64_t>(int64_t *arr, int64_t fromIndex, int64_t toIndex) {
     int64_t arrsize = toIndex - fromIndex;
     if (arrsize > 1) {
         qsort_64bit_<zmm_vector<int64_t>, int64_t>(arr, fromIndex, toIndex - 1,
@@ -763,7 +763,7 @@ inline void avx512_qsort<int64_t>(int64_t *arr, int64_t fromIndex, int64_t toInd
 }
 
 template <>
-inline void avx512_qsort<double>(double *arr, int64_t fromIndex, int64_t toIndex) {
+void inline avx512_qsort<double>(double *arr, int64_t fromIndex, int64_t toIndex) {
     int64_t arrsize = toIndex - fromIndex;
     if (arrsize > 1) {
         qsort_64bit_<zmm_vector<double>, double>(arr, fromIndex, toIndex - 1,
diff --git a/src/java.base/linux/native/libx86_64_sort/avx512-common-qsort.h b/src/java.base/linux/native/libsimdsort/avx512-common-qsort.h
similarity index 91%
rename from src/java.base/linux/native/libx86_64_sort/avx512-common-qsort.h
rename to src/java.base/linux/native/libsimdsort/avx512-common-qsort.h
index c56990f921eae..8f255a38e47d6 100644
--- a/src/java.base/linux/native/libx86_64_sort/avx512-common-qsort.h
+++ b/src/java.base/linux/native/libsimdsort/avx512-common-qsort.h
@@ -371,23 +371,21 @@ static inline int64_t partition_avx512_unrolled(type_t *arr, int64_t left,
     return l_store;
 }
 
-// right = to_index (exclusive)
+// Forwards from_index, to_index (to_index is exclusive), pivot and use_gt to partition_avx512_unrolled<vtype, 2> and returns its result.
 template <typename vtype, typename type_t>
-static int64_t vectorized_partition(type_t *arr, int64_t left, int64_t right, type_t pivot, bool use_gt) {
+static int64_t vectorized_partition(type_t *arr, int64_t from_index, int64_t to_index, type_t pivot, bool use_gt) {
     type_t smallest = vtype::type_max();
     type_t biggest = vtype::type_min();
     int64_t pivot_index = partition_avx512_unrolled<vtype, 2>(
-            arr, left, right, pivot, &smallest, &biggest, use_gt);
+            arr, from_index, to_index, pivot, &smallest, &biggest, use_gt); // smallest/biggest are passed by address but never read afterwards in this wrapper
     return pivot_index;
 }
 
 // partitioning functions
 template <typename T>
-void avx512_dual_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices){
-    const int64_t pidx1 = pivot_indices[0];
-    const int64_t pidx2 = pivot_indices[1];
-    const T pivot1 = arr[pidx1];
-    const T pivot2 = arr[pidx2];
+void avx512_dual_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2){
+    const T pivot1 = arr[index_pivot1];
+    const T pivot2 = arr[index_pivot2];
 
     const int64_t low = from_index;
     const int64_t high = to_index;
@@ -395,14 +393,21 @@ void avx512_dual_pivot_partition(T *arr, int64_t from_index, int64_t to_index, i
     const int64_t end = high - 1;
 
 
-    std::swap(arr[pidx1], arr[low]);
-    std::swap(arr[pidx2], arr[end]);
+    std::swap(arr[index_pivot1], arr[low]);
+    std::swap(arr[index_pivot2], arr[end]);
 
 
     const int64_t pivot_index2 = vectorized_partition<zmm_vector<T>, T>(arr, start, end, pivot2, true); // use_gt = true
     std::swap(arr[end], arr[pivot_index2]);
     int64_t upper = pivot_index2;
 
+    // if all other elements are greater than pivot2 (and pivot1), no need to do further partitioning
+    if (upper == start) {
+        pivot_indices[0] = low;
+        pivot_indices[1] = upper;
+        return;
+    }
+
     const int64_t pivot_index1 = vectorized_partition<zmm_vector<T>, T>(arr, start, upper, pivot1, false); // use_ge (use_gt = false)
     int64_t lower = pivot_index1 - 1;
     std::swap(arr[low], arr[lower]);
@@ -412,13 +417,11 @@ void avx512_dual_pivot_partition(T *arr, int64_t from_index, int64_t to_index, i
 }
 
 template <typename T>
-void avx512_single_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices){
-    const int64_t pidx = pivot_indices[0];
-    const T pivot = arr[pidx];
+void avx512_single_pivot_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot){
+    const T pivot = arr[index_pivot];
 
     const int64_t low = from_index;
     const int64_t high = to_index;
-    //const int64_t start = low + 1;
     const int64_t end = high - 1;
 
 
@@ -433,11 +436,37 @@ void avx512_single_pivot_partition(T *arr, int64_t from_index, int64_t to_index,
 }
 
 template <typename T>
-inline void avx512_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pviot) {
+void inline avx512_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pviot) { // NOTE(review): the body still passes 4 arguments, while this patch declares avx512_dual_pivot_partition with 6 parameters and avx512_single_pivot_partition with 5 (no defaults visible), so instantiating this template looks like it would fail to compile; the new extern "C" wrappers no longer call it - confirm and remove or update.
     if(is_dual_pviot) avx512_dual_pivot_partition<T>(arr, from_index, to_index, pivot_indices);
         else avx512_single_pivot_partition<T>(arr, from_index, to_index, pivot_indices);
 }
 
+template <typename T> // In-place insertion sort of arr[from_index, to_index) into ascending order, comparing with operator<.
+void inline insertion_sort(T *arr, int32_t from_index, int32_t to_index) {
+    for (int i, k = from_index; ++k < to_index; ) { // k visits from_index + 1 .. to_index - 1
+        T ai = arr[i = k]; // element being inserted; i starts at its current slot
+
+        if (ai < arr[i - 1]) { // only move ai when it is out of order with its left neighbour
+            while (--i >= from_index && ai < arr[i]) {
+                arr[i + 1] = arr[i]; // shift the larger element one slot to the right
+            }
+            arr[i + 1] = ai; // i stopped one slot left of the insertion point
+        }
+    }
+}
+
+template <typename T> // Sorts arr[from_index, to_index): insertion_sort when the range size is <= INS_SORT_THRESHOLD, otherwise avx512_qsort<T>.
+void inline avx512_fastsort(T *arr, int64_t from_index, int64_t to_index, const int32_t INS_SORT_THRESHOLD) {
+    int32_t size = to_index - from_index; // NOTE(review): the int64_t difference is narrowed to int32_t; presumably callers keep ranges within int32 - confirm
+
+    if (size <= INS_SORT_THRESHOLD) {
+        insertion_sort<T>(arr, from_index, to_index); // indices are narrowed to insertion_sort's int32_t parameters
+    }
+    else {
+        avx512_qsort<T>(arr, from_index, to_index);
+    }
+}
+
 
 
 #endif  // AVX512_QSORT_COMMON
diff --git a/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp b/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp
new file mode 100644
index 0000000000000..a18acda571ce1
--- /dev/null
+++ b/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2023 Intel Corporation. All rights reserved.
+ * Intel x86-simd-sort source code.
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#pragma GCC target("avx512dq", "avx512f")
+#include "avx512-32bit-qsort.hpp"
+#include "avx512-64bit-qsort.hpp"
+
+#define DLL_PUBLIC __attribute__((visibility("default"))) // gives the annotated function default symbol visibility
+#define INSERTION_SORT_THRESHOLD_32BIT 16 // passed to avx512_fastsort for int32_t and float; ranges of at most this many elements use insertion_sort
+#define INSERTION_SORT_THRESHOLD_64BIT 20 // passed to avx512_fastsort for int64_t and double; ranges of at most this many elements use insertion_sort
+
+extern "C" { // C linkage for the entry points below; each one only forwards to a template instantiation
+    // Sort entry points: avx512_fastsort<T> with the insertion-sort threshold for the element width.
+    DLL_PUBLIC void avx512_sort_int(int32_t *array, int64_t from_index, int64_t to_index) {
+        avx512_fastsort<int32_t>(array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
+    }
+
+    DLL_PUBLIC void avx512_sort_long(int64_t *array, int64_t from_index, int64_t to_index) {
+        avx512_fastsort<int64_t>(array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT);
+    }
+
+    DLL_PUBLIC void avx512_sort_float(float *array, int64_t from_index, int64_t to_index) {
+        avx512_fastsort<float>(array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
+    }
+
+    DLL_PUBLIC void avx512_sort_double(double *array, int64_t from_index, int64_t to_index) {
+        avx512_fastsort<double>(array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT);
+    }
+    // Partition entry points: forward to avx512_single_pivot_partition<T> / avx512_dual_pivot_partition<T>; the int32_t pivot indices convert to the templates' int64_t parameters.
+    DLL_PUBLIC void avx512_partition_single_int(int32_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot) {
+        avx512_single_pivot_partition<int32_t>(array, from_index, to_index, pivot_indices, index_pivot);
+    }
+
+    DLL_PUBLIC void avx512_partition_dual_int(int32_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) {
+        avx512_dual_pivot_partition<int32_t>(array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
+    }
+
+    DLL_PUBLIC void avx512_partition_single_long(int64_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot) {
+        avx512_single_pivot_partition<int64_t>(array, from_index, to_index, pivot_indices, index_pivot);
+    }
+
+    DLL_PUBLIC void avx512_partition_dual_long(int64_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) {
+        avx512_dual_pivot_partition<int64_t>(array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
+    }
+
+    DLL_PUBLIC void avx512_partition_single_float(float *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot) {
+        avx512_single_pivot_partition<float>(array, from_index, to_index, pivot_indices, index_pivot);
+    }
+
+    DLL_PUBLIC void avx512_partition_dual_float(float *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) {
+        avx512_dual_pivot_partition<float>(array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
+    }
+
+    DLL_PUBLIC void avx512_partition_single_double(double *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot) {
+        avx512_single_pivot_partition<double>(array, from_index, to_index, pivot_indices, index_pivot);
+    }
+
+    DLL_PUBLIC void avx512_partition_dual_double(double *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) {
+        avx512_dual_pivot_partition<double>(array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
+    }
+
+}
diff --git a/src/java.base/linux/native/libx86_64_sort/avxsort_linux_x86.cpp b/src/java.base/linux/native/libx86_64_sort/avxsort_linux_x86.cpp
deleted file mode 100644
index aeea98006ce48..0000000000000
--- a/src/java.base/linux/native/libx86_64_sort/avxsort_linux_x86.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2023 Intel Corporation. All rights reserved.
- * Intel x86-simd-sort source code.
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- *
- */
-
-#pragma GCC target("avx512dq", "avx512f")
-#include "avx512-32bit-qsort.hpp"
-#include "avx512-64bit-qsort.hpp"
-
-#define DLL_PUBLIC __attribute__((visibility("default")))
-
-extern "C" {
-
-    DLL_PUBLIC void avx512_sort_int(int32_t *array, int64_t from_index, int64_t to_index) {
-        avx512_qsort<int32_t>(array, from_index, to_index);
-    }
-
-    DLL_PUBLIC void avx512_sort_long(int64_t *array, int64_t from_index, int64_t to_index) {
-        avx512_qsort<int64_t>(array, from_index, to_index);
-    }
-
-    DLL_PUBLIC void avx512_sort_float(float *array, int64_t from_index, int64_t to_index) {
-        avx512_qsort<float>(array, from_index, to_index);
-    }
-
-    DLL_PUBLIC void avx512_sort_double(double *array, int64_t from_index, int64_t to_index) {
-        avx512_qsort<double>(array, from_index, to_index);
-    }
-
-    DLL_PUBLIC void avx512_partition_int(int32_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pivot) {
-        avx512_partition<int32_t>(array, from_index, to_index, pivot_indices, is_dual_pivot);
-    }
-
-    DLL_PUBLIC void avx512_partition_long(int64_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pivot) {
-        avx512_partition<int64_t>(array, from_index, to_index, pivot_indices, is_dual_pivot);
-    }
-
-    DLL_PUBLIC void avx512_partition_float(float *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pivot) {
-        avx512_partition<float>(array, from_index, to_index, pivot_indices, is_dual_pivot);
-    }
-
-    DLL_PUBLIC void avx512_partition_double(double *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pivot) {
-        avx512_partition<double>(array, from_index, to_index, pivot_indices, is_dual_pivot);
-    }
-
-}
diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
index b3959b1048e75..f2cf2deab6638 100644
--- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java
+++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
@@ -126,15 +126,9 @@ private DualPivotQuicksort() {}
     private static final int MAX_RECURSION_DEPTH = 64 * DELTA;
 
     /**
-     * Min array size to call fast small array sort.
-     */
-    private static final int MIN_FAST_SMALL_ARRAY_SORT_SIZE = 16;
-
-    /**
-     * Sorts the specified array into ascending numerical order.
-     * While the intrinsic is free to choose its own sorting algorithm, the
-     * fallback implementation uses either mixed insertion sort or simple
-     * insertion sort.
+     * Sorts the specified array into ascending numerical order using
+     * mixed insertion sort. The intrinsic is free to choose its own
+     * sorting algorithm.
      *
      * @param elemType the class of the elements of the array to be sorted
      * @param array the array to be sorted
@@ -143,66 +137,35 @@ private DualPivotQuicksort() {}
      * address pointing to the first element to sort from.
      * @param low the index of the first element, inclusive, to be sorted
      * @param high the index of the last element, exclusive, to be sorted
-     * @param end the index of the last element for simple insertion sort (in
-     * the case of mixed insertion sort). In the fallback implementation,
-     * if end < 0, we use insertion sort else we use mixed insertion sort.
+     * @param end the index of the last element for simple insertion sort
      */
     @IntrinsicCandidate
-    @ForceInline
-    private static void arraySort(Class<?> elemType, Object array, long offset, int low, int high, int end) {
-       if (end < 0) insertionSort(array, low, high);
-       else mixedInsertionSort(array, low, end, high);
+    private static void mixedInsertionSort(Class<?> elemType, Object array, long offset, int low, int high, int end) { // this Java body does not use elemType or offset
+        switch (array) { // dispatch on the runtime array type
+            case int[] arr -> mixedInsertionSort(arr, low, end, high); // the typed overloads are called with (low, end, high), unlike this method's (low, high, end)
+            case long[] arr ->  mixedInsertionSort(arr, low, end, high);
+            case float[] arr ->  mixedInsertionSort(arr, low, end, high);
+            case double[] arr ->  mixedInsertionSort(arr, low, end, high);
+            default -> throw new UnsupportedOperationException();
+        }
     }
 
     /**
-     * Partitions the specified array based on the pivot(s) provided.
+     * Sorts the specified array into ascending numerical order using
+     * insertion sort. The intrinsic is free to choose its own
+     * sorting algorithm.
      *
-     * @param elemType the class of the array to be sorted
+     * @param elemType the class of the elements of the array to be sorted
      * @param array the array to be sorted
      * @param offset the relative offset, in bytes, from the base address of
-     * the array to partition, otherwise if the array is {@code null},an absolute
-     * address pointing to the first element to partition from.
+     * the array to sort, otherwise if the array is {@code null}, an absolute
+     * address pointing to the first element to sort from.
      * @param low the index of the first element, inclusive, to be sorted
      * @param high the index of the last element, exclusive, to be sorted
-     * @param pivotIndices the array containing the indices of the pivots. After
-     * partitioning, this array is updated with the new indices of the pivots.
-     * @param isDualPivot a boolean value to choose between dual pivot
-     * partitioning and single pivot partitioning
      */
     @IntrinsicCandidate
-    @ForceInline
-    private static void arrayPartition(Class<?> elemType, Object array, long offset, int low, int high, int[] pivotIndices, boolean isDualPivot) {
-        if (isDualPivot) partitionDualPivot(array, low, high, pivotIndices);
-        else partitionSinglePivot(array, low, high, pivotIndices);
-    }
-
-    /**
-     * Calculates the double depth of parallel merging.
-     * Depth is negative, if tasks split before sorting.
-     *
-     * @param parallelism the parallelism level
-     * @param size the target size
-     * @return the depth of parallel merging
-     */
-    private static int getDepth(int parallelism, int size) {
-        int depth = 0;
-
-        while ((parallelism >>= 3) > 0 && (size >>= 2) > 0) {
-            depth -= 2;
-        }
-        return depth;
-    }
-
-    /**
-     * Sorts the specified range of the array using insertion sort
-     *
-     * @param array the array to be sorted
-     * @param low the index of the first element, inclusive, to be sorted
-     * @param high the index of the last element, exclusive, to be sorted
-     *
-     */
-    private static void insertionSort(Object array, int low, int high) {
-        switch (array) {
+    private static void insertionSort(Class<?> elemType, Object array, long offset, int low, int high) {
+       switch (array) {
             case int[] arr -> insertionSort(arr, low, high);
             case long[] arr -> insertionSort(arr, low, high);
             case float[] arr -> insertionSort(arr, low, high);
@@ -212,62 +175,69 @@ private static void insertionSort(Object array, int low, int high) {
     }
 
     /**
-     * Sorts the specified range of the array using mixed insertion sort.
+     * Partitions the specified array based on the two pivots provided.
      *
+     * @param elemType the class of the elements of the array to be partitioned
+     * @param array the array to be partitioned
+     * @param offset the relative offset, in bytes, from the base address of
+     * the array to partition, otherwise if the array is {@code null}, an absolute
+     * address pointing to the first element to partition from.
      * @param low the index of the first element, inclusive, to be sorted
      * @param high the index of the last element, exclusive, to be sorted
-     * @param end the index of the last element for simple insertion sort
-     *
+     * @param indexPivot1 the index of pivot1, the first pivot
+     * @param indexPivot2 the index of pivot2, the second pivot
      */
-    private static void mixedInsertionSort(Object array, int low, int end, int high) {
-        switch (array) {
-            case int[] arr -> mixedInsertionSort(arr, low, end, high);
-            case long[] arr ->  mixedInsertionSort(arr, low, end, high);
-            case float[] arr ->  mixedInsertionSort(arr, low, end, high);
-            case double[] arr ->  mixedInsertionSort(arr, low, end, high);
+    @IntrinsicCandidate
+    @ForceInline
+    private static int[] partitionDualPivot(Class<?> elemType, Object array, long offset, int low, int high, int indexPivot1, int indexPivot2) { // this Java body does not use elemType or offset
+        return switch(array) { // dispatch on the runtime array type; sort() reads the result as {lower, upper}
+            case int[] arr -> partitionDualPivot(arr, low, high, indexPivot1, indexPivot2);
+            case long[] arr -> partitionDualPivot(arr, low, high, indexPivot1, indexPivot2);
+            case float[] arr -> partitionDualPivot(arr, low, high, indexPivot1, indexPivot2);
+            case double[] arr -> partitionDualPivot(arr, low, high, indexPivot1, indexPivot2);
             default -> throw new UnsupportedOperationException();
-        }
+        };
     }
 
     /**
-     * Partitions the specified range of the array using the two pivots specified.
-     *
-     * @param array the array to be partitioned
-     * @param low the index of the first element, inclusive, for partitioning
-     * @param high the index of the last element, exclusive, for partitioning
-     * @param pivotIndices an array containing the indices of the two pivots to be used.
-     * After partitioning, the indices of the pivots is updated as well.
+     * Partitions the specified array based on the single pivot provided.
      *
+     * @param elemType the class of the elements of the array to be partitioned
+     * @param array the array to be partitioned
+     * @param offset the relative offset, in bytes, from the base address of
+     * the array to partition, otherwise if the array is {@code null}, an absolute
+     * address pointing to the first element to partition from.
+     * @param low the index of the first element, inclusive, to be sorted
+     * @param high the index of the last element, exclusive, to be sorted
+     * @param indexPivot the index of the pivot
      */
-    private static void partitionDualPivot(Object array, int low, int high, int[] pivotIndices) {
-        switch(array) {
-            case int[] arr -> partitionDualPivot(arr, low, high, pivotIndices);
-            case long[] arr -> partitionDualPivot(arr, low, high, pivotIndices);
-            case float[] arr -> partitionDualPivot(arr, low, high, pivotIndices);
-            case double[] arr -> partitionDualPivot(arr, low, high, pivotIndices);
+    @IntrinsicCandidate
+    @ForceInline
+    private static int[] partitionSinglePivot(Class<?> elemType, Object array, long offset, int low, int high, int indexPivot) { // this Java body does not use elemType or offset
+        return switch(array) { // dispatch on the runtime array type; sort() reads the result as {lower, upper}
+            case int[] arr -> partitionSinglePivot(arr, low, high, indexPivot);
+            case long[] arr -> partitionSinglePivot(arr, low, high, indexPivot);
+            case float[] arr -> partitionSinglePivot(arr, low, high, indexPivot);
+            case double[] arr -> partitionSinglePivot(arr, low, high, indexPivot);
             default -> throw new UnsupportedOperationException();
-        }
+        };
     }
 
     /**
-     * Partitions the specified range of the array using a single pivot specified.
-     *
-     * @param array the array to be partitioned
-     * @param low the index of the first element, inclusive, for partitioning
-     * @param high the index of the last element, exclusive, for partitioning
-     * @param pivotIndices an array containing the indices of the pivot to be used.
-     * After partitioning, the indices of the pivots is updated as well.
+     * Calculates the double depth of parallel merging.
+     * Depth is negative, if tasks split before sorting.
      *
+     * @param parallelism the parallelism level
+     * @param size the target size
+     * @return the depth of parallel merging
      */
-    private static void partitionSinglePivot(Object array, int low, int high, int[] pivotIndices) {
-        switch(array) {
-            case int[] arr -> partitionSinglePivot(arr, low, high, pivotIndices);
-            case long[] arr -> partitionSinglePivot(arr, low, high, pivotIndices);
-            case float[] arr -> partitionSinglePivot(arr, low, high, pivotIndices);
-            case double[] arr -> partitionSinglePivot(arr, low, high, pivotIndices);
-            default -> throw new UnsupportedOperationException();
+    private static int getDepth(int parallelism, int size) {
+        int depth = 0;
+
+        while ((parallelism >>= 3) > 0 && (size >>= 2) > 0) { // each round shifts parallelism right by 3 bits and size right by 2 bits
+            depth -= 2; // two levels per round, so the result is zero or negative
         }
+        return depth;
     }
 
     /**
@@ -309,18 +279,14 @@ static void sort(int[] a, int parallelism, int low, int high) {
      * @param high the index of the last element, exclusive, to be sorted
      */
     static void sort(Sorter sorter, int[] a, int bits, int low, int high) {
+        int[] pivotIndices;
         while (true) {
             int end = high - 1, size = high - low;
-            int[] pivotIndices;
-            int baseOffset = Unsafe.ARRAY_INT_BASE_OFFSET;
-
             /*
              * Run mixed insertion sort on small non-leftmost parts.
              */
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
-                int last  = high - 3 * ((size >> 5) << 3);
-                if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high);
-                else arraySort(int.class, a, baseOffset, low, high, last);
+                mixedInsertionSort(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, high - 3 * ((size >> 5) << 3));
                 return;
             }
 
@@ -328,8 +294,7 @@ static void sort(Sorter sorter, int[] a, int bits, int low, int high) {
              * Invoke insertion sort on small leftmost part.
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
-                if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high);
-                else arraySort(int.class, a, baseOffset, low, high, -1);
+                insertionSort(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high);
                 return;
             }
 
@@ -409,15 +374,13 @@ && tryMergeRuns(sorter, a, low, size)) {
             /*
              * Partitioning with 2 pivots in case of different elements.
              */
-            boolean isDualPivot = (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]);
-            if (isDualPivot) {
+            if (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) {
                 /*
                  * Use the first and fifth of the five sorted elements as
                  * the pivots. These values are inexpensive approximation
                  * of tertiles. Note, that pivot1 < pivot2.
                  */
-                pivotIndices = new int[] {e1, e5};
-                arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, isDualPivot);
+                pivotIndices = partitionDualPivot(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, e1, e5);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
 
@@ -441,8 +404,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * Use the third of the five sorted elements as the pivot.
                  * This value is inexpensive approximation of the median.
                  */
-                pivotIndices = new int[] {e3, e3};
-                arrayPartition(int.class, a, baseOffset, low, high, pivotIndices, isDualPivot);
+                pivotIndices = partitionSinglePivot(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, e3);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -461,22 +423,23 @@ && tryMergeRuns(sorter, a, low, size)) {
     }
 
     /**
-     * Partitions the specified range of the array using the two pivots specified.
+     * Partitions the specified range of the array using the two pivots provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
      * @param high the index of the last element, exclusive, for partitioning
-     * @param pivotIndices an array containing the indices of the two pivots to be used.
-     * After partitioning, this array the indices of the pivots is updated as well.
+     * @param indexPivot1 the index of pivot1, the first pivot
+     * @param indexPivot2 the index of pivot2, the second pivot
      *
      */
-    private static void partitionDualPivot(int[] a, int low, int high, int[] pivotIndices) {
+    @ForceInline
+    private static int[] partitionDualPivot(int[] a, int low, int high, int indexPivot1, int indexPivot2) {
         int end = high - 1;
         int lower = low;
         int upper = end;
 
-        int e1 = pivotIndices[0];
-        int e5 = pivotIndices[1];
+        int e1 = indexPivot1;
+        int e5 = indexPivot2;
         int pivot1 = a[e1];
         int pivot2 = a[e5];
 
@@ -543,29 +506,26 @@ private static void partitionDualPivot(int[] a, int low, int high, int[] pivotIn
         a[low] = a[lower]; a[lower] = pivot1;
         a[end] = a[upper]; a[upper] = pivot2;
 
-        pivotIndices[0] = lower;
-        pivotIndices[1] = upper;
+        return new int[] {lower, upper};
     }
 
 
 
     /**
-     * Partitions the specified range of the array using a single pivot specified.
+     * Partitions the specified range of the array using a single pivot provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
      * @param high the index of the last element, exclusive, for partitioning
-     * @param pivotIndices an array containing the indices of the pivot to be used.
-     * After partitioning, this array the indices of the pivots is updated as well.
+     * @param indexPivot the index of the pivot
      *
      */
-    private static void partitionSinglePivot(int[] a, int low, int high, int[] pivotIndices) {
+    @ForceInline
+    private static int[] partitionSinglePivot(int[] a, int low, int high, int indexPivot) {
         int end = high - 1;
         int lower = low;
         int upper = end;
-
-
-        int e3 = pivotIndices[0];
+        int e3 = indexPivot;
         int pivot = a[e3];
 
         /*
@@ -619,8 +579,7 @@ private static void partitionSinglePivot(int[] a, int low, int high, int[] pivot
         * Swap the pivot into its final position.
         */
         a[low] = a[lower]; a[lower] = pivot;
-        pivotIndices[0] = lower;
-        pivotIndices[1] = upper;
+        return new int[] {lower, upper};
     }
 
     /**
@@ -1122,18 +1081,16 @@ static void sort(long[] a, int parallelism, int low, int high) {
      * @param high the index of the last element, exclusive, to be sorted
      */
     static void sort(Sorter sorter, long[] a, int bits, int low, int high) {
+        int[] pivotIndices;
         while (true) {
             int end = high - 1, size = high - low;
-            int[] pivotIndices;
-            int baseOffset = Unsafe.ARRAY_LONG_BASE_OFFSET;
 
             /*
              * Run mixed insertion sort on small non-leftmost parts.
              */
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
                 int last  = high - 3 * ((size >> 5) << 3);
-                if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high);
-                else arraySort(long.class, a, baseOffset, low, high, last);
+                mixedInsertionSort(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, high - 3 * ((size >> 5) << 3));
                 return;
             }
 
@@ -1141,8 +1098,7 @@ static void sort(Sorter sorter, long[] a, int bits, int low, int high) {
              * Invoke insertion sort on small leftmost part.
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
-                if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high);
-                else arraySort(long.class, a, baseOffset, low, high, -1);
+                insertionSort(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high);
                 return;
             }
 
@@ -1222,16 +1178,14 @@ && tryMergeRuns(sorter, a, low, size)) {
             /*
              * Partitioning with 2 pivots in case of different elements.
              */
-            boolean isDualPivot = (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]);
-            if(isDualPivot)  {
+            if(a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) {
 
                 /*
                  * Use the first and fifth of the five sorted elements as
                  * the pivots. These values are inexpensive approximation
                  * of tertiles. Note, that pivot1 < pivot2.
                  */
-                pivotIndices = new int[] {e1, e5};
-                arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, isDualPivot);
+                pivotIndices = partitionDualPivot(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, e1, e5);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -1252,8 +1206,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * Use the third of the five sorted elements as the pivot.
                  * This value is inexpensive approximation of the median.
                  */
-                pivotIndices = new int[] {e3, e3};
-                arrayPartition(long.class, a, baseOffset, low, high, pivotIndices, isDualPivot);
+                pivotIndices = partitionSinglePivot(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, e3);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -1272,22 +1225,23 @@ && tryMergeRuns(sorter, a, low, size)) {
     }
 
     /**
-     * Partitions the specified range of the array using the two pivots specified.
+     * Partitions the specified range of the array using the two pivots provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
      * @param high the index of the last element, exclusive, for partitioning
-     * @param pivotIndices an array containing the indices of the two pivots to be used.
-     * After partitioning, this array the indices of the pivots is updated as well.
+     * @param indexPivot1 the index of pivot1, the first pivot
+     * @param indexPivot2 the index of pivot2, the second pivot
      *
      */
-    private static void partitionDualPivot(long[] a, int low, int high, int[] pivotIndices) {
+    @ForceInline
+    private static int[] partitionDualPivot(long[] a, int low, int high, int indexPivot1, int indexPivot2) {
         int end = high - 1;
         int lower = low;
         int upper = end;
 
-        int e1 = pivotIndices[0];
-        int e5 = pivotIndices[1];
+        int e1 = indexPivot1;
+        int e5 = indexPivot2;
         long pivot1 = a[e1];
         long pivot2 = a[e5];
 
@@ -1354,27 +1308,26 @@ private static void partitionDualPivot(long[] a, int low, int high, int[] pivotI
         a[low] = a[lower]; a[lower] = pivot1;
         a[end] = a[upper]; a[upper] = pivot2;
 
-        pivotIndices[0] = lower;
-        pivotIndices[1] = upper;
+        return new int[] {lower, upper};
     }
 
 
     /**
-     * Partitions the specified range of the array using a single pivot specified.
+     * Partitions the specified range of the array using a single pivot provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
      * @param high the index of the last element, exclusive, for partitioning
-     * @param pivotIndices an array containing the indices of the pivot to be used.
-     * After partitioning, this array the indices of the pivots is updated as well.
+     * @param indexPivot the index of the pivot
      *
      */
-    private static void partitionSinglePivot(long[] a, int low, int high, int[] pivotIndices) {
+    @ForceInline
+    private static int[] partitionSinglePivot(long[] a, int low, int high, int indexPivot) {
         int end = high - 1;
         int lower = low;
         int upper = end;
 
-        int e3 = pivotIndices[0];
+        int e3 = indexPivot;
         long pivot = a[e3];
 
         /*
@@ -1428,8 +1381,7 @@ private static void partitionSinglePivot(long[] a, int low, int high, int[] pivo
             * Swap the pivot into its final position.
             */
         a[low] = a[lower]; a[lower] = pivot;
-        pivotIndices[0] = lower;
-        pivotIndices[1] = upper;
+        return new int[] {lower, upper};
     }
 
     /**
@@ -2718,18 +2670,16 @@ static void sort(float[] a, int parallelism, int low, int high) {
      * @param high the index of the last element, exclusive, to be sorted
      */
     static void sort(Sorter sorter, float[] a, int bits, int low, int high) {
+        int[] pivotIndices;
         while (true) {
             int end = high - 1, size = high - low;
-            int[] pivotIndices;
-            int baseOffset = Unsafe.ARRAY_FLOAT_BASE_OFFSET;
 
             /*
              * Run mixed insertion sort on small non-leftmost parts.
              */
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
                 int last  = high - 3 * ((size >> 5) << 3);
-                if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high);
-                else arraySort(float.class, a, baseOffset, low, high, last);
+                mixedInsertionSort(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, high - 3 * ((size >> 5) << 3));
                 return;
             }
 
@@ -2737,8 +2687,7 @@ static void sort(Sorter sorter, float[] a, int bits, int low, int high) {
              * Invoke insertion sort on small leftmost part.
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
-                if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high);
-                else arraySort(float.class, a, baseOffset, low, high, -1);
+                insertionSort(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high);
                 return;
             }
 
@@ -2818,16 +2767,14 @@ && tryMergeRuns(sorter, a, low, size)) {
             /*
              * Partitioning with 2 pivots in case of different elements.
              */
-            boolean isDualPivot = (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]);
-            if(isDualPivot)  {
+            if(a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) {
 
                 /*
                  * Use the first and fifth of the five sorted elements as
                  * the pivots. These values are inexpensive approximation
                  * of tertiles. Note, that pivot1 < pivot2.
                  */
-                pivotIndices = new int[] {e1, e5};
-                arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, isDualPivot);
+                pivotIndices = partitionDualPivot(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, e1, e5);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -2848,8 +2795,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * Use the third of the five sorted elements as the pivot.
                  * This value is inexpensive approximation of the median.
                  */
-                pivotIndices = new int[] {e3, e3};
-                arrayPartition(float.class, a, baseOffset, low, high, pivotIndices, isDualPivot);
+                pivotIndices = partitionSinglePivot(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, e3);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -2868,22 +2814,23 @@ && tryMergeRuns(sorter, a, low, size)) {
     }
 
     /**
-     * Partitions the specified range of the array using the two pivots specified.
+     * Partitions the specified range of the array using the two pivots provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
      * @param high the index of the last element, exclusive, for partitioning
-     * @param pivotIndices an array containing the indices of the two pivots to be used.
-     * After partitioning, this array the indices of the pivots is updated as well.
+     * @param indexPivot1 the index of pivot1, the first pivot
+     * @param indexPivot2 the index of pivot2, the second pivot
      *
      */
-    private static void partitionDualPivot(float[] a, int low, int high, int[] pivotIndices) {
+    @ForceInline
+    private static int[] partitionDualPivot(float[] a, int low, int high, int indexPivot1, int indexPivot2) {
         int end = high - 1;
         int lower = low;
         int upper = end;
 
-        int e1 = pivotIndices[0];
-        int e5 = pivotIndices[1];
+        int e1 = indexPivot1;
+        int e5 = indexPivot2;
         float pivot1 = a[e1];
         float pivot2 = a[e5];
 
@@ -2950,27 +2897,26 @@ private static void partitionDualPivot(float[] a, int low, int high, int[] pivot
         a[low] = a[lower]; a[lower] = pivot1;
         a[end] = a[upper]; a[upper] = pivot2;
 
-        pivotIndices[0] = lower;
-        pivotIndices[1] = upper;
+        return new int[] {lower, upper};
     }
 
 
     /**
-     * Partitions the specified range of the array using a single pivot specified.
+     * Partitions the specified range of the array using a single pivot provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
      * @param high the index of the last element, exclusive, for partitioning
-     * @param pivotIndices an array containing the indices of the pivot to be used.
-     * After partitioning, this array the indices of the pivots is updated as well.
+     * @param indexPivot the index of the pivot
      *
      */
-    private static void partitionSinglePivot(float[] a, int low, int high, int[] pivotIndices) {
+    @ForceInline
+    private static int[] partitionSinglePivot(float[] a, int low, int high, int indexPivot) {
         int end = high - 1;
         int lower = low;
         int upper = end;
 
-        int e3 = pivotIndices[0];
+        int e3 = indexPivot;
         float pivot = a[e3];
 
         /*
@@ -3024,8 +2970,7 @@ private static void partitionSinglePivot(float[] a, int low, int high, int[] piv
             * Swap the pivot into its final position.
             */
         a[low] = a[lower]; a[lower] = pivot;
-        pivotIndices[0] = lower;
-        pivotIndices[1] = upper;
+        return new int[] {lower, upper};
     }
 
     /**
@@ -3579,18 +3524,15 @@ static void sort(double[] a, int parallelism, int low, int high) {
      * @param high the index of the last element, exclusive, to be sorted
      */
     static void sort(Sorter sorter, double[] a, int bits, int low, int high) {
+        int[] pivotIndices;
         while (true) {
             int end = high - 1, size = high - low;
-            int[] pivotIndices;
-            int baseOffset = Unsafe.ARRAY_DOUBLE_BASE_OFFSET;
-
             /*
              * Run mixed insertion sort on small non-leftmost parts.
              */
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
                 int last  = high - 3 * ((size >> 5) << 3);
-                if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) mixedInsertionSort(a, low, last , high);
-                else arraySort(double.class, a, baseOffset, low, high, last);
+                mixedInsertionSort(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, high - 3 * ((size >> 5) << 3));
                 return;
             }
 
@@ -3598,8 +3540,7 @@ static void sort(Sorter sorter, double[] a, int bits, int low, int high) {
              * Invoke insertion sort on small leftmost part.
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
-                if (size < MIN_FAST_SMALL_ARRAY_SORT_SIZE) insertionSort(a, low, high);
-                else arraySort(double.class, a, baseOffset, low, high, -1);
+                insertionSort(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high);
                 return;
             }
 
@@ -3679,16 +3620,14 @@ && tryMergeRuns(sorter, a, low, size)) {
             /*
              * Partitioning with 2 pivots in case of different elements.
              */
-            boolean isDualPivot = (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]);
-            if(isDualPivot)  {
+            if(a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) {
 
                 /*
                 * Use the first and fifth of the five sorted elements as
                 * the pivots. These values are inexpensive approximation
                 * of tertiles. Note, that pivot1 < pivot2.
                 */
-                pivotIndices = new int[] {e1, e5};
-                arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, isDualPivot);
+                pivotIndices = partitionDualPivot(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, e1, e5);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -3709,8 +3648,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * Use the third of the five sorted elements as the pivot.
                  * This value is inexpensive approximation of the median.
                  */
-                pivotIndices = new int[] {e3, e3};
-                arrayPartition(double.class, a, baseOffset, low, high, pivotIndices, isDualPivot);
+                pivotIndices = partitionSinglePivot(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, e3);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
 
@@ -3730,22 +3668,23 @@ && tryMergeRuns(sorter, a, low, size)) {
     }
 
     /**
-     * Partitions the specified range of the array using the two pivots specified.
+     * Partitions the specified range of the array using the two pivots provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
      * @param high the index of the last element, exclusive, for partitioning
-     * @param pivotIndices an array containing the indices of the two pivots to be used.
-     * After partitioning, this array the indices of the pivots is updated as well.
+     * @param indexPivot1 the index of pivot1, the first pivot
+     * @param indexPivot2 the index of pivot2, the second pivot
      *
      */
-    private static void partitionDualPivot(double[] a, int low, int high, int[] pivotIndices) {
+    @ForceInline
+    private static int[] partitionDualPivot(double[] a, int low, int high, int indexPivot1, int indexPivot2) {
         int end = high - 1;
         int lower = low;
         int upper = end;
 
-        int e1 = pivotIndices[0];
-        int e5 = pivotIndices[1];
+        int e1 = indexPivot1;
+        int e5 = indexPivot2;
         double pivot1 = a[e1];
         double pivot2 = a[e5];
 
@@ -3812,27 +3751,26 @@ private static void partitionDualPivot(double[] a, int low, int high, int[] pivo
         a[low] = a[lower]; a[lower] = pivot1;
         a[end] = a[upper]; a[upper] = pivot2;
 
-        pivotIndices[0] = lower;
-        pivotIndices[1] = upper;
+        return new int[] {lower, upper};
     }
 
 
 
     /**
-     * Partitions the specified range of the array using a single pivot specified.
+     * Partitions the specified range of the array using a single pivot provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
      * @param high the index of the last element, exclusive, for partitioning
-     * @param pivotIndices an array containing the indices of the pivot to be used.
-     * After partitioning, this array the indices of the pivots is updated as well.
+     * @param indexPivot the index of the pivot
      */
-    private static void partitionSinglePivot(double[] a, int low, int high, int[] pivotIndices) {
+    @ForceInline
+    private static int[] partitionSinglePivot(double[] a, int low, int high, int indexPivot) {
         int end = high - 1;
         int lower = low;
         int upper = end;
 
-        int e3 = pivotIndices[0];
+        int e3 = indexPivot;
         double pivot = a[e3];
 
         /*
@@ -3886,8 +3824,7 @@ private static void partitionSinglePivot(double[] a, int low, int high, int[] pi
             * Swap the pivot into its final position.
             */
         a[low] = a[lower]; a[lower] = pivot;
-        pivotIndices[0] = lower;
-        pivotIndices[1] = upper;
+        return new int[] {lower, upper};
     }
 
     /**
diff --git a/test/jdk/java/util/Arrays/Sorting.java b/test/jdk/java/util/Arrays/Sorting.java
index e89496bb2e532..ce5b2ff87e07d 100644
--- a/test/jdk/java/util/Arrays/Sorting.java
+++ b/test/jdk/java/util/Arrays/Sorting.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2009, 2023, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -26,7 +26,8 @@
  * @compile/module=java.base java/util/SortingHelper.java
  * @bug 6880672 6896573 6899694 6976036 7013585 7018258 8003981 8226297
  * @build Sorting
- * @run main Sorting -shortrun
+ * @run main/othervm -XX:+UnlockDiagnosticVMOptions -XX:DisableIntrinsic=_arraySortI,_arraySortMI,_arrayPartitionSP,_arrayPartitionDP Sorting -shortrun
+ * @run main/othervm -XX:CompileThreshold=1 -XX:-TieredCompilation Sorting -shortrun
  * @summary Exercise Arrays.sort, Arrays.parallelSort
  *
  * @author Vladimir Yaroslavskiy
@@ -46,7 +47,7 @@ public class Sorting {
 
     // Array lengths used in a long run (default)
     private static final int[] LONG_RUN_LENGTHS = {
-        1, 3, 8, 21, 55, 100, 1_000, 10_000, 100_000 };
+        1, 3, 8, 21, 55, 100, 1_000, 10_000, 100_000};
 
     // Array lengths used in a short run
     private static final int[] SHORT_RUN_LENGTHS = {

From ed8b95c98379fb08e9aea71aca3f7261896304cd Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Tue, 12 Sep 2023 15:52:54 -0700
Subject: [PATCH 32/40] Refactor stub handling to use a generic function for
 all types

---
 src/hotspot/cpu/x86/stubGenerator_x86_64.cpp  | 38 ++------
 .../gc/shenandoah/c2/shenandoahSupport.cpp    |  2 +-
 src/hotspot/share/jvmci/vmStructs_jvmci.cpp   | 15 +---
 src/hotspot/share/opto/library_call.cpp       | 14 ++-
 src/hotspot/share/opto/runtime.cpp            |  6 +-
 src/hotspot/share/runtime/stubRoutines.cpp    | 39 +-------
 src/hotspot/share/runtime/stubRoutines.hpp    | 19 ++--
 .../native/libsimdsort/avxsort_linux_x86.cpp  | 88 ++++++++++---------
 8 files changed, 78 insertions(+), 143 deletions(-)

diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
index 1ac5f566434e4..ff688ef1913c5 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -4192,41 +4192,15 @@ void StubGenerator::generate_compiler_stubs() {
     if (libsimdsort != nullptr) {
       log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(libsimdsort));
 
-      snprintf(ebuf_, sizeof(ebuf_), "avx512_sort_int");
-      StubRoutines::_arraysort_int = (address)os::dll_lookup(libsimdsort, ebuf_);
+      snprintf(ebuf_, sizeof(ebuf_), "avx512_sort");
+      StubRoutines::_arraysort = (address)os::dll_lookup(libsimdsort, ebuf_);
 
-      snprintf(ebuf_, sizeof(ebuf_), "avx512_sort_long");
-      StubRoutines::_arraysort_long = (address)os::dll_lookup(libsimdsort, ebuf_);
+      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_single");
+      StubRoutines::_array_partition_single = (address)os::dll_lookup(libsimdsort, ebuf_);
 
-      snprintf(ebuf_, sizeof(ebuf_), "avx512_sort_float");
-      StubRoutines::_arraysort_float = (address)os::dll_lookup(libsimdsort, ebuf_);
+      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_dual");
+      StubRoutines::_array_partition_dual = (address)os::dll_lookup(libsimdsort, ebuf_);
 
-      snprintf(ebuf_, sizeof(ebuf_), "avx512_sort_double");
-      StubRoutines::_arraysort_double = (address)os::dll_lookup(libsimdsort, ebuf_);
-
-      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_single_int");
-      StubRoutines::_array_partition_single_int = (address)os::dll_lookup(libsimdsort, ebuf_);
-
-      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_dual_int");
-      StubRoutines::_array_partition_dual_int = (address)os::dll_lookup(libsimdsort, ebuf_);
-
-      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_single_long");
-      StubRoutines::_array_partition_single_long = (address)os::dll_lookup(libsimdsort, ebuf_);
-
-      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_dual_long");
-      StubRoutines::_array_partition_dual_long = (address)os::dll_lookup(libsimdsort, ebuf_);
-
-      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_single_float");
-      StubRoutines::_array_partition_single_float = (address)os::dll_lookup(libsimdsort, ebuf_);
-
-      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_dual_float");
-      StubRoutines::_array_partition_dual_float = (address)os::dll_lookup(libsimdsort, ebuf_);
-
-      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_single_double");
-      StubRoutines::_array_partition_single_double = (address)os::dll_lookup(libsimdsort, ebuf_);
-
-      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_dual_double");
-      StubRoutines::_array_partition_dual_double = (address)os::dll_lookup(libsimdsort, ebuf_);
     }
   }
 
diff --git a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp
index 0384ec1942b3d..8f1e041b5a9b4 100644
--- a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp
+++ b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp
@@ -388,7 +388,7 @@ void ShenandoahBarrierC2Support::verify(RootNode* root) {
         } args[6];
       } calls[] = {
         "array_partition_stub",
-        { { TypeFunc::Parms, ShenandoahStore }, { TypeFunc::Parms+3, ShenandoahStore },   { -1, ShenandoahNone },
+        { { TypeFunc::Parms, ShenandoahStore }, { TypeFunc::Parms+4, ShenandoahStore },   { -1, ShenandoahNone },
           { -1, ShenandoahNone },                { -1, ShenandoahNone },                  { -1, ShenandoahNone } },
         "arraysort_stub",
         { { TypeFunc::Parms, ShenandoahStore },  { -1, ShenandoahNone },                  { -1, ShenandoahNone },
diff --git a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp
index e74afd3a7759d..f687f879b863f 100644
--- a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp
+++ b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp
@@ -327,18 +327,9 @@
   static_field(StubRoutines,                _checkcast_arraycopy_uninit,                      address)                               \
   static_field(StubRoutines,                _unsafe_arraycopy,                                address)                               \
   static_field(StubRoutines,                _generic_arraycopy,                               address)                               \
-  static_field(StubRoutines,                _arraysort_int,                                   address)                               \
-  static_field(StubRoutines,                _arraysort_long,                                  address)                               \
-  static_field(StubRoutines,                _arraysort_float,                                 address)                               \
-  static_field(StubRoutines,                _arraysort_double,                                address)                               \
-  static_field(StubRoutines,                _array_partition_single_int,                      address)                               \
-  static_field(StubRoutines,                _array_partition_dual_int,                        address)                               \
-  static_field(StubRoutines,                _array_partition_single_long,                     address)                               \
-  static_field(StubRoutines,                _array_partition_dual_long,                       address)                               \
-  static_field(StubRoutines,                _array_partition_single_float,                    address)                               \
-  static_field(StubRoutines,                _array_partition_dual_float,                      address)                               \
-  static_field(StubRoutines,                _array_partition_single_double,                   address)                               \
-  static_field(StubRoutines,                _array_partition_dual_double,                     address)                               \
+  static_field(StubRoutines,                _arraysort,                                       address)                               \
+  static_field(StubRoutines,                _array_partition_single,                          address)                               \
+  static_field(StubRoutines,                _array_partition_dual,                            address)                               \
                                                                                                                                      \
   static_field(StubRoutines,                _aescrypt_encryptBlock,                           address)                               \
   static_field(StubRoutines,                _aescrypt_decryptBlock,                           address)                               \
diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp
index 2ab21eb1b5355..34c2f003651da 100644
--- a/src/hotspot/share/opto/library_call.cpp
+++ b/src/hotspot/share/opto/library_call.cpp
@@ -5387,7 +5387,7 @@ bool LibraryCallKit::inline_array_partition(bool is_dual_pivot) {
   const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr();
   ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type();
   BasicType bt = elem_type->basic_type();
-  stubAddr = StubRoutines::select_array_partition_function(bt, is_dual_pivot);
+  stubAddr = StubRoutines::select_array_partition_function(is_dual_pivot);
   // stub not loaded
   if (stubAddr == nullptr) {
     return false;
@@ -5408,10 +5408,13 @@ bool LibraryCallKit::inline_array_partition(bool is_dual_pivot) {
   guarantee(alloc != nullptr, "created above");
   Node* pivotIndices_adr = basic_plus_adr(pivotIndices, arrayOopDesc::base_offset_in_bytes(T_INT));
 
+  // pass the basic type enum to the stub
+  Node* elemType = intcon(bt);
+
   // Call the stub
   make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_partition_Type(is_dual_pivot),
                     stubAddr, stubName, TypePtr::BOTTOM,
-                    obj_adr, fromIndex, toIndex, pivotIndices_adr, indexPivot1, indexPivot2);
+                    obj_adr, elemType, fromIndex, toIndex, pivotIndices_adr, indexPivot1, indexPivot2);
 
   if (!stopped()) {
     set_result(pivotIndices);
@@ -5437,7 +5440,7 @@ bool LibraryCallKit::inline_arraysort() {
   const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr();
   ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type();
   BasicType bt = elem_type->basic_type();
-  stubAddr = StubRoutines::select_arraysort_function(bt);
+  stubAddr = StubRoutines::select_arraysort_function();
   //stub not loaded
   if (stubAddr == nullptr) {
     return false;
@@ -5450,10 +5453,13 @@ bool LibraryCallKit::inline_arraysort() {
   }
   Node* obj_adr = make_unsafe_address(obj, offset);
 
+  // pass the basic type enum to the stub
+  Node* elemType = intcon(bt);
+
   // Call the stub.
   make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_sort_Type(),
                     stubAddr, stubName, TypePtr::BOTTOM,
-                    obj_adr, fromIndex, toIndex);
+                    obj_adr, elemType, fromIndex, toIndex);
 
   return true;
 }
diff --git a/src/hotspot/share/opto/runtime.cpp b/src/hotspot/share/opto/runtime.cpp
index e6d8c956a5e63..7a57b755555f1 100644
--- a/src/hotspot/share/opto/runtime.cpp
+++ b/src/hotspot/share/opto/runtime.cpp
@@ -859,11 +859,12 @@ const TypeFunc* OptoRuntime::array_fill_Type() {
 
 const TypeFunc* OptoRuntime::array_partition_Type(bool is_dual_pivot) {
   // create input type (domain)
-  int num_args = is_dual_pivot ? 6 : 5;
+  int num_args = is_dual_pivot ? 7 : 6;
   int argcnt = num_args;
   const Type** fields = TypeTuple::fields(argcnt);
   int argp = TypeFunc::Parms;
   fields[argp++] = TypePtr::NOTNULL;  // array
+  fields[argp++] = TypeInt::INT;      // element type
   fields[argp++] = TypeInt::INT;      // low
   fields[argp++] = TypeInt::INT;      // end
   fields[argp++] = TypePtr::NOTNULL;  // pivot_indices (int array)
@@ -883,11 +884,12 @@ const TypeFunc* OptoRuntime::array_partition_Type(bool is_dual_pivot) {
 
 const TypeFunc* OptoRuntime::array_sort_Type() {
   // create input type (domain)
-  int num_args      = 3;
+  int num_args      = 4;
   int argcnt = num_args;
   const Type** fields = TypeTuple::fields(argcnt);
   int argp = TypeFunc::Parms;
   fields[argp++] = TypePtr::NOTNULL;    // array
+  fields[argp++] = TypeInt::INT;    // element type
   fields[argp++] = TypeInt::INT;    // fromIndex
   fields[argp++] = TypeInt::INT;    // toIndex
   assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
diff --git a/src/hotspot/share/runtime/stubRoutines.cpp b/src/hotspot/share/runtime/stubRoutines.cpp
index dce5740cecfda..ed26d119f49a6 100644
--- a/src/hotspot/share/runtime/stubRoutines.cpp
+++ b/src/hotspot/share/runtime/stubRoutines.cpp
@@ -176,18 +176,9 @@ address StubRoutines::_hf2f = nullptr;
 address StubRoutines::_vector_f_math[VectorSupport::NUM_VEC_SIZES][VectorSupport::NUM_SVML_OP] = {{nullptr}, {nullptr}};
 address StubRoutines::_vector_d_math[VectorSupport::NUM_VEC_SIZES][VectorSupport::NUM_SVML_OP] = {{nullptr}, {nullptr}};
 
-address StubRoutines::_arraysort_int = nullptr;
-address StubRoutines::_arraysort_long = nullptr;
-address StubRoutines::_arraysort_float = nullptr;
-address StubRoutines::_arraysort_double = nullptr;
-address StubRoutines::_array_partition_single_int  = nullptr;
-address StubRoutines::_array_partition_dual_int  = nullptr;
-address StubRoutines::_array_partition_single_long = nullptr;
-address StubRoutines::_array_partition_dual_long = nullptr;
-address StubRoutines::_array_partition_single_float = nullptr;
-address StubRoutines::_array_partition_dual_float = nullptr;
-address StubRoutines::_array_partition_single_double = nullptr;
-address StubRoutines::_array_partition_dual_double = nullptr;
+address StubRoutines::_arraysort = nullptr;
+address StubRoutines::_array_partition_single  = nullptr;
+address StubRoutines::_array_partition_dual  = nullptr;
 
 address StubRoutines::_cont_thaw          = nullptr;
 address StubRoutines::_cont_returnBarrier = nullptr;
@@ -544,27 +535,3 @@ UnsafeCopyMemoryMark::~UnsafeCopyMemoryMark() {
     }
   }
 }
-
-address StubRoutines::select_arraysort_function(BasicType t) {
-  switch(t) {
-    case T_INT:    return _arraysort_int;
-    case T_LONG:   return _arraysort_long;
-    case T_FLOAT:  return _arraysort_float;
-    case T_DOUBLE: return _arraysort_double;
-  default:
-    ShouldNotReachHere();
-    return nullptr;
-  }
-}
-
-address StubRoutines::select_array_partition_function(BasicType t, bool is_dual_pivot) {
-  switch(t) {
-    case T_INT:    return is_dual_pivot ? _array_partition_dual_int : _array_partition_single_int;
-    case T_LONG:   return is_dual_pivot ? _array_partition_dual_long : _array_partition_single_long;
-    case T_FLOAT:  return is_dual_pivot ? _array_partition_dual_float : _array_partition_single_float;
-    case T_DOUBLE: return is_dual_pivot ? _array_partition_dual_double : _array_partition_single_double;
-  default:
-    ShouldNotReachHere();
-    return nullptr;
-  }
-}
diff --git a/src/hotspot/share/runtime/stubRoutines.hpp b/src/hotspot/share/runtime/stubRoutines.hpp
index ee87450285c2e..4afd471e97de7 100644
--- a/src/hotspot/share/runtime/stubRoutines.hpp
+++ b/src/hotspot/share/runtime/stubRoutines.hpp
@@ -153,18 +153,9 @@ class StubRoutines: AllStatic {
   static BufferBlob* _compiler_stubs_code;                 // code buffer for C2 intrinsics
   static BufferBlob* _final_stubs_code;                    // code buffer for all other routines
 
-  static address _arraysort_int;
-  static address _arraysort_long;
-  static address _arraysort_float;
-  static address _arraysort_double;
-  static address _array_partition_single_int;
-  static address _array_partition_dual_int;
-  static address _array_partition_single_long;
-  static address _array_partition_dual_long;
-  static address _array_partition_single_float;
-  static address _array_partition_dual_float;
-  static address _array_partition_single_double;
-  static address _array_partition_dual_double;
+  static address _arraysort;
+  static address _array_partition_single;
+  static address _array_partition_dual;
   // Leaf routines which implement arraycopy and their addresses
   // arraycopy operands aligned on element type boundary
   static address _jbyte_arraycopy;
@@ -387,8 +378,8 @@ class StubRoutines: AllStatic {
   static UnsafeArrayCopyStub UnsafeArrayCopy_stub()         { return CAST_TO_FN_PTR(UnsafeArrayCopyStub,  _unsafe_arraycopy); }
 
   static address generic_arraycopy()   { return _generic_arraycopy; }
-  static address select_arraysort_function(BasicType t);
-  static address select_array_partition_function(BasicType t, bool is_dual_pivot);
+  static address select_arraysort_function() { return _arraysort; }
+  static address select_array_partition_function(bool is_dual_pivot) { return is_dual_pivot ? _array_partition_dual : _array_partition_single; }
 
   static address jbyte_fill()          { return _jbyte_fill; }
   static address jshort_fill()         { return _jshort_fill; }
diff --git a/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp b/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp
index a18acda571ce1..555be741a4f2b 100644
--- a/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp
+++ b/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp
@@ -27,6 +27,7 @@
 #pragma GCC target("avx512dq", "avx512f")
 #include "avx512-32bit-qsort.hpp"
 #include "avx512-64bit-qsort.hpp"
+#include "classfile_constants.h"
 
 #define DLL_PUBLIC __attribute__((visibility("default")))
 #define INSERTION_SORT_THRESHOLD_32BIT 16
@@ -34,52 +35,55 @@
 
 extern "C" {
 
-    DLL_PUBLIC void avx512_sort_int(int32_t *array, int64_t from_index, int64_t to_index) {
-        avx512_fastsort<int32_t>(array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
+    DLL_PUBLIC void avx512_sort(void *array, int elem_type, int32_t from_index, int32_t to_index) {
+        switch(elem_type) {
+            case JVM_T_INT:
+                avx512_fastsort<int32_t>((int32_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
+                break;
+            case JVM_T_LONG:
+                avx512_fastsort<int64_t>((int64_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT);
+                break;
+            case JVM_T_FLOAT:
+                avx512_fastsort<float>((float*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
+                break;
+            case JVM_T_DOUBLE:
+                avx512_fastsort<double>((double*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT);
+                break;
+        }
     }
 
-    DLL_PUBLIC void avx512_sort_long(int64_t *array, int64_t from_index, int64_t to_index) {
-        avx512_fastsort<int64_t>(array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT);
+    DLL_PUBLIC void avx512_partition_single(void *array, int elem_type, int32_t from_index, int32_t to_index, int32_t *pivot_indices, int32_t index_pivot) {
+        switch(elem_type) {
+            case JVM_T_INT:
+                avx512_single_pivot_partition<int32_t>((int32_t*)array, from_index, to_index, pivot_indices, index_pivot);
+                break;
+            case JVM_T_LONG:
+                avx512_single_pivot_partition<int64_t>((int64_t*)array, from_index, to_index, pivot_indices, index_pivot);
+                break;
+            case JVM_T_FLOAT:
+                 avx512_single_pivot_partition<float>((float*)array, from_index, to_index, pivot_indices, index_pivot);
+                break;
+            case JVM_T_DOUBLE:
+                avx512_single_pivot_partition<double>((double*)array, from_index, to_index, pivot_indices, index_pivot);
+                break;
+        }
     }
 
-    DLL_PUBLIC void avx512_sort_float(float *array, int64_t from_index, int64_t to_index) {
-        avx512_fastsort<float>(array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
-    }
-
-    DLL_PUBLIC void avx512_sort_double(double *array, int64_t from_index, int64_t to_index) {
-        avx512_fastsort<double>(array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT);
-    }
-
-    DLL_PUBLIC void avx512_partition_single_int(int32_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot) {
-        avx512_single_pivot_partition<int32_t>(array, from_index, to_index, pivot_indices, index_pivot);
-    }
-
-    DLL_PUBLIC void avx512_partition_dual_int(int32_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) {
-        avx512_dual_pivot_partition<int32_t>(array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
-    }
-
-    DLL_PUBLIC void avx512_partition_single_long(int64_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot) {
-        avx512_single_pivot_partition<int64_t>(array, from_index, to_index, pivot_indices, index_pivot);
-    }
-
-    DLL_PUBLIC void avx512_partition_dual_long(int64_t *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) {
-        avx512_dual_pivot_partition<int64_t>(array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
-    }
-
-    DLL_PUBLIC void avx512_partition_single_float(float *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot) {
-        avx512_single_pivot_partition<float>(array, from_index, to_index, pivot_indices, index_pivot);
-    }
-
-    DLL_PUBLIC void avx512_partition_dual_float(float *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) {
-        avx512_dual_pivot_partition<float>(array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
-    }
-
-    DLL_PUBLIC void avx512_partition_single_double(double *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot) {
-        avx512_single_pivot_partition<double>(array, from_index, to_index, pivot_indices, index_pivot);
-    }
-
-    DLL_PUBLIC void avx512_partition_dual_double(double *array, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) {
-        avx512_dual_pivot_partition<double>(array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
+    DLL_PUBLIC void avx512_partition_dual(void *array, int elem_type, int32_t from_index, int32_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) {
+        switch(elem_type) {
+            case JVM_T_INT:
+                avx512_dual_pivot_partition<int32_t>((int32_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
+                break;
+            case JVM_T_LONG:
+                avx512_dual_pivot_partition<int64_t>((int64_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
+                break;
+            case JVM_T_FLOAT:
+                avx512_dual_pivot_partition<float>((float*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
+                break;
+            case JVM_T_DOUBLE:
+                avx512_dual_pivot_partition<double>((double*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
+                break;
+        }
     }
 
 }

From 172b2d3e91b3689cddc4bc92597d610d72645a17 Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Wed, 13 Sep 2023 15:54:18 -0700
Subject: [PATCH 33/40] Refactor the sort and partition intrinsics to accept
 method references for fallback functions

---
 src/hotspot/cpu/x86/stubGenerator_x86_64.cpp  |  11 +-
 src/hotspot/share/classfile/vmIntrinsics.hpp  |  19 +-
 src/hotspot/share/jvmci/vmStructs_jvmci.cpp   |   5 +-
 src/hotspot/share/opto/c2compiler.cpp         |   6 +-
 src/hotspot/share/opto/library_call.cpp       |  19 +-
 src/hotspot/share/opto/library_call.hpp       |   4 +-
 src/hotspot/share/opto/runtime.cpp            |   8 +-
 src/hotspot/share/opto/runtime.hpp            |   2 +-
 src/hotspot/share/runtime/stubRoutines.cpp    |   5 +-
 src/hotspot/share/runtime/stubRoutines.hpp    |   9 +-
 .../native/libsimdsort/avx512-common-qsort.h  |  12 +-
 .../native/libsimdsort/avxsort_linux_x86.cpp  |  35 +---
 .../classes/java/util/DualPivotQuicksort.java | 196 +++++++++---------
 test/jdk/java/util/Arrays/Sorting.java        |   2 +-
 14 files changed, 145 insertions(+), 188 deletions(-)

diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
index ff688ef1913c5..bcbb8764982cb 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -4190,16 +4190,13 @@ void StubGenerator::generate_compiler_stubs() {
       }
     // Get addresses for avx512 sort and partition routines
     if (libsimdsort != nullptr) {
-      log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "x86_64" JNI_LIB_SUFFIX, p2i(libsimdsort));
+      log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "simdsort" JNI_LIB_SUFFIX, p2i(libsimdsort));
 
       snprintf(ebuf_, sizeof(ebuf_), "avx512_sort");
-      StubRoutines::_arraysort = (address)os::dll_lookup(libsimdsort, ebuf_);
+      StubRoutines::_array_sort = (address)os::dll_lookup(libsimdsort, ebuf_);
 
-      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_single");
-      StubRoutines::_array_partition_single = (address)os::dll_lookup(libsimdsort, ebuf_);
-
-      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition_dual");
-      StubRoutines::_array_partition_dual = (address)os::dll_lookup(libsimdsort, ebuf_);
+      snprintf(ebuf_, sizeof(ebuf_), "avx512_partition");
+      StubRoutines::_array_partition = (address)os::dll_lookup(libsimdsort, ebuf_);
 
     }
   }
diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp
index bba728b694f8e..fcd7e6a89b7a0 100644
--- a/src/hotspot/share/classfile/vmIntrinsics.hpp
+++ b/src/hotspot/share/classfile/vmIntrinsics.hpp
@@ -341,20 +341,13 @@ class methodHandle;
    do_name(     copyOf_name,                                     "copyOf")                                              \
    do_signature(copyOf_signature,             "([Ljava/lang/Object;ILjava/lang/Class;)[Ljava/lang/Object;")             \
                                                                                                                         \
-  do_intrinsic(_arraySortMI, java_util_DualPivotQuicksort, arraySortMI_name, arraySortMI_signature, F_S)                \
-   do_name(     arraySortMI_name,                                  "mixedInsertionSort")                                \
-   do_signature(arraySortMI_signature,          "(Ljava/lang/Class;Ljava/lang/Object;JIII)V")                            \
-  do_intrinsic(_arraySortI, java_util_DualPivotQuicksort, arraySortI_name, arraySortI_signature, F_S)                   \
-   do_name(     arraySortI_name,                                   "insertionSort")                                     \
-   do_signature(arraySortI_signature,          "(Ljava/lang/Class;Ljava/lang/Object;JII)V")                             \
-                                                                                                                        \
-  do_intrinsic(_arrayPartitionSP, java_util_DualPivotQuicksort, arrayPartitionSP_name, arrayPartitionSP_signature, F_S) \
-   do_name(     arrayPartitionSP_name,                             "partitionSinglePivot")                              \
-  do_signature(arrayPartitionSP_signature,      "(Ljava/lang/Class;Ljava/lang/Object;JIII)[I")                          \
-  do_intrinsic(_arrayPartitionDP, java_util_DualPivotQuicksort, arrayPartitionDP_name, arrayPartitionDP_signature, F_S) \
-   do_name(     arrayPartitionDP_name,                             "partitionDualPivot")                                \
-  do_signature(arrayPartitionDP_signature,      "(Ljava/lang/Class;Ljava/lang/Object;JIIII)[I")                         \
+  do_intrinsic(_arraySort,                java_util_DualPivotQuicksort, arraySort_name, arraySort_signature,     F_S)   \
+   do_name(     arraySort_name,                                  "arraySort")                                           \
+   do_signature(arraySort_signature,          "(Ljava/lang/Class;Ljava/lang/Object;JIILjava/util/SortOperation;)V")     \
                                                                                                                         \
+  do_intrinsic(_arrayPartition,           java_util_DualPivotQuicksort, arrayPartition_name, arrayPartition_signature, F_S)   \
+   do_name(     arrayPartition_name,                             "arrayPartition")                                      \
+   do_signature(arrayPartition_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIIIILjava/util/PartitionOperation;)[I") \
                                                                                                                         \
   do_intrinsic(_copyOfRange,              java_util_Arrays,       copyOfRange_name, copyOfRange_signature,       F_S)   \
    do_name(     copyOfRange_name,                                "copyOfRange")                                         \
diff --git a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp
index f687f879b863f..794895ec8fbdb 100644
--- a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp
+++ b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp
@@ -327,9 +327,8 @@
   static_field(StubRoutines,                _checkcast_arraycopy_uninit,                      address)                               \
   static_field(StubRoutines,                _unsafe_arraycopy,                                address)                               \
   static_field(StubRoutines,                _generic_arraycopy,                               address)                               \
-  static_field(StubRoutines,                _arraysort,                                       address)                               \
-  static_field(StubRoutines,                _array_partition_single,                          address)                               \
-  static_field(StubRoutines,                _array_partition_dual,                            address)                               \
+  static_field(StubRoutines,                _array_sort,                                      address)                               \
+  static_field(StubRoutines,                _array_partition,                                 address)                               \
                                                                                                                                      \
   static_field(StubRoutines,                _aescrypt_encryptBlock,                           address)                               \
   static_field(StubRoutines,                _aescrypt_decryptBlock,                           address)                               \
diff --git a/src/hotspot/share/opto/c2compiler.cpp b/src/hotspot/share/opto/c2compiler.cpp
index 39f56c002e41e..5efac02178865 100644
--- a/src/hotspot/share/opto/c2compiler.cpp
+++ b/src/hotspot/share/opto/c2compiler.cpp
@@ -597,10 +597,8 @@ bool C2Compiler::is_intrinsic_supported(vmIntrinsics::ID id) {
   case vmIntrinsics::_min_strict:
   case vmIntrinsics::_max_strict:
   case vmIntrinsics::_arraycopy:
-  case vmIntrinsics::_arraySortMI:
-  case vmIntrinsics::_arraySortI:
-  case vmIntrinsics::_arrayPartitionSP:
-  case vmIntrinsics::_arrayPartitionDP:
+  case vmIntrinsics::_arraySort:
+  case vmIntrinsics::_arrayPartition:
   case vmIntrinsics::_indexOfL:
   case vmIntrinsics::_indexOfU:
   case vmIntrinsics::_indexOfUL:
diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp
index 34c2f003651da..9d119a19ea313 100644
--- a/src/hotspot/share/opto/library_call.cpp
+++ b/src/hotspot/share/opto/library_call.cpp
@@ -293,11 +293,8 @@ bool LibraryCallKit::try_to_inline(int predicate) {
 
   case vmIntrinsics::_arraycopy:                return inline_arraycopy();
 
-  case vmIntrinsics::_arraySortMI:
-  case vmIntrinsics::_arraySortI:               return inline_arraysort();
-
-  case vmIntrinsics::_arrayPartitionSP:         return inline_array_partition(false /* single pivot*/);
-  case vmIntrinsics::_arrayPartitionDP:         return inline_array_partition(true /* dual pivot*/);
+  case vmIntrinsics::_arraySort:                return inline_array_sort();
+  case vmIntrinsics::_arrayPartition:           return inline_array_partition();
 
   case vmIntrinsics::_compareToL:               return inline_string_compareTo(StrIntrinsicNode::LL);
   case vmIntrinsics::_compareToU:               return inline_string_compareTo(StrIntrinsicNode::UU);
@@ -5370,7 +5367,7 @@ void LibraryCallKit::create_new_uncommon_trap(CallStaticJavaNode* uncommon_trap_
 }
 
 //------------------------------inline_array_partition-----------------------
-bool LibraryCallKit::inline_array_partition(bool is_dual_pivot) {
+bool LibraryCallKit::inline_array_partition() {
 
   address stubAddr = nullptr;
   const char *stubName;
@@ -5382,12 +5379,12 @@ bool LibraryCallKit::inline_array_partition(bool is_dual_pivot) {
   Node* fromIndex       = argument(4);
   Node* toIndex         = argument(5);
   Node* indexPivot1     = argument(6);
-  Node* indexPivot2     = is_dual_pivot? argument(7) : nullptr;
+  Node* indexPivot2     = argument(7);
 
   const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr();
   ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type();
   BasicType bt = elem_type->basic_type();
-  stubAddr = StubRoutines::select_array_partition_function(is_dual_pivot);
+  stubAddr = StubRoutines::select_array_partition_function();
   // stub not loaded
   if (stubAddr == nullptr) {
     return false;
@@ -5412,7 +5409,7 @@ bool LibraryCallKit::inline_array_partition(bool is_dual_pivot) {
   Node* elemType = intcon(bt);
 
   // Call the stub
-  make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_partition_Type(is_dual_pivot),
+  make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_partition_Type(),
                     stubAddr, stubName, TypePtr::BOTTOM,
                     obj_adr, elemType, fromIndex, toIndex, pivotIndices_adr, indexPivot1, indexPivot2);
 
@@ -5424,8 +5421,8 @@ bool LibraryCallKit::inline_array_partition(bool is_dual_pivot) {
 }
 
 
-//------------------------------inline_arraysort-----------------------
-bool LibraryCallKit::inline_arraysort() {
+//------------------------------inline_array_sort-----------------------
+bool LibraryCallKit::inline_array_sort() {
 
   address stubAddr = nullptr;
   const char *stubName;
diff --git a/src/hotspot/share/opto/library_call.hpp b/src/hotspot/share/opto/library_call.hpp
index d33c1c8ee0538..55d1dc78f1fd5 100644
--- a/src/hotspot/share/opto/library_call.hpp
+++ b/src/hotspot/share/opto/library_call.hpp
@@ -277,8 +277,8 @@ class LibraryCallKit : public GraphKit {
   JVMState* arraycopy_restore_alloc_state(AllocateArrayNode* alloc, int& saved_reexecute_sp);
   void arraycopy_move_allocation_here(AllocateArrayNode* alloc, Node* dest, JVMState* saved_jvms_before_guards, int saved_reexecute_sp,
                                       uint new_idx);
-  bool inline_arraysort();
-  bool inline_array_partition(bool is_dual_pivot);
+  bool inline_array_sort();
+  bool inline_array_partition();
   typedef enum { LS_get_add, LS_get_set, LS_cmp_swap, LS_cmp_swap_weak, LS_cmp_exchange } LoadStoreKind;
   bool inline_unsafe_load_store(BasicType type,  LoadStoreKind kind, AccessKind access_kind);
   bool inline_unsafe_fence(vmIntrinsics::ID id);
diff --git a/src/hotspot/share/opto/runtime.cpp b/src/hotspot/share/opto/runtime.cpp
index 7a57b755555f1..473062abfca09 100644
--- a/src/hotspot/share/opto/runtime.cpp
+++ b/src/hotspot/share/opto/runtime.cpp
@@ -857,9 +857,9 @@ const TypeFunc* OptoRuntime::array_fill_Type() {
   return TypeFunc::make(domain, range);
 }
 
-const TypeFunc* OptoRuntime::array_partition_Type(bool is_dual_pivot) {
+const TypeFunc* OptoRuntime::array_partition_Type() {
   // create input type (domain)
-  int num_args = is_dual_pivot ? 7 : 6;
+  int num_args = 7;
   int argcnt = num_args;
   const Type** fields = TypeTuple::fields(argcnt);
   int argp = TypeFunc::Parms;
@@ -869,9 +869,7 @@ const TypeFunc* OptoRuntime::array_partition_Type(bool is_dual_pivot) {
   fields[argp++] = TypeInt::INT;      // end
   fields[argp++] = TypePtr::NOTNULL;  // pivot_indices (int array)
   fields[argp++] = TypeInt::INT;      // indexPivot1
-  if (is_dual_pivot) {
-    fields[argp++] = TypeInt::INT;    // indexPivot2
-  }
+  fields[argp++] = TypeInt::INT;      // indexPivot2
   assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
   const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
 
diff --git a/src/hotspot/share/opto/runtime.hpp b/src/hotspot/share/opto/runtime.hpp
index 4017f70d36296..b85542423e848 100644
--- a/src/hotspot/share/opto/runtime.hpp
+++ b/src/hotspot/share/opto/runtime.hpp
@@ -269,7 +269,7 @@ class OptoRuntime : public AllStatic {
   static const TypeFunc* array_fill_Type();
 
   static const TypeFunc* array_sort_Type();
-  static const TypeFunc* array_partition_Type(bool is_dual_pivot);
+  static const TypeFunc* array_partition_Type();
   static const TypeFunc* aescrypt_block_Type();
   static const TypeFunc* cipherBlockChaining_aescrypt_Type();
   static const TypeFunc* electronicCodeBook_aescrypt_Type();
diff --git a/src/hotspot/share/runtime/stubRoutines.cpp b/src/hotspot/share/runtime/stubRoutines.cpp
index ed26d119f49a6..bea2a934bc603 100644
--- a/src/hotspot/share/runtime/stubRoutines.cpp
+++ b/src/hotspot/share/runtime/stubRoutines.cpp
@@ -176,9 +176,8 @@ address StubRoutines::_hf2f = nullptr;
 address StubRoutines::_vector_f_math[VectorSupport::NUM_VEC_SIZES][VectorSupport::NUM_SVML_OP] = {{nullptr}, {nullptr}};
 address StubRoutines::_vector_d_math[VectorSupport::NUM_VEC_SIZES][VectorSupport::NUM_SVML_OP] = {{nullptr}, {nullptr}};
 
-address StubRoutines::_arraysort = nullptr;
-address StubRoutines::_array_partition_single  = nullptr;
-address StubRoutines::_array_partition_dual  = nullptr;
+address StubRoutines::_array_sort = nullptr;
+address StubRoutines::_array_partition  = nullptr;
 
 address StubRoutines::_cont_thaw          = nullptr;
 address StubRoutines::_cont_returnBarrier = nullptr;
diff --git a/src/hotspot/share/runtime/stubRoutines.hpp b/src/hotspot/share/runtime/stubRoutines.hpp
index 4afd471e97de7..eb29238b46308 100644
--- a/src/hotspot/share/runtime/stubRoutines.hpp
+++ b/src/hotspot/share/runtime/stubRoutines.hpp
@@ -153,9 +153,8 @@ class StubRoutines: AllStatic {
   static BufferBlob* _compiler_stubs_code;                 // code buffer for C2 intrinsics
   static BufferBlob* _final_stubs_code;                    // code buffer for all other routines
 
-  static address _arraysort;
-  static address _array_partition_single;
-  static address _array_partition_dual;
+  static address _array_sort;
+  static address _array_partition;
   // Leaf routines which implement arraycopy and their addresses
   // arraycopy operands aligned on element type boundary
   static address _jbyte_arraycopy;
@@ -378,8 +377,8 @@ class StubRoutines: AllStatic {
   static UnsafeArrayCopyStub UnsafeArrayCopy_stub()         { return CAST_TO_FN_PTR(UnsafeArrayCopyStub,  _unsafe_arraycopy); }
 
   static address generic_arraycopy()   { return _generic_arraycopy; }
-  static address select_arraysort_function() { return _arraysort; }
-  static address select_array_partition_function(bool is_dual_pivot) { return is_dual_pivot ? _array_partition_dual : _array_partition_single; }
+  static address select_arraysort_function() { return _array_sort; }
+  static address select_array_partition_function() { return _array_partition; }
 
   static address jbyte_fill()          { return _jbyte_fill; }
   static address jshort_fill()         { return _jshort_fill; }
diff --git a/src/java.base/linux/native/libsimdsort/avx512-common-qsort.h b/src/java.base/linux/native/libsimdsort/avx512-common-qsort.h
index 8f255a38e47d6..16aeb0d50a30f 100644
--- a/src/java.base/linux/native/libsimdsort/avx512-common-qsort.h
+++ b/src/java.base/linux/native/libsimdsort/avx512-common-qsort.h
@@ -436,9 +436,13 @@ void avx512_single_pivot_partition(T *arr, int64_t from_index, int64_t to_index,
 }
 
 template <typename T>
-void inline avx512_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, bool is_dual_pviot) {
-    if(is_dual_pviot) avx512_dual_pivot_partition<T>(arr, from_index, to_index, pivot_indices);
-        else avx512_single_pivot_partition<T>(arr, from_index, to_index, pivot_indices);
+void inline avx512_fast_partition(T *arr, int64_t from_index, int64_t to_index, int32_t *pivot_indices, int64_t index_pivot1, int64_t index_pivot2) {
+    if (index_pivot1 != index_pivot2) {
+        avx512_dual_pivot_partition<T>(arr, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
+    }
+    else {
+        avx512_single_pivot_partition<T>(arr, from_index, to_index, pivot_indices, index_pivot1);
+    }
 }
 
 template <typename T>
@@ -456,7 +460,7 @@ void inline insertion_sort(T *arr, int32_t from_index, int32_t to_index) {
 }
 
 template <typename T>
-void inline avx512_fastsort(T *arr, int64_t from_index, int64_t to_index, const int32_t INS_SORT_THRESHOLD) {
+void inline avx512_fast_sort(T *arr, int64_t from_index, int64_t to_index, const int32_t INS_SORT_THRESHOLD) {
     int32_t size = to_index - from_index;
 
     if (size <= INS_SORT_THRESHOLD) {
diff --git a/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp b/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp
index 555be741a4f2b..a4ac2a8e4955f 100644
--- a/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp
+++ b/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp
@@ -38,50 +38,33 @@ extern "C" {
     DLL_PUBLIC void avx512_sort(void *array, int elem_type, int32_t from_index, int32_t to_index) {
         switch(elem_type) {
             case JVM_T_INT:
-                avx512_fastsort<int32_t>((int32_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
+                avx512_fast_sort<int32_t>((int32_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
                 break;
             case JVM_T_LONG:
-                avx512_fastsort<int64_t>((int64_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT);
+                avx512_fast_sort<int64_t>((int64_t*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT);
                 break;
             case JVM_T_FLOAT:
-                avx512_fastsort<float>((float*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
+                avx512_fast_sort<float>((float*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_32BIT);
                 break;
             case JVM_T_DOUBLE:
-                avx512_fastsort<double>((double*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT);
+                avx512_fast_sort<double>((double*)array, from_index, to_index, INSERTION_SORT_THRESHOLD_64BIT);
                 break;
         }
     }
 
-    DLL_PUBLIC void avx512_partition_single(void *array, int elem_type, int32_t from_index, int32_t to_index, int32_t *pivot_indices, int32_t index_pivot) {
+    DLL_PUBLIC void avx512_partition(void *array, int elem_type, int32_t from_index, int32_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) {
         switch(elem_type) {
             case JVM_T_INT:
-                avx512_single_pivot_partition<int32_t>((int32_t*)array, from_index, to_index, pivot_indices, index_pivot);
+                avx512_fast_partition<int32_t>((int32_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
                 break;
             case JVM_T_LONG:
-                avx512_single_pivot_partition<int64_t>((int64_t*)array, from_index, to_index, pivot_indices, index_pivot);
+                avx512_fast_partition<int64_t>((int64_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
                 break;
             case JVM_T_FLOAT:
-                 avx512_single_pivot_partition<float>((float*)array, from_index, to_index, pivot_indices, index_pivot);
+                avx512_fast_partition<float>((float*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
                 break;
             case JVM_T_DOUBLE:
-                avx512_single_pivot_partition<double>((double*)array, from_index, to_index, pivot_indices, index_pivot);
-                break;
-        }
-    }
-
-    DLL_PUBLIC void avx512_partition_dual(void *array, int elem_type, int32_t from_index, int32_t to_index, int32_t *pivot_indices, int32_t index_pivot1, int32_t index_pivot2) {
-        switch(elem_type) {
-            case JVM_T_INT:
-                avx512_dual_pivot_partition<int32_t>((int32_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
-                break;
-            case JVM_T_LONG:
-                avx512_dual_pivot_partition<int64_t>((int64_t*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
-                break;
-            case JVM_T_FLOAT:
-                avx512_dual_pivot_partition<float>((float*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
-                break;
-            case JVM_T_DOUBLE:
-                avx512_dual_pivot_partition<double>((double*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
+                avx512_fast_partition<double>((double*)array, from_index, to_index, pivot_indices, index_pivot1, index_pivot2);
                 break;
         }
     }
diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
index f2cf2deab6638..d6fe8ddcffcf4 100644
--- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java
+++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
@@ -53,6 +53,42 @@
  *
  * @since 1.7 * 14
  */
+
+/**
+ * Represents a function that accepts an array and sorts a specified range
+ * of the array into ascending order.
+ */
+@FunctionalInterface
+interface SortOperation<A> {
+    /**
+     * Sorts the specified range of the array.
+     *
+     * @param a the array to be sorted
+     * @param low the index of the first element, inclusive, to be sorted
+     * @param high the index of the last element, exclusive, to be sorted
+     */
+    void sort(A a, int low, int high);
+}
+
+/**
+ * Represents a function that accepts an array and partitions a specified range
+ * of the array based on the pivots provided.
+ */
+@FunctionalInterface
+interface PartitionOperation<A> {
+     /**
+     * Partitions the specified range of the array.
+     *
+     * @param a the array to be sorted
+     * @param low the index of the first element, inclusive, to be sorted
+     * @param high the index of the last element, exclusive, to be sorted
+     * @param indexPivot1 the index of pivot1, the first pivot
+     * @param indexPivot2 the index of pivot2, the second pivot
+     */
+    int[] partition(A a, int low, int high, int indexPivot1, int indexPivot2);
+}
+
+
 final class DualPivotQuicksort {
 
     /**
@@ -137,41 +173,11 @@ private DualPivotQuicksort() {}
      * address pointing to the first element to sort from.
      * @param low the index of the first element, inclusive, to be sorted
      * @param high the index of the last element, exclusive, to be sorted
-     * @param end the index of the last element for simple insertion sort
-     */
-    @IntrinsicCandidate
-    private static void mixedInsertionSort(Class<?> elemType, Object array, long offset, int low, int high, int end) {
-        switch (array) {
-            case int[] arr -> mixedInsertionSort(arr, low, end, high);
-            case long[] arr ->  mixedInsertionSort(arr, low, end, high);
-            case float[] arr ->  mixedInsertionSort(arr, low, end, high);
-            case double[] arr ->  mixedInsertionSort(arr, low, end, high);
-            default -> throw new UnsupportedOperationException();
-        }
-    }
-
-    /**
-     * Sorts the specified array into ascending numerical order using
-     * insertion sort.The intrinsic is free to choose its own
-     * sorting algorithm.
-     *
-     * @param elemType the class of the elements of the array to be sorted
-     * @param array the array to be sorted
-     * @param offset the relative offset, in bytes, from the base address of
-     * the array to sort, otherwise if the array is {@code null},an absolute
-     * address pointing to the first element to sort from.
-     * @param low the index of the first element, inclusive, to be sorted
-     * @param high the index of the last element, exclusive, to be sorted
+     * @param so the method reference for the fallback implementation
      */
     @IntrinsicCandidate
-    private static void insertionSort(Class<?> elemType, Object array, long offset, int low, int high) {
-       switch (array) {
-            case int[] arr -> insertionSort(arr, low, high);
-            case long[] arr -> insertionSort(arr, low, high);
-            case float[] arr -> insertionSort(arr, low, high);
-            case double[] arr -> insertionSort(arr, low, high);
-            default -> throw new UnsupportedOperationException();
-        }
+    private static <A> void arraySort(Class<?> elemType, A array, long offset, int low, int high, SortOperation<A> so) {
+        so.sort(array, low, high);
     }
 
     /**
@@ -186,42 +192,14 @@ private static void insertionSort(Class<?> elemType, Object array, long offset,
      * @param high the index of the last element, exclusive, to be sorted
      * @param indexPivot1 the index of pivot1, the first pivot
      * @param indexPivot2 the index of pivot2, the second pivot
+     * @param po the method reference for the fallback implementation
      */
     @IntrinsicCandidate
     @ForceInline
-    private static int[] partitionDualPivot(Class<?> elemType, Object array, long offset, int low, int high, int indexPivot1, int indexPivot2) {
-        return switch(array) {
-            case int[] arr -> partitionDualPivot(arr, low, high, indexPivot1, indexPivot2);
-            case long[] arr -> partitionDualPivot(arr, low, high, indexPivot1, indexPivot2);
-            case float[] arr -> partitionDualPivot(arr, low, high, indexPivot1, indexPivot2);
-            case double[] arr -> partitionDualPivot(arr, low, high, indexPivot1, indexPivot2);
-            default -> throw new UnsupportedOperationException();
-        };
+    private static <A> int[] arrayPartition(Class<?> elemType, A array, long offset, int low, int high, int indexPivot1, int indexPivot2, PartitionOperation<A> po) {
+        return po.partition(array, low, high, indexPivot1, indexPivot2);
     }
 
-    /**
-     * Partitions the specified array based on the single pivot provided.
-     *
-     * @param elemType the class of the array to be sorted
-     * @param array the array to be sorted
-     * @param offset the relative offset, in bytes, from the base address of
-     * the array to partition, otherwise if the array is {@code null},an absolute
-     * address pointing to the first element to partition from.
-     * @param low the index of the first element, inclusive, to be sorted
-     * @param high the index of the last element, exclusive, to be sorted
-     * @param indexPivot the index of the pivot
-     */
-    @IntrinsicCandidate
-    @ForceInline
-    private static int[] partitionSinglePivot(Class<?> elemType, Object array, long offset, int low, int high, int indexPivot) {
-        return switch(array) {
-            case int[] arr -> partitionSinglePivot(arr, low, high, indexPivot);
-            case long[] arr -> partitionSinglePivot(arr, low, high, indexPivot);
-            case float[] arr -> partitionSinglePivot(arr, low, high, indexPivot);
-            case double[] arr -> partitionSinglePivot(arr, low, high, indexPivot);
-            default -> throw new UnsupportedOperationException();
-        };
-    }
 
     /**
      * Calculates the double depth of parallel merging.
@@ -286,7 +264,7 @@ static void sort(Sorter sorter, int[] a, int bits, int low, int high) {
              * Run mixed insertion sort on small non-leftmost parts.
              */
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
-                mixedInsertionSort(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, high - 3 * ((size >> 5) << 3));
+                arraySort(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort);
                 return;
             }
 
@@ -294,7 +272,7 @@ static void sort(Sorter sorter, int[] a, int bits, int low, int high) {
              * Invoke insertion sort on small leftmost part.
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
-                insertionSort(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high);
+                arraySort(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort);
                 return;
             }
 
@@ -380,7 +358,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * the pivots. These values are inexpensive approximation
                  * of tertiles. Note, that pivot1 < pivot2.
                  */
-                pivotIndices = partitionDualPivot(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, e1, e5);
+                pivotIndices = arrayPartition(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
 
@@ -404,7 +382,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * Use the third of the five sorted elements as the pivot.
                  * This value is inexpensive approximation of the median.
                  */
-                pivotIndices = partitionSinglePivot(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, e3);
+                pivotIndices = arrayPartition(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -517,15 +495,18 @@ private static int[] partitionDualPivot(int[] a, int low, int high, int indexPiv
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
      * @param high the index of the last element, exclusive, for partitioning
-     * @param indexPivot the index of the pivot
+     * @param indexPivot1 the index of pivot1, the first pivot
+     * @param indexPivot2 the index of pivot2, the second pivot
      *
      */
     @ForceInline
-    private static int[] partitionSinglePivot(int[] a, int low, int high, int indexPivot) {
+    private static int[] partitionSinglePivot(int[] a, int low, int high, int indexPivot1, int indexPivot2) {
+        if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("both the pivot indices must be same");
+
         int end = high - 1;
         int lower = low;
         int upper = end;
-        int e3 = indexPivot;
+        int e3 = indexPivot1;
         int pivot = a[e3];
 
         /*
@@ -596,10 +577,11 @@ private static int[] partitionSinglePivot(int[] a, int low, int high, int indexP
      *
      * @param a the array to be sorted
      * @param low the index of the first element, inclusive, to be sorted
-     * @param end the index of the last element for simple insertion sort
      * @param high the index of the last element, exclusive, to be sorted
      */
-    private static void mixedInsertionSort(int[] a, int low, int end, int high) {
+    private static void mixedInsertionSort(int[] a, int low, int high) {
+        int size = high - low;
+        int end = high - 3 * ((size >> 5) << 3);
         if (end == high) {
 
             /*
@@ -1089,8 +1071,7 @@ static void sort(Sorter sorter, long[] a, int bits, int low, int high) {
              * Run mixed insertion sort on small non-leftmost parts.
              */
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
-                int last  = high - 3 * ((size >> 5) << 3);
-                mixedInsertionSort(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, high - 3 * ((size >> 5) << 3));
+                arraySort(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort);
                 return;
             }
 
@@ -1098,7 +1079,7 @@ static void sort(Sorter sorter, long[] a, int bits, int low, int high) {
              * Invoke insertion sort on small leftmost part.
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
-                insertionSort(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high);
+                arraySort(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort);
                 return;
             }
 
@@ -1185,7 +1166,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * the pivots. These values are inexpensive approximation
                  * of tertiles. Note, that pivot1 < pivot2.
                  */
-                pivotIndices = partitionDualPivot(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, e1, e5);
+                pivotIndices = arrayPartition(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -1206,7 +1187,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * Use the third of the five sorted elements as the pivot.
                  * This value is inexpensive approximation of the median.
                  */
-                pivotIndices = partitionSinglePivot(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, e3);
+                pivotIndices = arrayPartition(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -1318,16 +1299,19 @@ private static int[] partitionDualPivot(long[] a, int low, int high, int indexPi
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
      * @param high the index of the last element, exclusive, for partitioning
-     * @param indexPivot the index of the pivot
+     * @param indexPivot1 the index of pivot1, the first pivot
+     * @param indexPivot2 the index of pivot2, the second pivot
      *
      */
     @ForceInline
-    private static int[] partitionSinglePivot(long[] a, int low, int high, int indexPivot) {
+    private static int[] partitionSinglePivot(long[] a, int low, int high, int indexPivot1, int indexPivot2) {
+        if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("both the pivot indices must be same");
+
         int end = high - 1;
         int lower = low;
         int upper = end;
 
-        int e3 = indexPivot;
+        int e3 = indexPivot1;
         long pivot = a[e3];
 
         /*
@@ -1398,10 +1382,11 @@ private static int[] partitionSinglePivot(long[] a, int low, int high, int index
      *
      * @param a the array to be sorted
      * @param low the index of the first element, inclusive, to be sorted
-     * @param end the index of the last element for simple insertion sort
      * @param high the index of the last element, exclusive, to be sorted
      */
-    private static void mixedInsertionSort(long[] a, int low, int end, int high) {
+    private static void mixedInsertionSort(long[] a, int low, int high) {
+        int size = high - low;
+        int end = high - 3 * ((size >> 5) << 3);
         if (end == high) {
 
             /*
@@ -2678,8 +2663,7 @@ static void sort(Sorter sorter, float[] a, int bits, int low, int high) {
              * Run mixed insertion sort on small non-leftmost parts.
              */
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
-                int last  = high - 3 * ((size >> 5) << 3);
-                mixedInsertionSort(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, high - 3 * ((size >> 5) << 3));
+                arraySort(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort);
                 return;
             }
 
@@ -2687,7 +2671,7 @@ static void sort(Sorter sorter, float[] a, int bits, int low, int high) {
              * Invoke insertion sort on small leftmost part.
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
-                insertionSort(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high);
+                arraySort(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort);
                 return;
             }
 
@@ -2774,7 +2758,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * the pivots. These values are inexpensive approximation
                  * of tertiles. Note, that pivot1 < pivot2.
                  */
-                pivotIndices = partitionDualPivot(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, e1, e5);
+                pivotIndices = arrayPartition(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -2795,7 +2779,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * Use the third of the five sorted elements as the pivot.
                  * This value is inexpensive approximation of the median.
                  */
-                pivotIndices = partitionSinglePivot(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, e3);
+                pivotIndices = arrayPartition(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -2907,16 +2891,18 @@ private static int[] partitionDualPivot(float[] a, int low, int high, int indexP
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
      * @param high the index of the last element, exclusive, for partitioning
-     * @param indexPivot the index of the pivot
+     * @param indexPivot1 the index of pivot1, the first pivot
+     * @param indexPivot2 the index of pivot2, the second pivot
      *
      */
     @ForceInline
-    private static int[] partitionSinglePivot(float[] a, int low, int high, int indexPivot) {
+    private static int[] partitionSinglePivot(float[] a, int low, int high, int indexPivot1, int indexPivot2) {
+        if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("both the pivot indices must be same");
         int end = high - 1;
         int lower = low;
         int upper = end;
 
-        int e3 = indexPivot;
+        int e3 = indexPivot1;
         float pivot = a[e3];
 
         /*
@@ -2987,10 +2973,11 @@ private static int[] partitionSinglePivot(float[] a, int low, int high, int inde
      *
      * @param a the array to be sorted
      * @param low the index of the first element, inclusive, to be sorted
-     * @param end the index of the last element for simple insertion sort
      * @param high the index of the last element, exclusive, to be sorted
      */
-    private static void mixedInsertionSort(float[] a, int low, int end, int high) {
+    private static void mixedInsertionSort(float[] a, int low, int high) {
+        int size = high - low;
+        int end = high - 3 * ((size >> 5) << 3);
         if (end == high) {
 
             /*
@@ -3531,8 +3518,7 @@ static void sort(Sorter sorter, double[] a, int bits, int low, int high) {
              * Run mixed insertion sort on small non-leftmost parts.
              */
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
-                int last  = high - 3 * ((size >> 5) << 3);
-                mixedInsertionSort(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, high - 3 * ((size >> 5) << 3));
+                arraySort(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort);
                 return;
             }
 
@@ -3540,7 +3526,7 @@ static void sort(Sorter sorter, double[] a, int bits, int low, int high) {
              * Invoke insertion sort on small leftmost part.
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
-                insertionSort(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high);
+                arraySort(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort);
                 return;
             }
 
@@ -3627,7 +3613,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                 * the pivots. These values are inexpensive approximation
                 * of tertiles. Note, that pivot1 < pivot2.
                 */
-                pivotIndices = partitionDualPivot(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, e1, e5);
+                pivotIndices = arrayPartition(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -3648,7 +3634,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * Use the third of the five sorted elements as the pivot.
                  * This value is inexpensive approximation of the median.
                  */
-                pivotIndices = partitionSinglePivot(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, e3);
+                pivotIndices = arrayPartition(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
 
@@ -3762,15 +3748,18 @@ private static int[] partitionDualPivot(double[] a, int low, int high, int index
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
      * @param high the index of the last element, exclusive, for partitioning
-     * @param indexPivot the index of the pivot
+     * @param indexPivot1 the index of pivot1, the first pivot
+     * @param indexPivot2 the index of pivot2, the second pivot
      */
     @ForceInline
-    private static int[] partitionSinglePivot(double[] a, int low, int high, int indexPivot) {
+    private static int[] partitionSinglePivot(double[] a, int low, int high, int indexPivot1, int indexPivot2) {
+        if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("both the pivot indices must be same");
+
         int end = high - 1;
         int lower = low;
         int upper = end;
 
-        int e3 = indexPivot;
+        int e3 = indexPivot1;
         double pivot = a[e3];
 
         /*
@@ -3841,10 +3830,11 @@ private static int[] partitionSinglePivot(double[] a, int low, int high, int ind
      *
      * @param a the array to be sorted
      * @param low the index of the first element, inclusive, to be sorted
-     * @param end the index of the last element for simple insertion sort
      * @param high the index of the last element, exclusive, to be sorted
      */
-    private static void mixedInsertionSort(double[] a, int low, int end, int high) {
+    private static void mixedInsertionSort(double[] a, int low, int high) {
+        int size = high - low;
+        int end = high - 3 * ((size >> 5) << 3);
         if (end == high) {
 
             /*
diff --git a/test/jdk/java/util/Arrays/Sorting.java b/test/jdk/java/util/Arrays/Sorting.java
index ce5b2ff87e07d..d368885abe082 100644
--- a/test/jdk/java/util/Arrays/Sorting.java
+++ b/test/jdk/java/util/Arrays/Sorting.java
@@ -26,7 +26,7 @@
  * @compile/module=java.base java/util/SortingHelper.java
  * @bug 6880672 6896573 6899694 6976036 7013585 7018258 8003981 8226297
  * @build Sorting
- * @run main/othervm -XX:+UnlockDiagnosticVMOptions -XX:DisableIntrinsic=_arraySortI,_arraySortMI,_arrayPartitionSP,_arrayPartitionDP Sorting -shortrun
+ * @run main/othervm -XX:+UnlockDiagnosticVMOptions -XX:DisableIntrinsic=_arraySort,_arrayPartition, Sorting -shortrun
  * @run main/othervm -XX:CompileThreshold=1 -XX:-TieredCompilation Sorting -shortrun
  * @summary Exercise Arrays.sort, Arrays.parallelSort
  *

From e63a2aa081275c3f1ed2ccc4315a60f304d18b34 Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Fri, 15 Sep 2023 15:09:03 -0700
Subject: [PATCH 34/40] Move functional interfaces close to the associated
 methods

---
 src/hotspot/share/classfile/vmIntrinsics.hpp  |  6 +-
 .../classes/java/util/DualPivotQuicksort.java | 78 +++++++++----------
 2 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp
index fcd7e6a89b7a0..d6c22e6eaed6e 100644
--- a/src/hotspot/share/classfile/vmIntrinsics.hpp
+++ b/src/hotspot/share/classfile/vmIntrinsics.hpp
@@ -343,11 +343,11 @@ class methodHandle;
                                                                                                                         \
   do_intrinsic(_arraySort,                java_util_DualPivotQuicksort, arraySort_name, arraySort_signature,     F_S)   \
    do_name(     arraySort_name,                                  "arraySort")                                           \
-   do_signature(arraySort_signature,          "(Ljava/lang/Class;Ljava/lang/Object;JIILjava/util/SortOperation;)V")     \
+   do_signature(arraySort_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIILjava/util/DualPivotQuicksort$SortOperation;)V") \
                                                                                                                         \
-  do_intrinsic(_arrayPartition,           java_util_DualPivotQuicksort, arrayPartition_name, arrayPartition_signature, F_S)   \
+  do_intrinsic(_arrayPartition,           java_util_DualPivotQuicksort, arrayPartition_name, arrayPartition_signature, F_S) \
    do_name(     arrayPartition_name,                             "arrayPartition")                                      \
-   do_signature(arrayPartition_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIIIILjava/util/PartitionOperation;)[I") \
+   do_signature(arrayPartition_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIIIILjava/util/DualPivotQuicksort$PartitionOperation;)[I") \
                                                                                                                         \
   do_intrinsic(_copyOfRange,              java_util_Arrays,       copyOfRange_name, copyOfRange_signature,       F_S)   \
    do_name(     copyOfRange_name,                                "copyOfRange")                                         \
diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
index d6fe8ddcffcf4..85a750c25066e 100644
--- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java
+++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
@@ -54,40 +54,6 @@
  * @since 1.7 * 14
  */
 
-/**
- * Represents a function that accepts an array and sorts a specified range
- * of the array into ascending order.
- */
-@FunctionalInterface
-interface SortOperation<A> {
-    /**
-     * Sorts the specified range of the array.
-     *
-     * @param a the array to be sorted
-     * @param low the index of the first element, inclusive, to be sorted
-     * @param high the index of the last element, exclusive, to be sorted
-     */
-    void sort(A a, int low, int high);
-}
-
-/**
- * Represents a function that accepts an array and partitions a specified range
- * of the array based on the pivots provided.
- */
-@FunctionalInterface
-interface PartitionOperation<A> {
-     /**
-     * Partitions the specified range of the array.
-     *
-     * @param a the array to be sorted
-     * @param low the index of the first element, inclusive, to be sorted
-     * @param high the index of the last element, exclusive, to be sorted
-     * @param indexPivot1 the index of pivot1, the first pivot
-     * @param indexPivot2 the index of pivot2, the second pivot
-     */
-    int[] partition(A a, int low, int high, int indexPivot1, int indexPivot2);
-}
-
 
 final class DualPivotQuicksort {
 
@@ -161,6 +127,22 @@ private DualPivotQuicksort() {}
      */
     private static final int MAX_RECURSION_DEPTH = 64 * DELTA;
 
+    /**
+     * Represents a function that accepts an array and sorts a specified range
+     * of the array into ascending order.
+     */
+    @FunctionalInterface
+    private static interface SortOperation<A> {
+        /**
+         * Sorts the specified range of the array.
+         *
+         * @param a the array to be sorted
+         * @param low the index of the first element, inclusive, to be sorted
+         * @param high the index of the last element, exclusive, to be sorted
+         */
+        void sort(A a, int low, int high);
+    }
+
     /**
      * Sorts the specified array into ascending numerical order using
      * mixed insertion sort.The intrinsic is free to choose its own
@@ -176,10 +158,29 @@ private DualPivotQuicksort() {}
      * @param so the method reference for the fallback implementation
      */
     @IntrinsicCandidate
+    @ForceInline
     private static <A> void arraySort(Class<?> elemType, A array, long offset, int low, int high, SortOperation<A> so) {
         so.sort(array, low, high);
     }
 
+    /**
+     * Represents a function that accepts an array and partitions a specified range
+     * of the array based on the pivots provided.
+     */
+    @FunctionalInterface
+    interface PartitionOperation<A> {
+        /**
+         * Partitions the specified range of the array.
+         *
+         * @param a the array to be sorted
+         * @param low the index of the first element, inclusive, to be sorted
+         * @param high the index of the last element, exclusive, to be sorted
+         * @param indexPivot1 the index of pivot1, the first pivot
+         * @param indexPivot2 the index of pivot2, the second pivot
+         */
+        int[] partition(A a, int low, int high, int indexPivot1, int indexPivot2);
+    }
+
     /**
      * Partitions the specified array based on the two pivots provided.
      *
@@ -200,7 +201,6 @@ private static <A> int[] arrayPartition(Class<?> elemType, A array, long offset,
         return po.partition(array, low, high, indexPivot1, indexPivot2);
     }
 
-
     /**
      * Calculates the double depth of parallel merging.
      * Depth is negative, if tasks split before sorting.
@@ -501,7 +501,7 @@ private static int[] partitionDualPivot(int[] a, int low, int high, int indexPiv
      */
     @ForceInline
     private static int[] partitionSinglePivot(int[] a, int low, int high, int indexPivot1, int indexPivot2) {
-        if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("both the pivot indices must be same");
+        if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("Both the pivot indices must be same");
 
         int end = high - 1;
         int lower = low;
@@ -1305,7 +1305,7 @@ private static int[] partitionDualPivot(long[] a, int low, int high, int indexPi
      */
     @ForceInline
     private static int[] partitionSinglePivot(long[] a, int low, int high, int indexPivot1, int indexPivot2) {
-        if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("both the pivot indices must be same");
+        if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("Both the pivot indices must be same");
 
         int end = high - 1;
         int lower = low;
@@ -2897,7 +2897,7 @@ private static int[] partitionDualPivot(float[] a, int low, int high, int indexP
      */
     @ForceInline
     private static int[] partitionSinglePivot(float[] a, int low, int high, int indexPivot1, int indexPivot2) {
-        if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("both the pivot indices must be same");
+        if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("Both the pivot indices must be same");
         int end = high - 1;
         int lower = low;
         int upper = end;
@@ -3753,7 +3753,7 @@ private static int[] partitionDualPivot(double[] a, int low, int high, int index
      */
     @ForceInline
     private static int[] partitionSinglePivot(double[] a, int low, int high, int indexPivot1, int indexPivot2) {
-        if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("both the pivot indices must be same");
+        if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("Both the pivot indices must be same");
 
         int end = high - 1;
         int lower = low;

From 7fc1afac4ca287908f2ddaeb2bd554044791452e Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Mon, 18 Sep 2023 11:44:59 -0700
Subject: [PATCH 35/40] Remove the unnecessary exception in single pivot
 partitioning fallback method

---
 src/hotspot/share/opto/library_call.cpp       |  4 +-
 .../classes/java/util/DualPivotQuicksort.java | 38 ++++++++-----------
 test/jdk/java/util/Arrays/Sorting.java        |  2 +-
 3 files changed, 19 insertions(+), 25 deletions(-)

diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp
index 9d119a19ea313..eb8c76dfc56aa 100644
--- a/src/hotspot/share/opto/library_call.cpp
+++ b/src/hotspot/share/opto/library_call.cpp
@@ -5405,7 +5405,7 @@ bool LibraryCallKit::inline_array_partition() {
   guarantee(alloc != nullptr, "created above");
   Node* pivotIndices_adr = basic_plus_adr(pivotIndices, arrayOopDesc::base_offset_in_bytes(T_INT));
 
-  // pass the bastic type enum to the stub
+  // pass the basic type enum to the stub
   Node* elemType = intcon(bt);
 
   // Call the stub
@@ -5450,7 +5450,7 @@ bool LibraryCallKit::inline_array_sort() {
   }
   Node* obj_adr = make_unsafe_address(obj, offset);
 
-  // pass the bastic type enum to the stub
+  // pass the basic type enum to the stub
   Node* elemType = intcon(bt);
 
   // Call the stub.
diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
index 85a750c25066e..c0f95f390cfde 100644
--- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java
+++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
@@ -128,13 +128,13 @@ private DualPivotQuicksort() {}
     private static final int MAX_RECURSION_DEPTH = 64 * DELTA;
 
     /**
-     * Represents a function that accepts an array and sorts a specified range
-     * of the array into ascending order.
+     * Represents a function that accepts an array and sorts the specified range
+     * of an array into ascending order.
      */
     @FunctionalInterface
     private static interface SortOperation<A> {
         /**
-         * Sorts the specified range of the array.
+         * Sorts the specified range of an array.
          *
          * @param a the array to be sorted
          * @param low the index of the first element, inclusive, to be sorted
@@ -144,9 +144,7 @@ private static interface SortOperation<A> {
     }
 
     /**
-     * Sorts the specified array into ascending numerical order using
-     * mixed insertion sort.The intrinsic is free to choose its own
-     * sorting algorithm.
+     * Sorts the specified range of an array into ascending numerical order.
      *
      * @param elemType the class of the elements of the array to be sorted
      * @param array the array to be sorted
@@ -164,13 +162,13 @@ private static <A> void arraySort(Class<?> elemType, A array, long offset, int l
     }
 
     /**
-     * Represents a function that accepts an array and partitions a specified range
-     * of the array based on the pivots provided.
+     * Represents a function that accepts an array and partitions the specified range
+     * of an array using the pivots provided.
      */
     @FunctionalInterface
     interface PartitionOperation<A> {
         /**
-         * Partitions the specified range of the array.
+         * Partitions the specified range of an array using the given pivots.
          *
          * @param a the array to be sorted
          * @param low the index of the first element, inclusive, to be sorted
@@ -182,7 +180,7 @@ interface PartitionOperation<A> {
     }
 
     /**
-     * Partitions the specified array based on the two pivots provided.
+     * Partitions the specified range of an array using the two pivots provided.
      *
      * @param elemType the class of the array to be sorted
      * @param array the array to be sorted
@@ -401,7 +399,7 @@ && tryMergeRuns(sorter, a, low, size)) {
     }
 
     /**
-     * Partitions the specified range of the array using the two pivots provided.
+     * Partitions the specified range of an array using the two pivots provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
@@ -490,7 +488,7 @@ private static int[] partitionDualPivot(int[] a, int low, int high, int indexPiv
 
 
     /**
-     * Partitions the specified range of the array using a single pivot provided.
+     * Partitions the specified range of an array using a single pivot provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
@@ -501,7 +499,6 @@ private static int[] partitionDualPivot(int[] a, int low, int high, int indexPiv
      */
     @ForceInline
     private static int[] partitionSinglePivot(int[] a, int low, int high, int indexPivot1, int indexPivot2) {
-        if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("Both the pivot indices must be same");
 
         int end = high - 1;
         int lower = low;
@@ -1206,7 +1203,7 @@ && tryMergeRuns(sorter, a, low, size)) {
     }
 
     /**
-     * Partitions the specified range of the array using the two pivots provided.
+     * Partitions the specified range of an array using the two pivots provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
@@ -1294,7 +1291,7 @@ private static int[] partitionDualPivot(long[] a, int low, int high, int indexPi
 
 
     /**
-     * Partitions the specified range of the array using a single pivot provided.
+     * Partitions the specified range of an array using a single pivot provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
@@ -1305,7 +1302,6 @@ private static int[] partitionDualPivot(long[] a, int low, int high, int indexPi
      */
     @ForceInline
     private static int[] partitionSinglePivot(long[] a, int low, int high, int indexPivot1, int indexPivot2) {
-        if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("Both the pivot indices must be same");
 
         int end = high - 1;
         int lower = low;
@@ -2798,7 +2794,7 @@ && tryMergeRuns(sorter, a, low, size)) {
     }
 
     /**
-     * Partitions the specified range of the array using the two pivots provided.
+     * Partitions the specified range of an array using the two pivots provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
@@ -2886,7 +2882,7 @@ private static int[] partitionDualPivot(float[] a, int low, int high, int indexP
 
 
     /**
-     * Partitions the specified range of the array using a single pivot provided.
+     * Partitions the specified range of an array using a single pivot provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
@@ -2897,7 +2893,6 @@ private static int[] partitionDualPivot(float[] a, int low, int high, int indexP
      */
     @ForceInline
     private static int[] partitionSinglePivot(float[] a, int low, int high, int indexPivot1, int indexPivot2) {
-        if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("Both the pivot indices must be same");
         int end = high - 1;
         int lower = low;
         int upper = end;
@@ -3654,7 +3649,7 @@ && tryMergeRuns(sorter, a, low, size)) {
     }
 
     /**
-     * Partitions the specified range of the array using the two pivots provided.
+     * Partitions the specified range of an array using the two pivots provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
@@ -3743,7 +3738,7 @@ private static int[] partitionDualPivot(double[] a, int low, int high, int index
 
 
     /**
-     * Partitions the specified range of the array using a single pivot provided.
+     * Partitions the specified range of an array using a single pivot provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
@@ -3753,7 +3748,6 @@ private static int[] partitionDualPivot(double[] a, int low, int high, int index
      */
     @ForceInline
     private static int[] partitionSinglePivot(double[] a, int low, int high, int indexPivot1, int indexPivot2) {
-        if (indexPivot1 != indexPivot2) throw new IllegalArgumentException("Both the pivot indices must be same");
 
         int end = high - 1;
         int lower = low;
diff --git a/test/jdk/java/util/Arrays/Sorting.java b/test/jdk/java/util/Arrays/Sorting.java
index d368885abe082..113c8a688620c 100644
--- a/test/jdk/java/util/Arrays/Sorting.java
+++ b/test/jdk/java/util/Arrays/Sorting.java
@@ -47,7 +47,7 @@ public class Sorting {
 
     // Array lengths used in a long run (default)
     private static final int[] LONG_RUN_LENGTHS = {
-        1, 3, 8, 21, 55, 100, 1_000, 10_000, 100_000};
+        1, 3, 8, 21, 55, 100, 1_000, 10_000, 100_000 };
 
     // Array lengths used in a short run
     private static final int[] SHORT_RUN_LENGTHS = {

From bf41d2ab73e55bdcaaae35bceab456d07136748f Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Mon, 18 Sep 2023 18:48:30 -0700
Subject: [PATCH 36/40] Rename arraySort and arrayPartition Java methods to
 sort and partition. Clean up some comments

---
 src/hotspot/share/classfile/vmIntrinsics.hpp  |  4 +-
 .../classes/java/util/DualPivotQuicksort.java | 79 +++++++++----------
 2 files changed, 39 insertions(+), 44 deletions(-)

diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp
index d6c22e6eaed6e..66b8a43640728 100644
--- a/src/hotspot/share/classfile/vmIntrinsics.hpp
+++ b/src/hotspot/share/classfile/vmIntrinsics.hpp
@@ -342,11 +342,11 @@ class methodHandle;
    do_signature(copyOf_signature,             "([Ljava/lang/Object;ILjava/lang/Class;)[Ljava/lang/Object;")             \
                                                                                                                         \
   do_intrinsic(_arraySort,                java_util_DualPivotQuicksort, arraySort_name, arraySort_signature,     F_S)   \
-   do_name(     arraySort_name,                                  "arraySort")                                           \
+   do_name(     arraySort_name,                                  "sort")                                                \
    do_signature(arraySort_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIILjava/util/DualPivotQuicksort$SortOperation;)V") \
                                                                                                                         \
   do_intrinsic(_arrayPartition,           java_util_DualPivotQuicksort, arrayPartition_name, arrayPartition_signature, F_S) \
-   do_name(     arrayPartition_name,                             "arrayPartition")                                      \
+   do_name(     arrayPartition_name,                             "partition")                                           \
    do_signature(arrayPartition_signature, "(Ljava/lang/Class;Ljava/lang/Object;JIIIILjava/util/DualPivotQuicksort$PartitionOperation;)[I") \
                                                                                                                         \
   do_intrinsic(_copyOfRange,              java_util_Arrays,       copyOfRange_name, copyOfRange_signature,       F_S)   \
diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
index c0f95f390cfde..3fa87815596db 100644
--- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java
+++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
@@ -27,7 +27,6 @@
 
 import java.util.concurrent.CountedCompleter;
 import java.util.concurrent.RecursiveTask;
-import java.util.Arrays;
 import jdk.internal.misc.Unsafe;
 import jdk.internal.vm.annotation.IntrinsicCandidate;
 import jdk.internal.vm.annotation.ForceInline;
@@ -128,13 +127,13 @@ private DualPivotQuicksort() {}
     private static final int MAX_RECURSION_DEPTH = 64 * DELTA;
 
     /**
-     * Represents a function that accepts an array and sorts the specified range
-     * of an array into ascending order.
+     * Represents a function that accepts the array and sorts the specified range
+     * of the array into ascending order.
      */
     @FunctionalInterface
     private static interface SortOperation<A> {
         /**
-         * Sorts the specified range of an array.
+         * Sorts the specified range of the array.
          *
          * @param a the array to be sorted
          * @param low the index of the first element, inclusive, to be sorted
@@ -144,7 +143,7 @@ private static interface SortOperation<A> {
     }
 
     /**
-     * Sorts the specified range of an array into ascending numerical order.
+     * Sorts the specified range of the array into ascending numerical order.
      *
      * @param elemType the class of the elements of the array to be sorted
      * @param array the array to be sorted
@@ -157,18 +156,18 @@ private static interface SortOperation<A> {
      */
     @IntrinsicCandidate
     @ForceInline
-    private static <A> void arraySort(Class<?> elemType, A array, long offset, int low, int high, SortOperation<A> so) {
+    private static <A> void sort(Class<?> elemType, A array, long offset, int low, int high, SortOperation<A> so) {
         so.sort(array, low, high);
     }
 
     /**
-     * Represents a function that accepts an array and partitions the specified range
-     * of an array using the pivots provided.
+     * Represents a function that accepts the array and partitions the specified range
+     * of the array using the pivots provided.
      */
     @FunctionalInterface
     interface PartitionOperation<A> {
         /**
-         * Partitions the specified range of an array using the given pivots.
+         * Partitions the specified range of the array using the given pivots.
          *
          * @param a the array to be sorted
          * @param low the index of the first element, inclusive, to be sorted
@@ -180,7 +179,7 @@ interface PartitionOperation<A> {
     }
 
     /**
-     * Partitions the specified range of an array using the two pivots provided.
+     * Partitions the specified range of the array using the two pivots provided.
      *
      * @param elemType the class of the array to be sorted
      * @param array the array to be sorted
@@ -195,7 +194,7 @@ interface PartitionOperation<A> {
      */
     @IntrinsicCandidate
     @ForceInline
-    private static <A> int[] arrayPartition(Class<?> elemType, A array, long offset, int low, int high, int indexPivot1, int indexPivot2, PartitionOperation<A> po) {
+    private static <A> int[] partition(Class<?> elemType, A array, long offset, int low, int high, int indexPivot1, int indexPivot2, PartitionOperation<A> po) {
         return po.partition(array, low, high, indexPivot1, indexPivot2);
     }
 
@@ -255,14 +254,13 @@ static void sort(int[] a, int parallelism, int low, int high) {
      * @param high the index of the last element, exclusive, to be sorted
      */
     static void sort(Sorter sorter, int[] a, int bits, int low, int high) {
-        int[] pivotIndices;
         while (true) {
             int end = high - 1, size = high - low;
             /*
              * Run mixed insertion sort on small non-leftmost parts.
              */
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
-                arraySort(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort);
+                sort(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort);
                 return;
             }
 
@@ -270,7 +268,7 @@ static void sort(Sorter sorter, int[] a, int bits, int low, int high) {
              * Invoke insertion sort on small leftmost part.
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
-                arraySort(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort);
+                sort(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort);
                 return;
             }
 
@@ -356,7 +354,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * the pivots. These values are inexpensive approximation
                  * of tertiles. Note, that pivot1 < pivot2.
                  */
-                pivotIndices = arrayPartition(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot);
+                int[] pivotIndices = partition(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
 
@@ -380,7 +378,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * Use the third of the five sorted elements as the pivot.
                  * This value is inexpensive approximation of the median.
                  */
-                pivotIndices = arrayPartition(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot);
+                int[] pivotIndices = partition(int.class, a, Unsafe.ARRAY_INT_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -399,7 +397,7 @@ && tryMergeRuns(sorter, a, low, size)) {
     }
 
     /**
-     * Partitions the specified range of an array using the two pivots provided.
+     * Partitions the specified range of the array using the two pivots provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
@@ -488,7 +486,7 @@ private static int[] partitionDualPivot(int[] a, int low, int high, int indexPiv
 
 
     /**
-     * Partitions the specified range of an array using a single pivot provided.
+     * Partitions the specified range of the array using a single pivot provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
@@ -1060,7 +1058,6 @@ static void sort(long[] a, int parallelism, int low, int high) {
      * @param high the index of the last element, exclusive, to be sorted
      */
     static void sort(Sorter sorter, long[] a, int bits, int low, int high) {
-        int[] pivotIndices;
         while (true) {
             int end = high - 1, size = high - low;
 
@@ -1068,7 +1065,7 @@ static void sort(Sorter sorter, long[] a, int bits, int low, int high) {
              * Run mixed insertion sort on small non-leftmost parts.
              */
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
-                arraySort(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort);
+                sort(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort);
                 return;
             }
 
@@ -1076,7 +1073,7 @@ static void sort(Sorter sorter, long[] a, int bits, int low, int high) {
              * Invoke insertion sort on small leftmost part.
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
-                arraySort(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort);
+                sort(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort);
                 return;
             }
 
@@ -1156,14 +1153,14 @@ && tryMergeRuns(sorter, a, low, size)) {
             /*
              * Partitioning with 2 pivots in case of different elements.
              */
-            if(a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) {
+            if (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) {
 
                 /*
                  * Use the first and fifth of the five sorted elements as
                  * the pivots. These values are inexpensive approximation
                  * of tertiles. Note, that pivot1 < pivot2.
                  */
-                pivotIndices = arrayPartition(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot);
+                int[] pivotIndices = partition(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -1184,7 +1181,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * Use the third of the five sorted elements as the pivot.
                  * This value is inexpensive approximation of the median.
                  */
-                pivotIndices = arrayPartition(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot);
+                int[] pivotIndices = partition(long.class, a, Unsafe.ARRAY_LONG_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -1203,7 +1200,7 @@ && tryMergeRuns(sorter, a, low, size)) {
     }
 
     /**
-     * Partitions the specified range of an array using the two pivots provided.
+     * Partitions the specified range of the array using the two pivots provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
@@ -1291,7 +1288,7 @@ private static int[] partitionDualPivot(long[] a, int low, int high, int indexPi
 
 
     /**
-     * Partitions the specified range of an array using a single pivot provided.
+     * Partitions the specified range of the array using a single pivot provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
@@ -2651,7 +2648,6 @@ static void sort(float[] a, int parallelism, int low, int high) {
      * @param high the index of the last element, exclusive, to be sorted
      */
     static void sort(Sorter sorter, float[] a, int bits, int low, int high) {
-        int[] pivotIndices;
         while (true) {
             int end = high - 1, size = high - low;
 
@@ -2659,7 +2655,7 @@ static void sort(Sorter sorter, float[] a, int bits, int low, int high) {
              * Run mixed insertion sort on small non-leftmost parts.
              */
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
-                arraySort(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort);
+                sort(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort);
                 return;
             }
 
@@ -2667,7 +2663,7 @@ static void sort(Sorter sorter, float[] a, int bits, int low, int high) {
              * Invoke insertion sort on small leftmost part.
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
-                arraySort(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort);
+                sort(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort);
                 return;
             }
 
@@ -2747,14 +2743,14 @@ && tryMergeRuns(sorter, a, low, size)) {
             /*
              * Partitioning with 2 pivots in case of different elements.
              */
-            if(a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) {
+            if (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) {
 
                 /*
                  * Use the first and fifth of the five sorted elements as
                  * the pivots. These values are inexpensive approximation
                  * of tertiles. Note, that pivot1 < pivot2.
                  */
-                pivotIndices = arrayPartition(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot);
+                int[] pivotIndices = partition(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -2775,7 +2771,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * Use the third of the five sorted elements as the pivot.
                  * This value is inexpensive approximation of the median.
                  */
-                pivotIndices = arrayPartition(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot);
+                int[] pivotIndices = partition(float.class, a, Unsafe.ARRAY_FLOAT_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -2794,7 +2790,7 @@ && tryMergeRuns(sorter, a, low, size)) {
     }
 
     /**
-     * Partitions the specified range of an array using the two pivots provided.
+     * Partitions the specified range of the array using the two pivots provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
@@ -2882,7 +2878,7 @@ private static int[] partitionDualPivot(float[] a, int low, int high, int indexP
 
 
     /**
-     * Partitions the specified range of an array using a single pivot provided.
+     * Partitions the specified range of the array using a single pivot provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
@@ -3506,14 +3502,13 @@ static void sort(double[] a, int parallelism, int low, int high) {
      * @param high the index of the last element, exclusive, to be sorted
      */
     static void sort(Sorter sorter, double[] a, int bits, int low, int high) {
-        int[] pivotIndices;
         while (true) {
             int end = high - 1, size = high - low;
             /*
              * Run mixed insertion sort on small non-leftmost parts.
              */
             if (size < MAX_MIXED_INSERTION_SORT_SIZE + bits && (bits & 1) > 0) {
-                arraySort(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort);
+                sort(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, DualPivotQuicksort::mixedInsertionSort);
                 return;
             }
 
@@ -3521,7 +3516,7 @@ static void sort(Sorter sorter, double[] a, int bits, int low, int high) {
              * Invoke insertion sort on small leftmost part.
              */
             if (size < MAX_INSERTION_SORT_SIZE) {
-                arraySort(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort);
+                sort(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, DualPivotQuicksort::insertionSort);
                 return;
             }
 
@@ -3601,14 +3596,14 @@ && tryMergeRuns(sorter, a, low, size)) {
             /*
              * Partitioning with 2 pivots in case of different elements.
              */
-            if(a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) {
+            if (a[e1] < a[e2] && a[e2] < a[e3] && a[e3] < a[e4] && a[e4] < a[e5]) {
 
                 /*
                 * Use the first and fifth of the five sorted elements as
                 * the pivots. These values are inexpensive approximation
                 * of tertiles. Note, that pivot1 < pivot2.
                 */
-                pivotIndices = arrayPartition(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot);
+                int[] pivotIndices = partition(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, e1, e5, DualPivotQuicksort::partitionDualPivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
                 /*
@@ -3629,7 +3624,7 @@ && tryMergeRuns(sorter, a, low, size)) {
                  * Use the third of the five sorted elements as the pivot.
                  * This value is inexpensive approximation of the median.
                  */
-                pivotIndices = arrayPartition(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot);
+                int[] pivotIndices = partition(double.class, a, Unsafe.ARRAY_DOUBLE_BASE_OFFSET, low, high, e3, e3, DualPivotQuicksort::partitionSinglePivot);
                 lower = pivotIndices[0];
                 upper = pivotIndices[1];
 
@@ -3649,7 +3644,7 @@ && tryMergeRuns(sorter, a, low, size)) {
     }
 
     /**
-     * Partitions the specified range of an array using the two pivots provided.
+     * Partitions the specified range of the array using the two pivots provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
@@ -3738,7 +3733,7 @@ private static int[] partitionDualPivot(double[] a, int low, int high, int index
 
 
     /**
-     * Partitions the specified range of an array using a single pivot provided.
+     * Partitions the specified range of the array using a single pivot provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning

From 3e0b8cfcc380d6ff9b0511eb763d7f7a49c541f9 Mon Sep 17 00:00:00 2001
From: Srinivas Vamsi Parasa <srinivas.vamsi.parasa@intel.com>
Date: Mon, 18 Sep 2023 18:52:14 -0700
Subject: [PATCH 37/40] Update DualPivotQuicksort.java

---
 src/java.base/share/classes/java/util/DualPivotQuicksort.java | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
index 3fa87815596db..f93507ea8709f 100644
--- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java
+++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
@@ -31,7 +31,6 @@
 import jdk.internal.vm.annotation.IntrinsicCandidate;
 import jdk.internal.vm.annotation.ForceInline;
 
-
 /**
  * This class implements powerful and fully optimized versions, both
  * sequential and parallel, of the Dual-Pivot Quicksort algorithm by
@@ -52,8 +51,6 @@
  *
  * @since 1.7 * 14
  */
-
-
 final class DualPivotQuicksort {
 
     /**

From b04cb6c3c41c7327f9dc67653e24b08693329e3e Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Wed, 20 Sep 2023 10:11:28 -0700
Subject: [PATCH 38/40] change variable names of indexPivot* to pivotIndex*

---
 .../classes/java/util/DualPivotQuicksort.java | 92 +++++++++----------
 1 file changed, 43 insertions(+), 49 deletions(-)

diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
index f93507ea8709f..4675b8f8d9ff3 100644
--- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java
+++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
@@ -169,10 +169,10 @@ interface PartitionOperation<A> {
          * @param a the array to be sorted
          * @param low the index of the first element, inclusive, to be sorted
          * @param high the index of the last element, exclusive, to be sorted
-         * @param indexPivot1 the index of pivot1, the first pivot
-         * @param indexPivot2 the index of pivot2, the second pivot
+         * @param pivotIndex1 the index of pivot1, the first pivot
+         * @param pivotIndex2 the index of pivot2, the second pivot
          */
-        int[] partition(A a, int low, int high, int indexPivot1, int indexPivot2);
+        int[] partition(A a, int low, int high, int pivotIndex1, int pivotIndex2);
     }
 
     /**
@@ -185,14 +185,14 @@ interface PartitionOperation<A> {
      * address pointing to the first element to partition from.
      * @param low the index of the first element, inclusive, to be sorted
      * @param high the index of the last element, exclusive, to be sorted
-     * @param indexPivot1 the index of pivot1, the first pivot
-     * @param indexPivot2 the index of pivot2, the second pivot
+     * @param pivotIndex1 the index of pivot1, the first pivot
+     * @param pivotIndex2 the index of pivot2, the second pivot
      * @param po the method reference for the fallback implementation
      */
     @IntrinsicCandidate
     @ForceInline
-    private static <A> int[] partition(Class<?> elemType, A array, long offset, int low, int high, int indexPivot1, int indexPivot2, PartitionOperation<A> po) {
-        return po.partition(array, low, high, indexPivot1, indexPivot2);
+    private static <A> int[] partition(Class<?> elemType, A array, long offset, int low, int high, int pivotIndex1, int pivotIndex2, PartitionOperation<A> po) {
+        return po.partition(array, low, high, pivotIndex1, pivotIndex2);
     }
 
     /**
@@ -399,18 +399,18 @@ && tryMergeRuns(sorter, a, low, size)) {
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
      * @param high the index of the last element, exclusive, for partitioning
-     * @param indexPivot1 the index of pivot1, the first pivot
-     * @param indexPivot2 the index of pivot2, the second pivot
+     * @param pivotIndex1 the index of pivot1, the first pivot
+     * @param pivotIndex2 the index of pivot2, the second pivot
      *
      */
     @ForceInline
-    private static int[] partitionDualPivot(int[] a, int low, int high, int indexPivot1, int indexPivot2) {
+    private static int[] partitionDualPivot(int[] a, int low, int high, int pivotIndex1, int pivotIndex2) {
         int end = high - 1;
         int lower = low;
         int upper = end;
 
-        int e1 = indexPivot1;
-        int e5 = indexPivot2;
+        int e1 = pivotIndex1;
+        int e5 = pivotIndex2;
         int pivot1 = a[e1];
         int pivot2 = a[e5];
 
@@ -480,25 +480,23 @@ private static int[] partitionDualPivot(int[] a, int low, int high, int indexPiv
         return new int[] {lower, upper};
     }
 
-
-
     /**
      * Partitions the specified range of the array using a single pivot provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
      * @param high the index of the last element, exclusive, for partitioning
-     * @param indexPivot1 the index of pivot1, the first pivot
-     * @param indexPivot2 the index of pivot2, the second pivot
+     * @param pivotIndex1 the index of pivot1, the first pivot
+     * @param pivotIndex2 the index of pivot2, the second pivot
      *
      */
     @ForceInline
-    private static int[] partitionSinglePivot(int[] a, int low, int high, int indexPivot1, int indexPivot2) {
+    private static int[] partitionSinglePivot(int[] a, int low, int high, int pivotIndex1, int pivotIndex2) {
 
         int end = high - 1;
         int lower = low;
         int upper = end;
-        int e3 = indexPivot1;
+        int e3 = pivotIndex1;
         int pivot = a[e3];
 
         /*
@@ -1202,18 +1200,18 @@ && tryMergeRuns(sorter, a, low, size)) {
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
      * @param high the index of the last element, exclusive, for partitioning
-     * @param indexPivot1 the index of pivot1, the first pivot
-     * @param indexPivot2 the index of pivot2, the second pivot
+     * @param pivotIndex1 the index of pivot1, the first pivot
+     * @param pivotIndex2 the index of pivot2, the second pivot
      *
      */
     @ForceInline
-    private static int[] partitionDualPivot(long[] a, int low, int high, int indexPivot1, int indexPivot2) {
+    private static int[] partitionDualPivot(long[] a, int low, int high, int pivotIndex1, int pivotIndex2) {
         int end = high - 1;
         int lower = low;
         int upper = end;
 
-        int e1 = indexPivot1;
-        int e5 = indexPivot2;
+        int e1 = pivotIndex1;
+        int e5 = pivotIndex2;
         long pivot1 = a[e1];
         long pivot2 = a[e5];
 
@@ -1283,25 +1281,24 @@ private static int[] partitionDualPivot(long[] a, int low, int high, int indexPi
         return new int[] {lower, upper};
     }
 
-
     /**
      * Partitions the specified range of the array using a single pivot provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
      * @param high the index of the last element, exclusive, for partitioning
-     * @param indexPivot1 the index of pivot1, the first pivot
-     * @param indexPivot2 the index of pivot2, the second pivot
+     * @param pivotIndex1 the index of pivot1, the first pivot
+     * @param pivotIndex2 the index of pivot2, the second pivot
      *
      */
     @ForceInline
-    private static int[] partitionSinglePivot(long[] a, int low, int high, int indexPivot1, int indexPivot2) {
+    private static int[] partitionSinglePivot(long[] a, int low, int high, int pivotIndex1, int pivotIndex2) {
 
         int end = high - 1;
         int lower = low;
         int upper = end;
 
-        int e3 = indexPivot1;
+        int e3 = pivotIndex1;
         long pivot = a[e3];
 
         /*
@@ -2792,18 +2789,18 @@ && tryMergeRuns(sorter, a, low, size)) {
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
      * @param high the index of the last element, exclusive, for partitioning
-     * @param indexPivot1 the index of pivot1, the first pivot
-     * @param indexPivot2 the index of pivot2, the second pivot
+     * @param pivotIndex1 the index of pivot1, the first pivot
+     * @param pivotIndex2 the index of pivot2, the second pivot
      *
      */
     @ForceInline
-    private static int[] partitionDualPivot(float[] a, int low, int high, int indexPivot1, int indexPivot2) {
+    private static int[] partitionDualPivot(float[] a, int low, int high, int pivotIndex1, int pivotIndex2) {
         int end = high - 1;
         int lower = low;
         int upper = end;
 
-        int e1 = indexPivot1;
-        int e5 = indexPivot2;
+        int e1 = pivotIndex1;
+        int e5 = pivotIndex2;
         float pivot1 = a[e1];
         float pivot2 = a[e5];
 
@@ -2873,24 +2870,23 @@ private static int[] partitionDualPivot(float[] a, int low, int high, int indexP
         return new int[] {lower, upper};
     }
 
-
     /**
      * Partitions the specified range of the array using a single pivot provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
      * @param high the index of the last element, exclusive, for partitioning
-     * @param indexPivot1 the index of pivot1, the first pivot
-     * @param indexPivot2 the index of pivot2, the second pivot
+     * @param pivotIndex1 the index of pivot1, the first pivot
+     * @param pivotIndex2 the index of pivot2, the second pivot
      *
      */
     @ForceInline
-    private static int[] partitionSinglePivot(float[] a, int low, int high, int indexPivot1, int indexPivot2) {
+    private static int[] partitionSinglePivot(float[] a, int low, int high, int pivotIndex1, int pivotIndex2) {
         int end = high - 1;
         int lower = low;
         int upper = end;
 
-        int e3 = indexPivot1;
+        int e3 = pivotIndex1;
         float pivot = a[e3];
 
         /*
@@ -3646,18 +3642,18 @@ && tryMergeRuns(sorter, a, low, size)) {
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
      * @param high the index of the last element, exclusive, for partitioning
-     * @param indexPivot1 the index of pivot1, the first pivot
-     * @param indexPivot2 the index of pivot2, the second pivot
+     * @param pivotIndex1 the index of pivot1, the first pivot
+     * @param pivotIndex2 the index of pivot2, the second pivot
      *
      */
     @ForceInline
-    private static int[] partitionDualPivot(double[] a, int low, int high, int indexPivot1, int indexPivot2) {
+    private static int[] partitionDualPivot(double[] a, int low, int high, int pivotIndex1, int pivotIndex2) {
         int end = high - 1;
         int lower = low;
         int upper = end;
 
-        int e1 = indexPivot1;
-        int e5 = indexPivot2;
+        int e1 = pivotIndex1;
+        int e5 = pivotIndex2;
         double pivot1 = a[e1];
         double pivot2 = a[e5];
 
@@ -3727,25 +3723,23 @@ private static int[] partitionDualPivot(double[] a, int low, int high, int index
         return new int[] {lower, upper};
     }
 
-
-
     /**
      * Partitions the specified range of the array using a single pivot provided.
      *
      * @param array the array to be partitioned
      * @param low the index of the first element, inclusive, for partitioning
      * @param high the index of the last element, exclusive, for partitioning
-     * @param indexPivot1 the index of pivot1, the first pivot
-     * @param indexPivot2 the index of pivot2, the second pivot
+     * @param pivotIndex1 the index of pivot1, the first pivot
+     * @param pivotIndex2 the index of pivot2, the second pivot
      */
     @ForceInline
-    private static int[] partitionSinglePivot(double[] a, int low, int high, int indexPivot1, int indexPivot2) {
+    private static int[] partitionSinglePivot(double[] a, int low, int high, int pivotIndex1, int pivotIndex2) {
 
         int end = high - 1;
         int lower = low;
         int upper = end;
 
-        int e3 = indexPivot1;
+        int e3 = pivotIndex1;
         double pivot = a[e3];
 
         /*

From dbf433215121bcfa64e713951d9373607add922e Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Fri, 22 Sep 2023 09:39:18 -0700
Subject: [PATCH 39/40] Update CompileThresholdScaling only for the sort and
 partition intrinsics; update build script to remove nested if

---
 make/modules/java.base/Lib.gmk         | 30 ++++++++++++--------------
 test/jdk/java/util/Arrays/Sorting.java |  4 ++--
 2 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/make/modules/java.base/Lib.gmk b/make/modules/java.base/Lib.gmk
index 976f5e8e75582..47a41d62f9f34 100644
--- a/make/modules/java.base/Lib.gmk
+++ b/make/modules/java.base/Lib.gmk
@@ -226,7 +226,7 @@ ifeq ($(ENABLE_FALLBACK_LINKER), true)
       NAME := fallbackLinker, \
       CFLAGS := $(CFLAGS_JDKLIB) $(LIBFFI_CFLAGS), \
       LDFLAGS := $(LDFLAGS_JDKLIB) \
-                 $(call SET_SHARED_LIBRARY_ORIGIN), \
+                  $(call SET_SHARED_LIBRARY_ORIGIN), \
       LIBS := $(LIBFFI_LIBS), \
       LIBS_windows := $(LIBFFI_LIBS) ws2_32.lib, \
   ))
@@ -236,22 +236,20 @@ endif
 
 ################################################################################
 
-ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2), true+true+true)
-  ifeq ($(TOOLCHAIN_TYPE), gcc)
-    $(eval $(call SetupJdkLibrary, BUILD_LIB_SIMD_SORT, \
-        NAME := simdsort, \
-        TOOLCHAIN := TOOLCHAIN_LINK_CXX, \
-        OPTIMIZATION := HIGH, \
-        CFLAGS := $(CFLAGS_JDKLIB), \
-        CXXFLAGS := $(CXXFLAGS_JDKLIB), \
-        LDFLAGS := $(LDFLAGS_JDKLIB) \
-            $(call SET_SHARED_LIBRARY_ORIGIN), \
-        LIBS := $(LIBCXX), \
-        LIBS_linux := -lc -lm -ldl, \
-    ))
+ifeq ($(call isTargetOs, linux)+$(call isTargetCpu, x86_64)+$(INCLUDE_COMPILER2)+$(filter $(TOOLCHAIN_TYPE), gcc), true+true+true+gcc)
+  $(eval $(call SetupJdkLibrary, BUILD_LIB_SIMD_SORT, \
+      NAME := simdsort, \
+      TOOLCHAIN := TOOLCHAIN_LINK_CXX, \
+      OPTIMIZATION := HIGH, \
+      CFLAGS := $(CFLAGS_JDKLIB), \
+      CXXFLAGS := $(CXXFLAGS_JDKLIB), \
+      LDFLAGS := $(LDFLAGS_JDKLIB) \
+          $(call SET_SHARED_LIBRARY_ORIGIN), \
+      LIBS := $(LIBCXX), \
+      LIBS_linux := -lc -lm -ldl, \
+  ))
 
-    TARGETS += $(BUILD_LIB_SIMD_SORT)
-  endif
+  TARGETS += $(BUILD_LIB_SIMD_SORT)
 endif
 
 ################################################################################
diff --git a/test/jdk/java/util/Arrays/Sorting.java b/test/jdk/java/util/Arrays/Sorting.java
index 113c8a688620c..f285b0c65b72c 100644
--- a/test/jdk/java/util/Arrays/Sorting.java
+++ b/test/jdk/java/util/Arrays/Sorting.java
@@ -26,8 +26,8 @@
  * @compile/module=java.base java/util/SortingHelper.java
  * @bug 6880672 6896573 6899694 6976036 7013585 7018258 8003981 8226297
  * @build Sorting
- * @run main/othervm -XX:+UnlockDiagnosticVMOptions -XX:DisableIntrinsic=_arraySort,_arrayPartition, Sorting -shortrun
- * @run main/othervm -XX:CompileThreshold=1 -XX:-TieredCompilation Sorting -shortrun
+ * @run main/othervm -XX:+UnlockDiagnosticVMOptions -XX:DisableIntrinsic=_arraySort,_arrayPartition Sorting -shortrun
+ * @run main/othervm -XX:-TieredCompilation -XX:CompileCommand=CompileThresholdScaling,java.util.DualPivotQuicksort::sort,0.0001 Sorting -shortrun
  * @summary Exercise Arrays.sort, Arrays.parallelSort
  *
  * @author Vladimir Yaroslavskiy

From a5262d8673c3388638f45204057c7127eda87c7d Mon Sep 17 00:00:00 2001
From: vamsi-parasa <srinivas.vamsi.parasa@intel.com>
Date: Thu, 5 Oct 2023 16:29:35 -0700
Subject: [PATCH 40/40] fix code style and formatting

---
 src/hotspot/cpu/x86/stubGenerator_x86_64.cpp  |  16 +-
 src/hotspot/share/opto/library_call.cpp       |  13 +-
 .../native/libsimdsort/avx512-32bit-qsort.hpp |   2 -
 .../native/libsimdsort/avx512-64bit-common.h  |   2 -
 .../native/libsimdsort/avx512-64bit-qsort.hpp |   2 -
 .../native/libsimdsort/avx512-common-qsort.h  |   2 -
 ...t_linux_x86.cpp => avx512-linux-qsort.cpp} |   2 -
 .../classes/java/util/DualPivotQuicksort.java | 396 +++++++++---------
 8 files changed, 211 insertions(+), 224 deletions(-)
 rename src/java.base/linux/native/libsimdsort/{avxsort_linux_x86.cpp => avx512-linux-qsort.cpp} (98%)

diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
index c76d5ce064914..79ebef8b58113 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -4173,14 +4173,13 @@ void StubGenerator::generate_compiler_stubs() {
   }
 
   // Load x86_64_sort library on supported hardware to enable avx512 sort and partition intrinsics
-    if (UseAVX > 2 && VM_Version::supports_avx512dq()) {
-
-      void *libsimdsort = nullptr;
-      char ebuf_[1024];
-      char dll_name_simd_sort[JVM_MAXPATHLEN];
-      if (os::dll_locate_lib(dll_name_simd_sort, sizeof(dll_name_simd_sort), Arguments::get_dll_dir(), "simdsort")) {
-        libsimdsort = os::dll_load(dll_name_simd_sort, ebuf_, sizeof ebuf_);
-      }
+  if (UseAVX > 2 && VM_Version::supports_avx512dq()) {
+    void *libsimdsort = nullptr;
+    char ebuf_[1024];
+    char dll_name_simd_sort[JVM_MAXPATHLEN];
+    if (os::dll_locate_lib(dll_name_simd_sort, sizeof(dll_name_simd_sort), Arguments::get_dll_dir(), "simdsort")) {
+      libsimdsort = os::dll_load(dll_name_simd_sort, ebuf_, sizeof ebuf_);
+    }
     // Get addresses for avx512 sort and partition routines
     if (libsimdsort != nullptr) {
       log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "simdsort" JNI_LIB_SUFFIX, p2i(libsimdsort));
@@ -4190,7 +4189,6 @@ void StubGenerator::generate_compiler_stubs() {
 
       snprintf(ebuf_, sizeof(ebuf_), "avx512_partition");
       StubRoutines::_array_partition = (address)os::dll_lookup(libsimdsort, ebuf_);
-
     }
   }
 
diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp
index d24fe5dc4495d..4a9d7fb161667 100644
--- a/src/hotspot/share/opto/library_call.cpp
+++ b/src/hotspot/share/opto/library_call.cpp
@@ -5367,9 +5367,7 @@ void LibraryCallKit::create_new_uncommon_trap(CallStaticJavaNode* uncommon_trap_
 //------------------------------inline_array_partition-----------------------
 bool LibraryCallKit::inline_array_partition() {
 
-  address stubAddr = nullptr;
-  const char *stubName;
-  stubName = "array_partition_stub";
+  const char *stubName = "array_partition_stub";
 
   Node* elementType     = null_check(argument(0));
   Node* obj             = argument(1);
@@ -5382,6 +5380,7 @@ bool LibraryCallKit::inline_array_partition() {
   const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr();
   ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type();
   BasicType bt = elem_type->basic_type();
+  address stubAddr = nullptr;
   stubAddr = StubRoutines::select_array_partition_function();
   // stub not loaded
   if (stubAddr == nullptr) {
@@ -5395,10 +5394,9 @@ bool LibraryCallKit::inline_array_partition() {
   Node* obj_adr = make_unsafe_address(obj, offset);
 
   // create the pivotIndices array of type int and size = 2
-  Node* pivotIndices = nullptr;
   Node* size = intcon(2);
   Node* klass_node = makecon(TypeKlassPtr::make(ciTypeArrayKlass::make(T_INT)));
-  pivotIndices = new_array(klass_node, size, 0);  // no arguments to push
+  Node* pivotIndices = new_array(klass_node, size, 0);  // no arguments to push
   AllocateArrayNode* alloc = tightly_coupled_allocation(pivotIndices);
   guarantee(alloc != nullptr, "created above");
   Node* pivotIndices_adr = basic_plus_adr(pivotIndices, arrayOopDesc::base_offset_in_bytes(T_INT));
@@ -5409,7 +5407,8 @@ bool LibraryCallKit::inline_array_partition() {
   // Call the stub
   make_runtime_call(RC_LEAF|RC_NO_FP, OptoRuntime::array_partition_Type(),
                     stubAddr, stubName, TypePtr::BOTTOM,
-                    obj_adr, elemType, fromIndex, toIndex, pivotIndices_adr, indexPivot1, indexPivot2);
+                    obj_adr, elemType, fromIndex, toIndex, pivotIndices_adr,
+                    indexPivot1, indexPivot2);
 
   if (!stopped()) {
     set_result(pivotIndices);
@@ -5422,7 +5421,6 @@ bool LibraryCallKit::inline_array_partition() {
 //------------------------------inline_array_sort-----------------------
 bool LibraryCallKit::inline_array_sort() {
 
-  address stubAddr = nullptr;
   const char *stubName;
   stubName = "arraysort_stub";
 
@@ -5435,6 +5433,7 @@ bool LibraryCallKit::inline_array_sort() {
   const TypeInstPtr* elem_klass = gvn().type(elementType)->isa_instptr();
   ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type();
   BasicType bt = elem_type->basic_type();
+  address stubAddr = nullptr;
   stubAddr = StubRoutines::select_arraysort_function();
   //stub not loaded
   if (stubAddr == nullptr) {
diff --git a/src/java.base/linux/native/libsimdsort/avx512-32bit-qsort.hpp b/src/java.base/linux/native/libsimdsort/avx512-32bit-qsort.hpp
index 15e406a822900..4fbe9b97450c6 100644
--- a/src/java.base/linux/native/libsimdsort/avx512-32bit-qsort.hpp
+++ b/src/java.base/linux/native/libsimdsort/avx512-32bit-qsort.hpp
@@ -1,8 +1,6 @@
 /*
  * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
  * Copyright (c) 2021 Serge Sans Paille. All rights reserved.
- * Intel x86-simd-sort source code.
- *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
diff --git a/src/java.base/linux/native/libsimdsort/avx512-64bit-common.h b/src/java.base/linux/native/libsimdsort/avx512-64bit-common.h
index bb7553229eacb..9993cd22e6377 100644
--- a/src/java.base/linux/native/libsimdsort/avx512-64bit-common.h
+++ b/src/java.base/linux/native/libsimdsort/avx512-64bit-common.h
@@ -1,7 +1,5 @@
 /*
  * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
- * Intel x86-simd-sort source code.
- *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
diff --git a/src/java.base/linux/native/libsimdsort/avx512-64bit-qsort.hpp b/src/java.base/linux/native/libsimdsort/avx512-64bit-qsort.hpp
index 3028f45a79407..e28ebe19695de 100644
--- a/src/java.base/linux/native/libsimdsort/avx512-64bit-qsort.hpp
+++ b/src/java.base/linux/native/libsimdsort/avx512-64bit-qsort.hpp
@@ -1,7 +1,5 @@
 /*
  * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
- * Intel x86-simd-sort source code.
- *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
diff --git a/src/java.base/linux/native/libsimdsort/avx512-common-qsort.h b/src/java.base/linux/native/libsimdsort/avx512-common-qsort.h
index 16aeb0d50a30f..b008bcd54b80c 100644
--- a/src/java.base/linux/native/libsimdsort/avx512-common-qsort.h
+++ b/src/java.base/linux/native/libsimdsort/avx512-common-qsort.h
@@ -1,8 +1,6 @@
 /*
  * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
  * Copyright (c) 2021 Serge Sans Paille. All rights reserved.
- * Intel x86-simd-sort source code.
- *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
diff --git a/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp b/src/java.base/linux/native/libsimdsort/avx512-linux-qsort.cpp
similarity index 98%
rename from src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp
rename to src/java.base/linux/native/libsimdsort/avx512-linux-qsort.cpp
index a4ac2a8e4955f..6bd0c5871d6cb 100644
--- a/src/java.base/linux/native/libsimdsort/avxsort_linux_x86.cpp
+++ b/src/java.base/linux/native/libsimdsort/avx512-linux-qsort.cpp
@@ -1,7 +1,5 @@
 /*
  * Copyright (c) 2023 Intel Corporation. All rights reserved.
- * Intel x86-simd-sort source code.
- *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
diff --git a/src/java.base/share/classes/java/util/DualPivotQuicksort.java b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
index 4675b8f8d9ff3..0dd4b6e354aed 100644
--- a/src/java.base/share/classes/java/util/DualPivotQuicksort.java
+++ b/src/java.base/share/classes/java/util/DualPivotQuicksort.java
@@ -415,40 +415,40 @@ private static int[] partitionDualPivot(int[] a, int low, int high, int pivotInd
         int pivot2 = a[e5];
 
         /*
-        * The first and the last elements to be sorted are moved
-        * to the locations formerly occupied by the pivots. When
-        * partitioning is completed, the pivots are swapped back
-        * into their final positions, and excluded from the next
-        * subsequent sorting.
-        */
+         * The first and the last elements to be sorted are moved
+         * to the locations formerly occupied by the pivots. When
+         * partitioning is completed, the pivots are swapped back
+         * into their final positions, and excluded from the next
+         * subsequent sorting.
+         */
         a[e1] = a[lower];
         a[e5] = a[upper];
 
         /*
-        * Skip elements, which are less or greater than the pivots.
-        */
+         * Skip elements, which are less or greater than the pivots.
+         */
         while (a[++lower] < pivot1);
         while (a[--upper] > pivot2);
 
         /*
-        * Backward 3-interval partitioning
-        *
-        *   left part                 central part          right part
-        * +------------------------------------------------------------+
-                 * |  < pivot1  |   ?   |  pivot1 <= && <= pivot2  |  > pivot2  |
-        * +------------------------------------------------------------+
-        *             ^       ^                            ^
-        *             |       |                            |
-        *           lower     k                          upper
-        *
-        * Invariants:
-        *
-        *              all in (low, lower] < pivot1
-        *    pivot1 <= all in (k, upper)  <= pivot2
-        *              all in [upper, end) > pivot2
-        *
-        * Pointer k is the last index of ?-part
-        */
+         * Backward 3-interval partitioning
+         *
+         *   left part                 central part          right part
+         * +------------------------------------------------------------+
+         * |  < pivot1  |   ?   |  pivot1 <= && <= pivot2  |  > pivot2  |
+         * +------------------------------------------------------------+
+         *             ^       ^                            ^
+         *             |       |                            |
+         *           lower     k                          upper
+         *
+         * Invariants:
+         *
+         *              all in (low, lower] < pivot1
+         *    pivot1 <= all in (k, upper)  <= pivot2
+         *              all in [upper, end) > pivot2
+         *
+         * Pointer k is the last index of ?-part
+         */
         for (int unused = --lower, k = ++upper; --k > lower; ) {
             int ak = a[k];
 
@@ -500,33 +500,33 @@ private static int[] partitionSinglePivot(int[] a, int low, int high, int pivotI
         int pivot = a[e3];
 
         /*
-        * The first element to be sorted is moved to the
-        * location formerly occupied by the pivot. After
-        * completion of partitioning the pivot is swapped
-        * back into its final position, and excluded from
-        * the next subsequent sorting.
-        */
+         * The first element to be sorted is moved to the
+         * location formerly occupied by the pivot. After
+         * completion of partitioning the pivot is swapped
+         * back into its final position, and excluded from
+         * the next subsequent sorting.
+         */
         a[e3] = a[lower];
 
         /*
-        * Traditional 3-way (Dutch National Flag) partitioning
-        *
-        *   left part                 central part    right part
-        * +------------------------------------------------------+
-        * |   < pivot   |     ?     |   == pivot   |   > pivot   |
-        * +------------------------------------------------------+
-        *              ^           ^                ^
-        *              |           |                |
-        *            lower         k              upper
-        *
-        * Invariants:
-        *
-        *   all in (low, lower] < pivot
-        *   all in (k, upper)  == pivot
-        *   all in [upper, end] > pivot
-        *
-        * Pointer k is the last index of ?-part
-        */
+         * Traditional 3-way (Dutch National Flag) partitioning
+         *
+         *   left part                 central part    right part
+         * +------------------------------------------------------+
+         * |   < pivot   |     ?     |   == pivot   |   > pivot   |
+         * +------------------------------------------------------+
+         *              ^           ^                ^
+         *              |           |                |
+         *            lower         k              upper
+         *
+         * Invariants:
+         *
+         *   all in (low, lower] < pivot
+         *   all in (k, upper)  == pivot
+         *   all in [upper, end] > pivot
+         *
+         * Pointer k is the last index of ?-part
+         */
         for (int k = ++upper; --k > lower; ) {
             int ak = a[k];
 
@@ -547,8 +547,8 @@ private static int[] partitionSinglePivot(int[] a, int low, int high, int pivotI
         }
 
         /*
-        * Swap the pivot into its final position.
-        */
+         * Swap the pivot into its final position.
+         */
         a[low] = a[lower]; a[lower] = pivot;
         return new int[] {lower, upper};
     }
@@ -1216,40 +1216,40 @@ private static int[] partitionDualPivot(long[] a, int low, int high, int pivotIn
         long pivot2 = a[e5];
 
         /*
-        * The first and the last elements to be sorted are moved
-        * to the locations formerly occupied by the pivots. When
-        * partitioning is completed, the pivots are swapped back
-        * into their final positions, and excluded from the next
-        * subsequent sorting.
-        */
+         * The first and the last elements to be sorted are moved
+         * to the locations formerly occupied by the pivots. When
+         * partitioning is completed, the pivots are swapped back
+         * into their final positions, and excluded from the next
+         * subsequent sorting.
+         */
         a[e1] = a[lower];
         a[e5] = a[upper];
 
         /*
-        * Skip elements, which are less or greater than the pivots.
-        */
+         * Skip elements, which are less or greater than the pivots.
+         */
         while (a[++lower] < pivot1);
         while (a[--upper] > pivot2);
 
         /*
-        * Backward 3-interval partitioning
-        *
-        *   left part                 central part          right part
-        * +------------------------------------------------------------+
-                 * |  < pivot1  |   ?   |  pivot1 <= && <= pivot2  |  > pivot2  |
-        * +------------------------------------------------------------+
-        *             ^       ^                            ^
-        *             |       |                            |
-        *           lower     k                          upper
-        *
-        * Invariants:
-        *
-        *              all in (low, lower] < pivot1
-        *    pivot1 <= all in (k, upper)  <= pivot2
-        *              all in [upper, end) > pivot2
-        *
-        * Pointer k is the last index of ?-part
-        */
+         * Backward 3-interval partitioning
+         *
+         *   left part                 central part          right part
+         * +------------------------------------------------------------+
+         * |  < pivot1  |   ?   |  pivot1 <= && <= pivot2  |  > pivot2  |
+         * +------------------------------------------------------------+
+         *             ^       ^                            ^
+         *             |       |                            |
+         *           lower     k                          upper
+         *
+         * Invariants:
+         *
+         *              all in (low, lower] < pivot1
+         *    pivot1 <= all in (k, upper)  <= pivot2
+         *              all in [upper, end) > pivot2
+         *
+         * Pointer k is the last index of ?-part
+         */
         for (int unused = --lower, k = ++upper; --k > lower; ) {
             long ak = a[k];
 
@@ -1302,33 +1302,33 @@ private static int[] partitionSinglePivot(long[] a, int low, int high, int pivot
         long pivot = a[e3];
 
         /*
-        * The first element to be sorted is moved to the
-        * location formerly occupied by the pivot. After
-        * completion of partitioning the pivot is swapped
-        * back into its final position, and excluded from
-        * the next subsequent sorting.
-        */
+         * The first element to be sorted is moved to the
+         * location formerly occupied by the pivot. After
+         * completion of partitioning the pivot is swapped
+         * back into its final position, and excluded from
+         * the next subsequent sorting.
+         */
         a[e3] = a[lower];
 
         /*
-        * Traditional 3-way (Dutch National Flag) partitioning
-        *
-        *   left part                 central part    right part
-        * +------------------------------------------------------+
-        * |   < pivot   |     ?     |   == pivot   |   > pivot   |
-        * +------------------------------------------------------+
-        *              ^           ^                ^
-        *              |           |                |
-        *            lower         k              upper
-        *
-        * Invariants:
-        *
-        *   all in (low, lower] < pivot
-        *   all in (k, upper)  == pivot
-        *   all in [upper, end] > pivot
-        *
-        * Pointer k is the last index of ?-part
-        */
+         * Traditional 3-way (Dutch National Flag) partitioning
+         *
+         *   left part                 central part    right part
+         * +------------------------------------------------------+
+         * |   < pivot   |     ?     |   == pivot   |   > pivot   |
+         * +------------------------------------------------------+
+         *              ^           ^                ^
+         *              |           |                |
+         *            lower         k              upper
+         *
+         * Invariants:
+         *
+         *   all in (low, lower] < pivot
+         *   all in (k, upper)  == pivot
+         *   all in [upper, end] > pivot
+         *
+         * Pointer k is the last index of ?-part
+         */
         for (int k = ++upper; --k > lower; ) {
             long ak = a[k];
 
@@ -1349,8 +1349,8 @@ private static int[] partitionSinglePivot(long[] a, int low, int high, int pivot
         }
 
         /*
-            * Swap the pivot into its final position.
-            */
+         * Swap the pivot into its final position.
+         */
         a[low] = a[lower]; a[lower] = pivot;
         return new int[] {lower, upper};
     }
@@ -2805,40 +2805,40 @@ private static int[] partitionDualPivot(float[] a, int low, int high, int pivotI
         float pivot2 = a[e5];
 
         /*
-        * The first and the last elements to be sorted are moved
-        * to the locations formerly occupied by the pivots. When
-        * partitioning is completed, the pivots are swapped back
-        * into their final positions, and excluded from the next
-        * subsequent sorting.
-        */
+         * The first and the last elements to be sorted are moved
+         * to the locations formerly occupied by the pivots. When
+         * partitioning is completed, the pivots are swapped back
+         * into their final positions, and excluded from the next
+         * subsequent sorting.
+         */
         a[e1] = a[lower];
         a[e5] = a[upper];
 
         /*
-        * Skip elements, which are less or greater than the pivots.
-        */
-                while (a[++lower] < pivot1);
-                while (a[--upper] > pivot2);
+         * Skip elements, which are less or greater than the pivots.
+         */
+        while (a[++lower] < pivot1);
+        while (a[--upper] > pivot2);
 
         /*
-        * Backward 3-interval partitioning
-        *
-        *   left part                 central part          right part
-        * +------------------------------------------------------------+
-                 * |  < pivot1  |   ?   |  pivot1 <= && <= pivot2  |  > pivot2  |
-        * +------------------------------------------------------------+
-        *             ^       ^                            ^
-        *             |       |                            |
-        *           lower     k                          upper
-        *
-        * Invariants:
-        *
-        *              all in (low, lower] < pivot1
-        *    pivot1 <= all in (k, upper)  <= pivot2
-        *              all in [upper, end) > pivot2
-        *
-        * Pointer k is the last index of ?-part
-        */
+         * Backward 3-interval partitioning
+         *
+         *   left part                 central part          right part
+         * +------------------------------------------------------------+
+         * |  < pivot1  |   ?   |  pivot1 <= && <= pivot2  |  > pivot2  |
+         * +------------------------------------------------------------+
+         *             ^       ^                            ^
+         *             |       |                            |
+         *           lower     k                          upper
+         *
+         * Invariants:
+         *
+         *              all in (low, lower] < pivot1
+         *    pivot1 <= all in (k, upper)  <= pivot2
+         *              all in [upper, end) > pivot2
+         *
+         * Pointer k is the last index of ?-part
+         */
         for (int unused = --lower, k = ++upper; --k > lower; ) {
             float ak = a[k];
 
@@ -2890,33 +2890,33 @@ private static int[] partitionSinglePivot(float[] a, int low, int high, int pivo
         float pivot = a[e3];
 
         /*
-        * The first element to be sorted is moved to the
-        * location formerly occupied by the pivot. After
-        * completion of partitioning the pivot is swapped
-        * back into its final position, and excluded from
-        * the next subsequent sorting.
-        */
+         * The first element to be sorted is moved to the
+         * location formerly occupied by the pivot. After
+         * completion of partitioning the pivot is swapped
+         * back into its final position, and excluded from
+         * the next subsequent sorting.
+         */
         a[e3] = a[lower];
 
         /*
-        * Traditional 3-way (Dutch National Flag) partitioning
-        *
-        *   left part                 central part    right part
-        * +------------------------------------------------------+
-        * |   < pivot   |     ?     |   == pivot   |   > pivot   |
-        * +------------------------------------------------------+
-        *              ^           ^                ^
-        *              |           |                |
-        *            lower         k              upper
-        *
-        * Invariants:
-        *
-        *   all in (low, lower] < pivot
-        *   all in (k, upper)  == pivot
-        *   all in [upper, end] > pivot
-        *
-        * Pointer k is the last index of ?-part
-        */
+         * Traditional 3-way (Dutch National Flag) partitioning
+         *
+         *   left part                 central part    right part
+         * +------------------------------------------------------+
+         * |   < pivot   |     ?     |   == pivot   |   > pivot   |
+         * +------------------------------------------------------+
+         *              ^           ^                ^
+         *              |           |                |
+         *            lower         k              upper
+         *
+         * Invariants:
+         *
+         *   all in (low, lower] < pivot
+         *   all in (k, upper)  == pivot
+         *   all in [upper, end] > pivot
+         *
+         * Pointer k is the last index of ?-part
+         */
         for (int k = ++upper; --k > lower; ) {
             float ak = a[k];
 
@@ -2937,8 +2937,8 @@ private static int[] partitionSinglePivot(float[] a, int low, int high, int pivo
         }
 
         /*
-            * Swap the pivot into its final position.
-            */
+         * Swap the pivot into its final position.
+         */
         a[low] = a[lower]; a[lower] = pivot;
         return new int[] {lower, upper};
     }
@@ -3670,28 +3670,28 @@ private static int[] partitionDualPivot(double[] a, int low, int high, int pivot
         /*
         * Skip elements, which are less or greater than the pivots.
         */
-                while (a[++lower] < pivot1);
-                while (a[--upper] > pivot2);
+        while (a[++lower] < pivot1);
+        while (a[--upper] > pivot2);
 
         /*
-        * Backward 3-interval partitioning
-        *
-        *   left part                 central part          right part
-        * +------------------------------------------------------------+
-                 * |  < pivot1  |   ?   |  pivot1 <= && <= pivot2  |  > pivot2  |
-        * +------------------------------------------------------------+
-        *             ^       ^                            ^
-        *             |       |                            |
-        *           lower     k                          upper
-        *
-        * Invariants:
-        *
-        *              all in (low, lower] < pivot1
-        *    pivot1 <= all in (k, upper)  <= pivot2
-        *              all in [upper, end) > pivot2
-        *
-        * Pointer k is the last index of ?-part
-        */
+         * Backward 3-interval partitioning
+         *
+         *   left part                 central part          right part
+         * +------------------------------------------------------------+
+         * |  < pivot1  |   ?   |  pivot1 <= && <= pivot2  |  > pivot2  |
+         * +------------------------------------------------------------+
+         *             ^       ^                            ^
+         *             |       |                            |
+         *           lower     k                          upper
+         *
+         * Invariants:
+         *
+         *              all in (low, lower] < pivot1
+         *    pivot1 <= all in (k, upper)  <= pivot2
+         *              all in [upper, end) > pivot2
+         *
+         * Pointer k is the last index of ?-part
+         */
         for (int unused = --lower, k = ++upper; --k > lower; ) {
             double ak = a[k];
 
@@ -3752,24 +3752,24 @@ private static int[] partitionSinglePivot(double[] a, int low, int high, int piv
         a[e3] = a[lower];
 
         /*
-        * Traditional 3-way (Dutch National Flag) partitioning
-        *
-        *   left part                 central part    right part
-        * +------------------------------------------------------+
-        * |   < pivot   |     ?     |   == pivot   |   > pivot   |
-        * +------------------------------------------------------+
-        *              ^           ^                ^
-        *              |           |                |
-        *            lower         k              upper
-        *
-        * Invariants:
-        *
-        *   all in (low, lower] < pivot
-        *   all in (k, upper)  == pivot
-        *   all in [upper, end] > pivot
-        *
-        * Pointer k is the last index of ?-part
-        */
+         * Traditional 3-way (Dutch National Flag) partitioning
+         *
+         *   left part                 central part    right part
+         * +------------------------------------------------------+
+         * |   < pivot   |     ?     |   == pivot   |   > pivot   |
+         * +------------------------------------------------------+
+         *              ^           ^                ^
+         *              |           |                |
+         *            lower         k              upper
+         *
+         * Invariants:
+         *
+         *   all in (low, lower] < pivot
+         *   all in (k, upper)  == pivot
+         *   all in [upper, end] > pivot
+         *
+         * Pointer k is the last index of ?-part
+         */
         for (int k = ++upper; --k > lower; ) {
             double ak = a[k];
 
@@ -3790,8 +3790,8 @@ private static int[] partitionSinglePivot(double[] a, int low, int high, int piv
         }
 
         /*
-            * Swap the pivot into its final position.
-            */
+         * Swap the pivot into its final position.
+         */
         a[low] = a[lower]; a[lower] = pivot;
         return new int[] {lower, upper};
     }