diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 3dd6a270d08cc5..827f7c02abd5cb 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2298,7 +2298,7 @@ void Compiler::compSetProcessor()
instructionSetFlags.AddInstructionSet(InstructionSet_Vector512);
- if ((preferredVectorByteLength == 0) && jitFlags.IsSet(JitFlags::JIT_FLAG_VECTOR512_THROTTLING))
+ if ((preferredVectorByteLength == 0) && opts.Vector512Throttling())
{
// Some architectures can experience frequency throttling when
// executing 512-bit width instructions. To account for this we set the
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 0d24478ae115d8..db07bec2c5b702 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -9508,6 +9508,16 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
return jitFlags->IsSet(JitFlags::JIT_FLAG_REVERSE_PINVOKE);
}
+ // true if JitFlags::JIT_FLAG_VECTOR512_THROTTLING is set
+ bool Vector512Throttling()
+ {
+#if defined(TARGET_XARCH)
+ return jitFlags->IsSet(JitFlags::JIT_FLAG_VECTOR512_THROTTLING);
+#else
+ return false;
+#endif
+ }
+
bool compScopeInfo; // Generate the LocalVar info ?
bool compDbgCode; // Generate debugger-friendly code?
bool compDbgInfo; // Gather debugging info?
diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp
index 4fcae9873de1fc..9e1351a0cbad84 100644
--- a/src/coreclr/jit/hwintrinsic.cpp
+++ b/src/coreclr/jit/hwintrinsic.cpp
@@ -515,6 +515,11 @@ NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp,
}
else if (strcmp(className, "Vector512") == 0)
{
+ // If the JitFlags::JIT_FLAG_VECTOR512_THROTTLING flag is set, report Vector512 as unsupported without any further checks.
+ if (comp->opts.Vector512Throttling())
+ {
+ return NI_IsSupported_False;
+ }
isa = InstructionSet_AVX512F;
}
}
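Aside (illustration only, not part of the diff): at the managed level, the JIT change above means that on AVX-512 hardware subject to frequency throttling, `Vector512.IsHardwareAccelerated` is expected to report false while ISA-specific checks such as `Avx512F.IsSupported` can still report true. A minimal sketch, assuming the .NET 8 public APIs, to observe what a given machine reports:

```csharp
// Minimal sketch: print the SIMD support the runtime reports. On throttling-prone
// AVX-512 hardware, the expectation after this change is Avx512F.IsSupported == true
// but Vector512.IsHardwareAccelerated == false.
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

Console.WriteLine($"Avx512F.IsSupported:             {Avx512F.IsSupported}");
Console.WriteLine($"Vector512.IsHardwareAccelerated: {Vector512.IsHardwareAccelerated}");
Console.WriteLine($"Vector256.IsHardwareAccelerated: {Vector256.IsHardwareAccelerated}");
```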
diff --git a/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs b/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs
index 6df1d1957f901e..db6a16cd29beb6 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs
@@ -944,6 +944,17 @@ internal static uint ResetLowestSetBit(uint value)
return value & (value - 1);
}
+ /// <summary>
+ /// Reset the lowest significant bit in the given value
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal static ulong ResetLowestSetBit(ulong value)
+ {
+ // It's lowered to BLSR on x86
+ return value & (value - 1);
+ }
+
/// <summary>
/// Flip the bit at a specific position in a given value.
/// Similar in behavior to the x86 instruction BTC (Bit Test and Complement).
@@ -957,5 +968,19 @@ internal static uint FlipBit(uint value, int index)
{
return value ^ (1u << index);
}
+
+ /// <summary>
+ /// Flip the bit at a specific position in a given value.
+ /// Similar in behavior to the x86 instruction BTC (Bit Test and Complement).
+ /// </summary>
+ /// <param name="value">The value.</param>
+ /// <param name="index">The zero-based index of the bit to flip.
+ /// Any value outside the range [0..63] is treated as congruent mod 64.</param>
+ /// <returns>The new value.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal static ulong FlipBit(ulong value, int index)
+ {
+ return value ^ (1ul << index);
+ }
}
}
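The new `ulong` overloads exist to service the 64-bit match masks produced by `Vector512<byte>.ExtractMostSignificantBits()` in the SpanHelpers changes below. A small sketch (not part of the diff) of the bit-walking pattern they support, written against the public `BitOperations` API:

```csharp
// Walk the set bits of a 64-bit match mask from lowest to highest, clearing each
// one after it is inspected; "mask & (mask - 1)" is the same operation as the
// internal ResetLowestSetBit(ulong) (lowered to BLSR on x86).
using System;
using System.Numerics;

ulong mask = 0b1001_0100; // pretend elements 2, 4 and 7 were reported as candidates

while (mask != 0)
{
    int bitPos = BitOperations.TrailingZeroCount(mask); // lowest remaining candidate
    Console.WriteLine($"inspect candidate at element {bitPos}");
    mask &= mask - 1; // drop the bit we just handled
}
```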
diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs
index 27f15724d7baa3..5b740db4300982 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs
@@ -1791,6 +1791,21 @@ public static Vector512<T> LoadUnsafe<T>(ref T source, nuint elementOffset)
return Unsafe.ReadUnaligned<Vector512<T>>(ref Unsafe.As<T, byte>(ref source));
}
+ /// <summary>Loads a vector from the given source and reinterprets it as <see cref="ushort" />.</summary>
+ /// <param name="source">The source from which the vector will be loaded.</param>
+ /// <returns>The vector loaded from <paramref name="source" />.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal static Vector512<ushort> LoadUnsafe(ref char source) =>
+ LoadUnsafe(ref Unsafe.As<char, ushort>(ref source));
+
+ /// <summary>Loads a vector from the given source and element offset and reinterprets it as <see cref="ushort" />.</summary>
+ /// <param name="source">The source to which <paramref name="elementOffset" /> will be added before loading the vector.</param>
+ /// <param name="elementOffset">The element offset from <paramref name="source" /> from which the vector will be loaded.</param>
+ /// <returns>The vector loaded from <paramref name="source" /> plus <paramref name="elementOffset" />.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal static Vector512<ushort> LoadUnsafe(ref char source, nuint elementOffset) =>
+ LoadUnsafe(ref Unsafe.As<char, ushort>(ref source), elementOffset);
+
/// <summary>Computes the maximum of two vectors on a per-element basis.</summary>
/// <typeparam name="T">The type of the elements in the vector.</typeparam>
/// <param name="left">The vector to compare with <paramref name="right" />.</param>
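The two internal overloads above simply reinterpret `ref char` as `ref ushort` so that SpanHelpers.Char can load 512-bit chunks directly from string data. A sketch (not part of the diff) of the equivalent call through public APIs:

```csharp
// Load 32 UTF-16 code units (one Vector512<ushort>) from a char buffer by
// reinterpreting the reference, which is all the internal overload does.
using System;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;

char[] text = "The quick brown fox jumps over the lazy dog".PadRight(64).ToCharArray();

Vector512<ushort> chunk = Vector512.LoadUnsafe(ref Unsafe.As<char, ushort>(ref text[0]));
Console.WriteLine(chunk);
```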
diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs
index 88008f7195daa8..7804b74f9ae20d 100644
--- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs
@@ -63,7 +63,68 @@ ref Unsafe.Add(ref searchSpace, offset + 1),
// Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula
// Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285
SEARCH_TWO_BYTES:
- if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector256<byte>.Count >= 0)
+ if (Vector512.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector512<byte>.Count >= 0)
+ {
+ // Find the last unique (which is not equal to ch1) byte
+ // the algorithm is fine if both are equal, just a little bit less efficient
+ byte ch2Val = Unsafe.Add(ref value, valueTailLength);
+ nint ch1ch2Distance = (nint)(uint)valueTailLength;
+ while (ch2Val == value && ch1ch2Distance > 1)
+ ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
+
+ Vector512<byte> ch1 = Vector512.Create(value);
+ Vector512<byte> ch2 = Vector512.Create(ch2Val);
+
+ nint searchSpaceMinusValueTailLengthAndVector =
+ searchSpaceMinusValueTailLength - (nint)Vector512<byte>.Count;
+
+ do
+ {
+ Debug.Assert(offset >= 0);
+ // Make sure we don't go out of bounds
+ Debug.Assert(offset + ch1ch2Distance + Vector512<byte>.Count <= searchSpaceLength);
+
+ Vector512<byte> cmpCh2 = Vector512.Equals(ch2, Vector512.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
+ Vector512<byte> cmpCh1 = Vector512.Equals(ch1, Vector512.LoadUnsafe(ref searchSpace, (nuint)offset));
+ Vector512<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
+
+ // Early out: cmpAnd is all zeros
+ if (cmpAnd != Vector512<byte>.Zero)
+ {
+ goto CANDIDATE_FOUND;
+ }
+
+ LOOP_FOOTER:
+ offset += Vector512<byte>.Count;
+
+ if (offset == searchSpaceMinusValueTailLength)
+ return -1;
+
+ // Overlap with the current chunk for trailing elements
+ if (offset > searchSpaceMinusValueTailLengthAndVector)
+ offset = searchSpaceMinusValueTailLengthAndVector;
+
+ continue;
+
+ CANDIDATE_FOUND:
+ ulong mask = cmpAnd.ExtractMostSignificantBits();
+ do
+ {
+ int bitPos = BitOperations.TrailingZeroCount(mask);
+ if (valueLength == 2 || // we already matched two bytes
+ SequenceEqual(
+ ref Unsafe.Add(ref searchSpace, offset + bitPos),
+ ref value, (nuint)(uint)valueLength)) // The (nuint)-cast is necessary to pick the correct overload
+ {
+ return (int)(offset + bitPos);
+ }
+ mask = BitOperations.ResetLowestSetBit(mask); // Clear the lowest set bit
+ } while (mask != 0);
+ goto LOOP_FOOTER;
+
+ } while (true);
+ }
+ else if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector256<byte>.Count >= 0)
{
// Find the last unique (which is not equal to ch1) byte
// the algorithm is fine if both are equal, just a little bit less efficient
@@ -235,7 +296,54 @@ ref Unsafe.Add(ref searchSpace, relativeIndex + 1),
// Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula
// Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285
SEARCH_TWO_BYTES:
- if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector256<byte>.Count)
+ if (Vector512.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector512<byte>.Count)
+ {
+ offset = searchSpaceMinusValueTailLength - Vector512<byte>.Count;
+
+ // Find the last unique (which is not equal to ch1) byte
+ // the algorithm is fine if both are equal, just a little bit less efficient
+ byte ch2Val = Unsafe.Add(ref value, valueTailLength);
+ int ch1ch2Distance = valueTailLength;
+ while (ch2Val == value && ch1ch2Distance > 1)
+ ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
+
+ Vector512<byte> ch1 = Vector512.Create(value);
+ Vector512<byte> ch2 = Vector512.Create(ch2Val);
+ do
+ {
+ Vector512<byte> cmpCh1 = Vector512.Equals(ch1, Vector512.LoadUnsafe(ref searchSpace, (nuint)offset));
+ Vector512<byte> cmpCh2 = Vector512.Equals(ch2, Vector512.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
+ Vector512<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
+
+ // Early out: cmpAnd is all zeros
+ if (cmpAnd != Vector512<byte>.Zero)
+ {
+ ulong mask = cmpAnd.ExtractMostSignificantBits();
+ do
+ {
+ // unlike IndexOf, here we use LZCNT to process matches starting from the end
+ int highestSetBitIndex = 63 - BitOperations.LeadingZeroCount(mask);
+ if (valueLength == 2 || // we already matched two bytes
+ SequenceEqual(
+ ref Unsafe.Add(ref searchSpace, offset + highestSetBitIndex),
+ ref value, (nuint)(uint)valueLength)) // The (nuint)-cast is necessary to pick the correct overload
+ {
+ return highestSetBitIndex + offset;
+ }
+ // Clear the highest set bit.
+ mask = BitOperations.FlipBit(mask, highestSetBitIndex);
+ } while (mask != 0);
+ }
+
+ offset -= Vector512<byte>.Count;
+ if (offset == -Vector512<byte>.Count)
+ return -1;
+ // Overlap with the current chunk if there is not enough room for the next one
+ if (offset < 0)
+ offset = 0;
+ } while (true);
+ }
+ else if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector256<byte>.Count)
{
offset = searchSpaceMinusValueTailLength - Vector256<byte>.Count;
@@ -345,7 +453,6 @@ private static void ThrowMustBeNullTerminatedString()
internal static unsafe int IndexOfNullByte(byte* searchSpace)
{
const int Length = int.MaxValue;
-
const uint uValue = 0; // Use uint for comparisons to avoid unnecessary 8->32 extensions
nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations
nuint lengthToExamine = (nuint)(uint)Length;
@@ -416,7 +523,120 @@ internal static unsafe int IndexOfNullByte(byte* searchSpace)
// We get past SequentialScan only if IsHardwareAccelerated is true; and remain length is greater than Vector length.
// However, we still have the redundant check to allow the JIT to see that the code is unreachable and eliminate it when the platform does not
// have hardware accelerated. After processing Vector lengths we return to SequentialScan to finish any remaining.
- if (Vector256.IsHardwareAccelerated)
+ if (Vector512.IsHardwareAccelerated)
+ {
+ if (offset < (nuint)(uint)Length)
+ {
+ if ((((nuint)(uint)searchSpace + offset) & (nuint)(Vector256<byte>.Count - 1)) != 0)
+ {
+ // Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches
+ // with no upper bound e.g. String.strlen.
+ // Start with a check on Vector128 to align to Vector256, before moving to processing Vector256.
+ // This ensures we do not fault across memory pages while searching for an end of string.
+ Vector128<byte> search = Vector128.Load(searchSpace + offset);
+
+ // Same method as below
+ uint matches = Vector128.Equals(Vector128<byte>.Zero, search).ExtractMostSignificantBits();
+ if (matches == 0)
+ {
+ // Zero flags set so no matches
+ offset += (nuint)Vector128<byte>.Count;
+ }
+ else
+ {
+ // Find bitflag offset of first match and add to current offset
+ return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
+ }
+ }
+
+ if ((((nuint)(uint)searchSpace + offset) & (nuint)(Vector512<byte>.Count - 1)) != 0)
+ {
+ // Not currently aligned to Vector512 (is aligned to Vector256); this can cause a problem for searches
+ // with no upper bound e.g. String.strlen.
+ // Start with a check on Vector256 to align to Vector512, before moving to processing Vector512.
+ // This ensures we do not fault across memory pages while searching for an end of string.
+ Vector256<byte> search = Vector256.Load(searchSpace + offset);
+
+ // Same method as below
+ uint matches = Vector256.Equals(Vector256<byte>.Zero, search).ExtractMostSignificantBits();
+ if (matches == 0)
+ {
+ // Zero flags set so no matches
+ offset += (nuint)Vector256<byte>.Count;
+ }
+ else
+ {
+ // Find bitflag offset of first match and add to current offset
+ return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
+ }
+ }
+ lengthToExamine = GetByteVector512SpanLength(offset, Length);
+ if (lengthToExamine > offset)
+ {
+ do
+ {
+ Vector512<byte> search = Vector512.Load(searchSpace + offset);
+ ulong matches = Vector512.Equals(Vector512<byte>.Zero, search).ExtractMostSignificantBits();
+ // Note that MoveMask has converted the equal vector elements into a set of bit flags,
+ // So the bit position in 'matches' corresponds to the element offset.
+ if (matches == 0)
+ {
+ // Zero flags set so no matches
+ offset += (nuint)Vector512<byte>.Count;
+ continue;
+ }
+
+ // Find bitflag offset of first match and add to current offset
+ return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
+ } while (lengthToExamine > offset);
+ }
+
+ lengthToExamine = GetByteVector256SpanLength(offset, Length);
+ if (lengthToExamine > offset)
+ {
+ Vector256<byte> search = Vector256.Load(searchSpace + offset);
+
+ // Same method as above
+ uint matches = Vector256.Equals(Vector256<byte>.Zero, search).ExtractMostSignificantBits();
+ if (matches == 0)
+ {
+ // Zero flags set so no matches
+ offset += (nuint)Vector256<byte>.Count;
+ }
+ else
+ {
+ // Find bitflag offset of first match and add to current offset
+ return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
+ }
+ }
+
+ lengthToExamine = GetByteVector128SpanLength(offset, Length);
+ if (lengthToExamine > offset)
+ {
+ Vector128<byte> search = Vector128.Load(searchSpace + offset);
+
+ // Same method as above
+ uint matches = Vector128.Equals(Vector128<byte>.Zero, search).ExtractMostSignificantBits();
+ if (matches == 0)
+ {
+ // Zero flags set so no matches
+ offset += (nuint)Vector128<byte>.Count;
+ }
+ else
+ {
+ // Find bitflag offset of first match and add to current offset
+ return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
+ }
+ }
+
+ if (offset < (nuint)(uint)Length)
+ {
+ lengthToExamine = ((nuint)(uint)Length - offset);
+ goto SequentialScan;
+ }
+ }
+ }
+ else if (Vector256.IsHardwareAccelerated)
{
if (offset < (nuint)(uint)Length)
{
@@ -634,7 +854,37 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint l
Vector:
if (Vector128.IsHardwareAccelerated)
{
- if (Vector256.IsHardwareAccelerated && length >= (nuint)Vector256<byte>.Count)
+ if (Vector512.IsHardwareAccelerated && length >= (nuint)Vector512<byte>.Count)
+ {
+ nuint offset = 0;
+ nuint lengthToExamine = length - (nuint)Vector512<byte>.Count;
+ // Unsigned, so it shouldn't have overflowed larger than length (rather than negative)
+ Debug.Assert(lengthToExamine < length);
+ if (lengthToExamine != 0)
+ {
+ do
+ {
+ if (Vector512.LoadUnsafe(ref first, offset) !=
+ Vector512.LoadUnsafe(ref second, offset))
+ {
+ goto NotEqual;
+ }
+ offset += (nuint)Vector512<byte>.Count;
+ } while (lengthToExamine > offset);
+ }
+
+ // Do final compare as Vector512<byte>.Count from end rather than start
+ if (Vector512.LoadUnsafe(ref first, lengthToExamine) ==
+ Vector512.LoadUnsafe(ref second, lengthToExamine))
+ {
+ // C# compiler inverts this test, making the outer goto the conditional jmp.
+ goto Equal;
+ }
+
+ // This becomes a conditional jmp forward to not favor it.
+ goto NotEqual;
+ }
+ else if (Vector256.IsHardwareAccelerated && length >= (nuint)Vector256<byte>.Count)
{
nuint offset = 0;
nuint lengthToExamine = length - (nuint)Vector256<byte>.Count;
@@ -789,6 +1039,47 @@ public static unsafe int SequenceCompareTo(ref byte first, int firstLength, ref
if (Vector256.IsHardwareAccelerated)
{
+ if (Vector512.IsHardwareAccelerated && (lengthToExamine >= (nuint)Vector512<byte>.Count))
+ {
+ lengthToExamine -= (nuint)Vector512<byte>.Count;
+ ulong matches;
+ while (lengthToExamine > offset)
+ {
+ matches = Vector512.Equals(Vector512.LoadUnsafe(ref first, offset), Vector512.LoadUnsafe(ref second, offset)).ExtractMostSignificantBits();
+ // Note that MoveMask has converted the equal vector elements into a set of bit flags,
+ // So the bit position in 'matches' corresponds to the element offset.
+
+ // 64 elements in Vector512 so we compare to ulong.MaxValue to check if everything matched
+ if (matches == ulong.MaxValue)
+ {
+ // All matched
+ offset += (nuint)Vector512<byte>.Count;
+ continue;
+ }
+
+ goto Difference;
+ }
+ // Move to Vector length from end for final compare
+ offset = lengthToExamine;
+ // Same method as above
+ matches = Vector512.Equals(Vector512.LoadUnsafe(ref first, offset), Vector512.LoadUnsafe(ref second, offset)).ExtractMostSignificantBits();
+ if (matches == ulong.MaxValue)
+ {
+ // All matched
+ goto Equal;
+ }
+ Difference:
+ // Invert matches to find differences
+ ulong differences = ~matches;
+ // Find bitflag offset of first difference and add to current offset
+ offset += (uint)BitOperations.TrailingZeroCount(differences);
+
+ int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset));
+ Debug.Assert(result != 0);
+
+ return result;
+ }
+
if (lengthToExamine >= (nuint)Vector256<byte>.Count)
{
lengthToExamine -= (nuint)Vector256<byte>.Count;
@@ -1139,6 +1430,10 @@ private static nuint GetByteVector128SpanLength(nuint offset, int length)
private static nuint GetByteVector256SpanLength(nuint offset, int length)
=> (nuint)(uint)((length - (int)offset) & ~(Vector256<byte>.Count - 1));
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static nuint GetByteVector512SpanLength(nuint offset, int length)
+ => (nuint)(uint)((length - (int)offset) & ~(Vector512<byte>.Count - 1));
+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe nuint UnalignedCountVector128(byte* searchSpace)
{
@@ -1153,8 +1448,45 @@ public static void Reverse(ref byte buf, nuint length)
nint remainder = (nint)length;
nint offset = 0;
- // overlapping has a positive performance benefit around 48 elements
- if (Avx2.IsSupported && remainder >= (nint)(Vector256<byte>.Count * 1.5))
+ if (Vector512.IsHardwareAccelerated && remainder >= Vector512<byte>.Count * 2)
+ {
+ nint lastOffset = remainder - Vector512<byte>.Count;
+ do
+ {
+ // Load the values into vectors
+ Vector512<byte> tempFirst = Vector512.LoadUnsafe(ref buf, (nuint)offset);
+ Vector512<byte> tempLast = Vector512.LoadUnsafe(ref buf, (nuint)lastOffset);
+
+ // Shuffle to reverse each vector:
+ // +---------------------------------------------------------------+
+ // | A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P |
+ // +---------------------------------------------------------------+
+ // --->
+ // +---------------------------------------------------------------+
+ // | P | O | N | M | L | K | J | I | H | G | F | E | D | C | B | A |
+ // +---------------------------------------------------------------+
+ tempFirst = Vector512.Shuffle(tempFirst, Vector512.Create(
+ (byte)63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48,
+ 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32,
+ 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
+ tempLast = Vector512.Shuffle(tempLast, Vector512.Create(
+ (byte)63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48,
+ 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32,
+ 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
+
+ // Store the reversed vectors
+ tempLast.StoreUnsafe(ref buf, (nuint)offset);
+ tempFirst.StoreUnsafe(ref buf, (nuint)lastOffset);
+
+ offset += Vector512<byte>.Count;
+ lastOffset -= Vector512<byte>.Count;
+ } while (lastOffset >= offset);
+
+ remainder = lastOffset + Vector512<byte>.Count - offset;
+ }
+ else if (Avx2.IsSupported && remainder >= (nint)(Vector256<byte>.Count * 1.5))
{
Vector256<byte> reverseMask = Vector256.Create(
(byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, // first 128-bit lane
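The SEARCH_TWO_BYTES blocks above vectorize the cited "Algorithm 1: Generic SIMD": anchor on the needle's first byte plus a trailing byte at a fixed distance, and only run a full comparison where both anchors match. A scalar sketch of that control flow (not part of the diff; it omits the "last unique byte" refinement the real code applies):

```csharp
// Scalar reference for the anchor-pair idea; the Vector512 path above evaluates
// 64 of these (i, i + ch1ch2Distance) anchor pairs per loop iteration.
using System;

static int IndexOfWithAnchors(ReadOnlySpan<byte> haystack, ReadOnlySpan<byte> needle)
{
    byte ch1 = needle[0];
    int ch1ch2Distance = needle.Length - 1;
    byte ch2 = needle[ch1ch2Distance];

    for (int i = 0; i <= haystack.Length - needle.Length; i++)
    {
        // Cheap anchor test first; the full SequenceEqual runs only for surviving candidates.
        if (haystack[i] == ch1 && haystack[i + ch1ch2Distance] == ch2 &&
            haystack.Slice(i, needle.Length).SequenceEqual(needle))
        {
            return i;
        }
    }

    return -1;
}

Console.WriteLine(IndexOfWithAnchors("hello world"u8, "lo wo"u8)); // prints 3
```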
diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs
index 70216de36fd49d..9f7b57e40e6c6f 100644
--- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs
@@ -68,7 +68,74 @@ ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, offset + 1)),
// Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula
// Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285
SEARCH_TWO_CHARS:
- if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector256<ushort>.Count >= 0)
+ if (Vector512.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector512<ushort>.Count >= 0)
+ {
+ // Find the last unique (which is not equal to ch1) character
+ // the algorithm is fine if both are equal, just a little bit less efficient
+ ushort ch2Val = Unsafe.Add(ref value, valueTailLength);
+ nint ch1ch2Distance = (nint)(uint)valueTailLength;
+ while (ch2Val == valueHead && ch1ch2Distance > 1)
+ ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
+
+ Vector512<ushort> ch1 = Vector512.Create((ushort)valueHead);
+ Vector512<ushort> ch2 = Vector512.Create(ch2Val);
+
+ nint searchSpaceMinusValueTailLengthAndVector =
+ searchSpaceMinusValueTailLength - (nint)Vector512<ushort>.Count;
+
+ do
+ {
+ // Make sure we don't go out of bounds
+ Debug.Assert(offset + ch1ch2Distance + Vector512<ushort>.Count <= searchSpaceLength);
+
+ Vector512<ushort> cmpCh2 = Vector512.Equals(ch2, Vector512.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
+ Vector512<ushort> cmpCh1 = Vector512.Equals(ch1, Vector512.LoadUnsafe(ref searchSpace, (nuint)offset));
+ Vector512<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
+
+ // Early out: cmpAnd is all zeros
+ if (cmpAnd != Vector512<byte>.Zero)
+ {
+ goto CANDIDATE_FOUND;
+ }
+
+ LOOP_FOOTER:
+ offset += Vector512<ushort>.Count;
+
+ if (offset == searchSpaceMinusValueTailLength)
+ return -1;
+
+ // Overlap with the current chunk for trailing elements
+ if (offset > searchSpaceMinusValueTailLengthAndVector)
+ offset = searchSpaceMinusValueTailLengthAndVector;
+
+ continue;
+
+ CANDIDATE_FOUND:
+ ulong mask = cmpAnd.ExtractMostSignificantBits();
+ do
+ {
+ int bitPos = BitOperations.TrailingZeroCount(mask);
+ // div by 2 (shr) because we work with 2-byte chars
+ nint charPos = (nint)((uint)bitPos / 2);
+ if (valueLength == 2 || // we already matched two chars
+ SequenceEqual(
+ ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, offset + charPos)),
+ ref Unsafe.As<char, byte>(ref value), (nuint)(uint)valueLength * 2))
+ {
+ return (int)(offset + charPos);
+ }
+
+ // Clear the two lowest set bits
+ if (Bmi1.X64.IsSupported)
+ mask = Bmi1.X64.ResetLowestSetBit(Bmi1.X64.ResetLowestSetBit(mask));
+ else
+ mask &= ~(ulong)((ulong)0b11 << bitPos);
+ } while (mask != 0);
+ goto LOOP_FOOTER;
+
+ } while (true);
+ }
+ else if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector256<ushort>.Count >= 0)
{
// Find the last unique (which is not equal to ch1) character
// the algorithm is fine if both are equal, just a little bit less efficient
@@ -253,7 +320,57 @@ ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, relativeIndex + 1)),
// Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula
// Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285
SEARCH_TWO_CHARS:
- if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector256<ushort>.Count)
+ if (Vector512.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector512<ushort>.Count)
+ {
+ offset = searchSpaceMinusValueTailLength - Vector512<ushort>.Count;
+
+ // Find the last unique (which is not equal to ch1) char
+ // the algorithm is fine if both are equal, just a little bit less efficient
+ char ch2Val = Unsafe.Add(ref value, valueTailLength);
+ int ch1ch2Distance = valueTailLength;
+ while (ch2Val == valueHead && ch1ch2Distance > 1)
+ ch2Val = Unsafe.Add(ref value, --ch1ch2Distance);
+
+ Vector512<ushort> ch1 = Vector512.Create((ushort)valueHead);
+ Vector512<ushort> ch2 = Vector512.Create((ushort)ch2Val);
+
+ do
+ {
+
+ Vector512<ushort> cmpCh1 = Vector512.Equals(ch1, Vector512.LoadUnsafe(ref searchSpace, (nuint)offset));
+ Vector512<ushort> cmpCh2 = Vector512.Equals(ch2, Vector512.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance)));
+ Vector512<byte> cmpAnd = (cmpCh1 & cmpCh2).AsByte();
+
+ // Early out: cmpAnd is all zeros
+ if (cmpAnd != Vector512<byte>.Zero)
+ {
+ ulong mask = cmpAnd.ExtractMostSignificantBits();
+ do
+ {
+ // unlike IndexOf, here we use LZCNT to process matches starting from the end
+ int bitPos = 62 - BitOperations.LeadingZeroCount(mask);
+ int charPos = (int)((uint)bitPos / 2);
+
+ if (valueLength == 2 || // we already matched two chars
+ SequenceEqual(
+ ref Unsafe.As<char, byte>(ref Unsafe.Add(ref searchSpace, offset + charPos)),
+ ref Unsafe.As<char, byte>(ref value), (nuint)(uint)valueLength * 2))
+ {
+ return charPos + offset;
+ }
+ mask &= ~(ulong)((ulong)0b11 << bitPos); // clear two highest set bits.
+ } while (mask != 0);
+ }
+
+ offset -= Vector512<ushort>.Count;
+ if (offset == -Vector512<ushort>.Count)
+ return -1;
+ // Overlap with the current chunk if there is not enough room for the next one
+ if (offset < 0)
+ offset = 0;
+ } while (true);
+ }
+ else if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector256<ushort>.Count)
{
offset = searchSpaceMinusValueTailLength - Vector256<ushort>.Count;
@@ -478,7 +595,145 @@ public static unsafe int IndexOfNullCharacter(char* searchSpace)
// We get past SequentialScan only if IsHardwareAccelerated is true. However, we still have the redundant check to allow
// the JIT to see that the code is unreachable and eliminate it when the platform does not have hardware accelerated.
- if (Vector256.IsHardwareAccelerated)
+ if (Vector512.IsHardwareAccelerated)
+ {
+ if (offset < length)
+ {
+ Debug.Assert(length - offset >= Vector128<ushort>.Count);
+ if (((nint)(searchSpace + (nint)offset) & (nint)(Vector256<byte>.Count - 1)) != 0)
+ {
+ // Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches
+ // with no upper bound e.g. String.wcslen. Start with a check on Vector128 to align to Vector256,
+ // before moving to processing Vector256.
+
+ // This ensures we do not fault across memory pages
+ // while searching for an end of string. Specifically that this assumes that the length is either correct
+ // or that the data is pinned otherwise it may cause an AccessViolation from crossing a page boundary into an
+ // unowned page. If the search is unbounded (e.g. null terminator in wcslen) and the search value is not found,
+ // again this will likely cause an AccessViolation. However, correctly bounded searches will return -1 rather
+ // than ever causing an AV.
+ Vector128<ushort> search = *(Vector128<ushort>*)(searchSpace + (nuint)offset);
+
+ // Same method as below
+ uint matches = Vector128.Equals(Vector128<ushort>.Zero, search).AsByte().ExtractMostSignificantBits();
+ if (matches == 0)
+ {
+ // Zero flags set so no matches
+ offset += Vector128<ushort>.Count;
+ }
+ else
+ {
+ // Find bitflag offset of first match and add to current offset
+ return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char)));
+ }
+ }
+ if (((nint)(searchSpace + (nint)offset) & (nint)(Vector512<byte>.Count - 1)) != 0)
+ {
+ // Not currently aligned to Vector512 (is aligned to Vector256); this can cause a problem for searches
+ // with no upper bound e.g. String.wcslen. Start with a check on Vector256 to align to Vector512,
+ // before moving to processing Vector512.
+
+ // This ensures we do not fault across memory pages
+ // while searching for an end of string. Specifically that this assumes that the length is either correct
+ // or that the data is pinned otherwise it may cause an AccessViolation from crossing a page boundary into an
+ // unowned page. If the search is unbounded (e.g. null terminator in wcslen) and the search value is not found,
+ // again this will likely cause an AccessViolation. However, correctly bounded searches will return -1 rather
+ // than ever causing an AV.
+ Vector256<ushort> search = *(Vector256<ushort>*)(searchSpace + (nuint)offset);
+
+ // Same method as below
+ uint matches = Vector256.Equals(Vector256<ushort>.Zero, search).AsByte().ExtractMostSignificantBits();
+ if (matches == 0)
+ {
+ // Zero flags set so no matches
+ offset += Vector256<ushort>.Count;
+ }
+ else
+ {
+ // Find bitflag offset of first match and add to current offset
+ return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char)));
+ }
+ }
+
+ lengthToExamine = GetCharVector512SpanLength(offset, length);
+ if (lengthToExamine > 0)
+ {
+ do
+ {
+ Debug.Assert(lengthToExamine >= Vector512<ushort>.Count);
+
+ Vector512<ushort> search = *(Vector512<ushort>*)(searchSpace + (nuint)offset);
+ ulong matches = Vector512.Equals(Vector512<ushort>.Zero, search).AsByte().ExtractMostSignificantBits();
+ // Note that MoveMask has converted the equal vector elements into a set of bit flags,
+ // So the bit position in 'matches' corresponds to the element offset.
+ if (matches == 0)
+ {
+ // Zero flags set so no matches
+ offset += Vector512<ushort>.Count;
+ lengthToExamine -= Vector512<ushort>.Count;
+ continue;
+ }
+
+ // Find bitflag offset of first match and add to current offset,
+ // flags are in bytes so divide for chars
+ return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char)));
+ } while (lengthToExamine > 0);
+ }
+
+ lengthToExamine = GetCharVector256SpanLength(offset, length);
+ if (lengthToExamine > 0)
+ {
+ Debug.Assert(lengthToExamine >= Vector256<ushort>.Count);
+
+ Vector256<ushort> search = *(Vector256<ushort>*)(searchSpace + (nuint)offset);
+
+ // Same method as above
+ uint matches = Vector256.Equals(Vector256<ushort>.Zero, search).AsByte().ExtractMostSignificantBits();
+ if (matches == 0)
+ {
+ // Zero flags set so no matches
+ offset += Vector256<ushort>.Count;
+ // Don't need to change lengthToExamine here as we don't use its current value again.
+ }
+ else
+ {
+ // Find bitflag offset of first match and add to current offset,
+ // flags are in bytes so divide for chars
+ return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char)));
+ }
+ }
+
+ lengthToExamine = GetCharVector128SpanLength(offset, length);
+ if (lengthToExamine > 0)
+ {
+ Debug.Assert(lengthToExamine >= Vector128<ushort>.Count);
+
+ Vector128<ushort> search = *(Vector128<ushort>*)(searchSpace + (nuint)offset);
+
+ // Same method as above
+ uint matches = Vector128.Equals(Vector128<ushort>.Zero, search).AsByte().ExtractMostSignificantBits();
+ if (matches == 0)
+ {
+ // Zero flags set so no matches
+ offset += Vector128<ushort>.Count;
+ // Don't need to change lengthToExamine here as we don't use its current value again.
+ }
+ else
+ {
+ // Find bitflag offset of first match and add to current offset,
+ // flags are in bytes so divide for chars
+ return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char)));
+ }
+ }
+
+ if (offset < length)
+ {
+ lengthToExamine = length - offset;
+ goto SequentialScan;
+ }
+ }
+ }
+ else if (Vector256.IsHardwareAccelerated)
{
if (offset < length)
{
@@ -707,6 +962,10 @@ private static nint GetCharVector128SpanLength(nint offset, nint length)
private static nint GetCharVector256SpanLength(nint offset, nint length)
=> (length - offset) & ~(Vector256<ushort>.Count - 1);
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static nint GetCharVector512SpanLength(nint offset, nint length)
+ => (length - offset) & ~(Vector512<ushort>.Count - 1);
+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe nint UnalignedCountVector128(char* searchSpace)
{
@@ -721,8 +980,42 @@ public static void Reverse(ref char buf, nuint length)
nint remainder = (nint)length;
nint offset = 0;
+ if (Vector512.IsHardwareAccelerated && remainder >= Vector512<ushort>.Count * 2)
+ {
+ nint lastOffset = remainder - Vector512<ushort>.Count;
+ do
+ {
+ ref ushort first = ref Unsafe.As<char, ushort>(ref Unsafe.Add(ref buf, offset));
+ ref ushort last = ref Unsafe.As<char, ushort>(ref Unsafe.Add(ref buf, lastOffset));
+
+ Vector512<ushort> tempFirst = Vector512.LoadUnsafe(ref first);
+ Vector512<ushort> tempLast = Vector512.LoadUnsafe(ref last);
+
+ // Shuffle to reverse each vector:
+ // +-------------------------------+
+ // | A | B | C | D | E | F | G | H |
+ // +-------------------------------+
+ // --->
+ // +-------------------------------+
+ // | H | G | F | E | D | C | B | A |
+ // +-------------------------------+
+ tempFirst = Vector512.Shuffle(tempFirst, Vector512.Create((ushort)31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
+ tempLast = Vector512.Shuffle(tempLast, Vector512.Create((ushort)31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
+ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
+
+ // Store the reversed vectors
+ tempLast.StoreUnsafe(ref first);
+ tempFirst.StoreUnsafe(ref last);
+
+ offset += Vector512<ushort>.Count;
+ lastOffset -= Vector512<ushort>.Count;
+ } while (lastOffset >= offset);
+
+ remainder = (lastOffset + Vector512<ushort>.Count - offset);
+ }
// overlapping has a positive performance benefit around 24 elements
- if (Avx2.IsSupported && remainder >= (nint)(Vector256<ushort>.Count * 1.5))
+ else if (Avx2.IsSupported && remainder >= (nint)(Vector256<ushort>.Count * 1.5))
{
Vector256<byte> reverseMask = Vector256.Create(
(byte)14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, // first 128-bit lane
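Both Reverse paths above load one vector from the front and one from the back of the buffer, reverse each with `Vector512.Shuffle`, and store them swapped, walking inward until the two ranges meet. A self-contained sketch of that loop using public APIs (not part of the diff; the real code also handles the unvectorized remainder in the middle):

```csharp
// Front/back vectorized reverse of a 64-char buffer, mirroring the loop structure above.
using System;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;

char[] buf = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/".ToCharArray();
int count = Vector512<ushort>.Count; // 32 chars per 512-bit vector

Vector512<ushort> reverse = Vector512.Create((ushort)31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
                                             15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

ref ushort p = ref Unsafe.As<char, ushort>(ref buf[0]);
int offset = 0;
int lastOffset = buf.Length - count;
do
{
    Vector512<ushort> first = Vector512.LoadUnsafe(ref p, (nuint)offset);
    Vector512<ushort> last = Vector512.LoadUnsafe(ref p, (nuint)lastOffset);

    // Store each chunk reversed at the opposite end of the buffer.
    Vector512.Shuffle(last, reverse).StoreUnsafe(ref p, (nuint)offset);
    Vector512.Shuffle(first, reverse).StoreUnsafe(ref p, (nuint)lastOffset);

    offset += count;
    lastOffset -= count;
} while (lastOffset >= offset);

Console.WriteLine(new string(buf)); // prints the 64 characters in reverse order
```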
diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs
index 1851d1e26ffefa..2f70f00959c1c4 100644
--- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs
@@ -113,9 +113,59 @@ public static bool Contains(ref short searchSpace, short value, int length)
else
{
ref short currentSearchSpace = ref searchSpace;
+#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code
+ if (Avx512BW.IsSupported && Vector512.IsHardwareAccelerated && length > Vector512<short>.Count)
+#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
+ {
+ Vector512<byte> packedValue = Vector512.Create((byte)value);
+
+ if (length > 2 * Vector512<short>.Count)
+ {
+ // Process the input in chunks of 64 characters (2 * Vector512<short>).
+ // If the input length is a multiple of 64, don't consume the last 64 characters in this loop.
+ // Let the fallback below handle it instead. This is why the condition is
+ // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
+ ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - (2 * Vector512<short>.Count));
+
+ do
+ {
+ Vector512<short> source0 = Vector512.LoadUnsafe(ref currentSearchSpace);
+ Vector512<short> source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512<short>.Count);
+ Vector512<byte> packedSource = PackSources(source0, source1);
+ Vector512<byte> result = Vector512.Equals(packedValue, packedSource);
+
+ if (result != Vector512<byte>.Zero)
+ {
+ return true;
+ }
+
+ currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector512<short>.Count);
+ }
+ while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd));
+ }
+
+ // We have 1-64 characters remaining. Process the first and last vector in the search space.
+ // They may overlap, but we're only interested in whether any value matched.
+ {
+ ref short oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector512<short>.Count);
+
+ ref short firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd)
+ ? ref oneVectorAwayFromEnd
+ : ref currentSearchSpace;
+ Vector512<short> source0 = Vector512.LoadUnsafe(ref firstVector);
+ Vector512<short> source1 = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd);
+ Vector512<byte> packedSource = PackSources(source0, source1);
+ Vector512<byte> result = Vector512.Equals(packedValue, packedSource);
+
+ if (result != Vector512<byte>.Zero)
+ {
+ return true;
+ }
+ }
+ }
#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code
- if (Avx2.IsSupported && length > Vector256<short>.Count)
+ else if (Avx2.IsSupported && length > Vector256<short>.Count)
#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
{
Vector256<byte> packedValue = Vector256.Create((byte)value);
@@ -264,7 +314,60 @@ private static int IndexOf(ref short searchSpace, short value, int len
ref short currentSearchSpace = ref searchSpace;
#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code
- if (Avx2.IsSupported && length > Vector256<short>.Count)
+ if (Avx512BW.IsSupported && Vector512.IsHardwareAccelerated && length > Vector512<short>.Count)
+#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
+ {
+ Vector512<byte> packedValue = Vector512.Create((byte)value);
+
+ if (length > 2 * Vector512<short>.Count)
+ {
+ // Process the input in chunks of 64 characters (2 * Vector512<short>).
+ // If the input length is a multiple of 64, don't consume the last 64 characters in this loop.
+ // Let the fallback below handle it instead. This is why the condition is
+ // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
+ ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - (2 * Vector512<short>.Count));
+
+ do
+ {
+ Vector512<short> source0 = Vector512.LoadUnsafe(ref currentSearchSpace);
+ Vector512<short> source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512<short>.Count);
+ Vector512<byte> packedSource = PackSources(source0, source1);
+ Vector512<byte> result = Vector512.Equals(packedValue, packedSource);
+ result = NegateIfNeeded<TNegator>(result);
+
+ if (result != Vector512<byte>.Zero)
+ {
+ return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result);
+ }
+
+ currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector512<short>.Count);
+ }
+ while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd));
+ }
+
+ // We have 1-64 characters remaining. Process the first and last vector in the search space.
+ // They may overlap, but we'll handle that in the index calculation if we do get a match.
+ {
+ ref short oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector512<short>.Count);
+
+ ref short firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd)
+ ? ref oneVectorAwayFromEnd
+ : ref currentSearchSpace;
+
+ Vector512<short> source0 = Vector512.LoadUnsafe(ref firstVector);
+ Vector512<short> source1 = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd);
+ Vector512<byte> packedSource = PackSources(source0, source1);
+ Vector512<byte> result = Vector512.Equals(packedValue, packedSource);
+ result = NegateIfNeeded<TNegator>(result);
+
+ if (result != Vector512<byte>.Zero)
+ {
+ return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result);
+ }
+ }
+ }
+#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code
+ else if (Avx2.IsSupported && length > Vector256<short>.Count)
#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
{
Vector256<byte> packedValue = Vector256.Create((byte)value);
@@ -422,9 +525,62 @@ private static int IndexOfAny(ref short searchSpace, short value0, sho
else
{
ref short currentSearchSpace = ref searchSpace;
+#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code
+ if (Avx512BW.IsSupported && Vector512.IsHardwareAccelerated && length > Vector512<short>.Count)
+#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
+ {
+ Vector512<byte> packedValue0 = Vector512.Create((byte)value0);
+ Vector512<byte> packedValue1 = Vector512.Create((byte)value1);
+
+ if (length > 2 * Vector512<short>.Count)
+ {
+ // Process the input in chunks of 64 characters (2 * Vector512<short>).
+ // If the input length is a multiple of 64, don't consume the last 64 characters in this loop.
+ // Let the fallback below handle it instead. This is why the condition is
+ // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
+ ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - (2 * Vector512<short>.Count));
+
+ do
+ {
+ Vector512<short> source0 = Vector512.LoadUnsafe(ref currentSearchSpace);
+ Vector512<short> source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512<short>.Count);
+ Vector512<byte> packedSource = PackSources(source0, source1);
+ Vector512<byte> result = Vector512.Equals(packedValue0, packedSource) | Vector512.Equals(packedValue1, packedSource);
+ result = NegateIfNeeded<TNegator>(result);
+ if (result != Vector512<byte>.Zero)
+ {
+ return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result);
+ }
+
+ currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector512<short>.Count);
+ }
+ while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd));
+ }
+
+ // We have 1-64 characters remaining. Process the first and last vector in the search space.
+ // They may overlap, but we'll handle that in the index calculation if we do get a match.
+ {
+ ref short oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector512<short>.Count);
+
+ ref short firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd)
+ ? ref oneVectorAwayFromEnd
+ : ref currentSearchSpace;
+
+ Vector512<short> source0 = Vector512.LoadUnsafe(ref firstVector);
+ Vector512<short> source1 = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd);
+ Vector512<byte> packedSource = PackSources(source0, source1);
+ Vector512<byte> result = Vector512.Equals(packedValue0, packedSource) | Vector512.Equals(packedValue1, packedSource);
+ result = NegateIfNeeded<TNegator>(result);
+
+ if (result != Vector512<byte>.Zero)
+ {
+ return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result);
+ }
+ }
+ }
#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code
- if (Avx2.IsSupported && length > Vector256<short>.Count)
+ else if (Avx2.IsSupported && length > Vector256<short>.Count)
#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
{
Vector256<byte> packedValue0 = Vector256.Create((byte)value0);
@@ -587,7 +743,62 @@ private static int IndexOfAny(ref short searchSpace, short value0, sho
ref short currentSearchSpace = ref searchSpace;
#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code
- if (Avx2.IsSupported && length > Vector256<short>.Count)
+ if (Avx512BW.IsSupported && Vector512.IsHardwareAccelerated && length > Vector512<short>.Count)
+#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
+ {
+ Vector512<byte> packedValue0 = Vector512.Create((byte)value0);
+ Vector512<byte> packedValue1 = Vector512.Create((byte)value1);
+ Vector512<byte> packedValue2 = Vector512.Create((byte)value2);
+
+ if (length > 2 * Vector512<short>.Count)
+ {
+ // Process the input in chunks of 64 characters (2 * Vector512<short>).
+ // If the input length is a multiple of 64, don't consume the last 64 characters in this loop.
+ // Let the fallback below handle it instead. This is why the condition is
+ // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
+ ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - (2 * Vector512<short>.Count));
+
+ do
+ {
+ Vector512<short> source0 = Vector512.LoadUnsafe(ref currentSearchSpace);
+ Vector512<short> source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512<short>.Count);
+ Vector512<byte> packedSource = PackSources(source0, source1);
+ Vector512<byte> result = Vector512.Equals(packedValue0, packedSource) | Vector512.Equals(packedValue1, packedSource) | Vector512.Equals(packedValue2, packedSource);
+ result = NegateIfNeeded<TNegator>(result);
+
+ if (result != Vector512<byte>.Zero)
+ {
+ return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result);
+ }
+
+ currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector512<short>.Count);
+ }
+ while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd));
+ }
+
+ // We have 1-64 characters remaining. Process the first and last vector in the search space.
+ // They may overlap, but we'll handle that in the index calculation if we do get a match.
+ {
+ ref short oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector512<short>.Count);
+
+ ref short firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd)
+ ? ref oneVectorAwayFromEnd
+ : ref currentSearchSpace;
+
+ Vector512<short> source0 = Vector512.LoadUnsafe(ref firstVector);
+ Vector512<short> source1 = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd);
+ Vector512<byte> packedSource = PackSources(source0, source1);
+ Vector512<byte> result = Vector512.Equals(packedValue0, packedSource) | Vector512.Equals(packedValue1, packedSource) | Vector512.Equals(packedValue2, packedSource);
+ result = NegateIfNeeded<TNegator>(result);
+
+ if (result != Vector512<byte>.Zero)
+ {
+ return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result);
+ }
+ }
+ }
+#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code
+ else if (Avx2.IsSupported && length > Vector256<short>.Count)
#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
{
Vector256<byte> packedValue0 = Vector256.Create((byte)value0);
@@ -734,7 +945,61 @@ private static int IndexOfAnyInRange(ref short searchSpace, short lowI
ref short currentSearchSpace = ref searchSpace;
#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code
- if (Avx2.IsSupported && length > Vector256<short>.Count)
+ if (Avx512BW.IsSupported && Vector512.IsHardwareAccelerated && length > Vector512<short>.Count)
+#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
+ {
+ Vector512<byte> lowVector = Vector512.Create((byte)lowInclusive);
+ Vector512<byte> rangeVector = Vector512.Create((byte)rangeInclusive);
+
+ if (length > 2 * Vector512<short>.Count)
+ {
+ // Process the input in chunks of 64 characters (2 * Vector512<short>).
+ // If the input length is a multiple of 64, don't consume the last 64 characters in this loop.
+ // Let the fallback below handle it instead. This is why the condition is
+ // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
+ ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - (2 * Vector512<short>.Count));
+
+ do
+ {
+ Vector512<short> source0 = Vector512.LoadUnsafe(ref currentSearchSpace);
+ Vector512 source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512