From 7706131d30a8a70ae2b4c7341d201741592553d8 Mon Sep 17 00:00:00 2001 From: Deepak Rajendrakumaran Date: Fri, 9 Jun 2023 12:21:09 -0700 Subject: [PATCH 1/7] Adding required internal library methods to support Vector512. --- .../src/System/Numerics/BitOperations.cs | 25 +++++++++++++++++++ .../System/Runtime/Intrinsics/Vector512.cs | 15 +++++++++++ 2 files changed, 40 insertions(+) diff --git a/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs b/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs index 6df1d1957f901e..47b5e7926fa298 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs @@ -944,6 +944,17 @@ internal static uint ResetLowestSetBit(uint value) return value & (value - 1); } + /// + /// Reset specific bit in the given value + /// Reset the lowest significant bit in the given value + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static ulong ResetLowestSetBit(ulong value) + { + // It's lowered to BLSR on x86 + return value & (value - 1); + } + /// /// Flip the bit at a specific position in a given value. /// Similar in behavior to the x86 instruction BTC (Bit Test and Complement). @@ -957,5 +968,19 @@ internal static uint FlipBit(uint value, int index) { return value ^ (1u << index); } + + /// + /// Flip the bit at a specific position in a given value. + /// Similar in behavior to the x86 instruction BTC (Bit Test and Complement). + /// /// + /// The value. + /// The zero-based index of the bit to flip. + /// Any value outside the range [0..63] is treated as congruent mod 64. + /// The new value. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static ulong FlipBit(ulong value, int index) + { + return value ^ (ulong)((ulong)1u << index); + } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs index 27f15724d7baa3..5b740db4300982 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector512.cs @@ -1791,6 +1791,21 @@ public static Vector512 LoadUnsafe(ref T source, nuint elementOffset) return Unsafe.ReadUnaligned>(ref Unsafe.As(ref source)); } + /// Loads a vector from the given source and reinterprets it as . + /// The source from which the vector will be loaded. + /// The vector loaded from . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector512 LoadUnsafe(ref char source) => + LoadUnsafe(ref Unsafe.As(ref source)); + + /// Loads a vector from the given source and element offset and reinterprets it as . + /// The source to which will be added before loading the vector. + /// The element offset from from which the vector will be loaded. + /// The vector loaded from plus . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector512 LoadUnsafe(ref char source, nuint elementOffset) => + LoadUnsafe(ref Unsafe.As(ref source), elementOffset); + /// Computes the maximum of two vectors on a per-element basis. /// The type of the elements in the vector. /// The vector to compare with . 
From ff65162372ee9599934d35c32ee738fe4313c54c Mon Sep 17 00:00:00 2001 From: Deepak Rajendrakumaran Date: Fri, 9 Jun 2023 11:40:28 -0700 Subject: [PATCH 2/7] Making Vector512.IsHardwareAccelerated return 'False' on targets with Vector512Throttling issues --- src/coreclr/jit/compiler.h | 10 ++++++++++ src/coreclr/jit/hwintrinsic.cpp | 6 ++++++ 2 files changed, 16 insertions(+) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 0d24478ae115d8..db07bec2c5b702 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -9508,6 +9508,16 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX return jitFlags->IsSet(JitFlags::JIT_FLAG_REVERSE_PINVOKE); } + // true if JitFlags::JIT_FLAG_VECTOR512_THROTTLING is set to true + bool Vector512Throttling() + { +#if defined(TARGET_XARCH) + return jitFlags->IsSet(JitFlags::JIT_FLAG_VECTOR512_THROTTLING); +#else + return false; +#endif + } + bool compScopeInfo; // Generate the LocalVar info ? bool compDbgCode; // Generate debugger-friendly code? bool compDbgInfo; // Gather debugging info? diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index 4fcae9873de1fc..66642a4ce97d02 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -550,6 +550,12 @@ NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp, if (isIsaSupported && comp->compSupportsHWIntrinsic(isa)) { +#ifdef TARGET_XARCH + if ((isa == InstructionSet_Vector512) && (comp->opts.Vector512Throttling())) + { + return NI_IsSupported_False; + } +#endif if (!comp->IsTargetAbi(CORINFO_NATIVEAOT_ABI) || comp->compExactlyDependsOn(isa)) { return NI_IsSupported_True; From 8996d0df2276861253db37780662525cdd3892eb Mon Sep 17 00:00:00 2001 From: Deepak Rajendrakumaran Date: Fri, 9 Jun 2023 12:43:45 -0700 Subject: [PATCH 3/7] SpanHelper library upgrades. 
--- .../src/System/SpanHelpers.Byte.cs | 346 +++++++++++- .../src/System/SpanHelpers.Char.cs | 301 ++++++++++- .../src/System/SpanHelpers.Packed.cs | 318 ++++++++++- .../src/System/SpanHelpers.T.cs | 494 +++++++++++++++++- .../src/System/SpanHelpers.cs | 64 ++- 5 files changed, 1499 insertions(+), 24 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs index 88008f7195daa8..40bb130a500a93 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs @@ -63,7 +63,68 @@ ref Unsafe.Add(ref searchSpace, offset + 1), // Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula // Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285 SEARCH_TWO_BYTES: - if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector256.Count >= 0) + if (Vector512.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector512.Count >= 0) + { + // Find the last unique (which is not equal to ch1) byte + // the algorithm is fine if both are equal, just a little bit less efficient + byte ch2Val = Unsafe.Add(ref value, valueTailLength); + nint ch1ch2Distance = (nint)(uint)valueTailLength; + while (ch2Val == value && ch1ch2Distance > 1) + ch2Val = Unsafe.Add(ref value, --ch1ch2Distance); + + Vector512 ch1 = Vector512.Create(value); + Vector512 ch2 = Vector512.Create(ch2Val); + + nint searchSpaceMinusValueTailLengthAndVector = + searchSpaceMinusValueTailLength - (nint)Vector512.Count; + + do + { + Debug.Assert(offset >= 0); + // Make sure we don't go out of bounds + Debug.Assert(offset + ch1ch2Distance + Vector512.Count <= searchSpaceLength); + + Vector512 cmpCh2 = Vector512.Equals(ch2, Vector512.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance))); + 
Vector512 cmpCh1 = Vector512.Equals(ch1, Vector512.LoadUnsafe(ref searchSpace, (nuint)offset)); + Vector512 cmpAnd = (cmpCh1 & cmpCh2).AsByte(); + + // Early out: cmpAnd is all zeros + if (cmpAnd != Vector512.Zero) + { + goto CANDIDATE_FOUND; + } + + LOOP_FOOTER: + offset += Vector512.Count; + + if (offset == searchSpaceMinusValueTailLength) + return -1; + + // Overlap with the current chunk for trailing elements + if (offset > searchSpaceMinusValueTailLengthAndVector) + offset = searchSpaceMinusValueTailLengthAndVector; + + continue; + + CANDIDATE_FOUND: + ulong mask = cmpAnd.ExtractMostSignificantBits(); + do + { + int bitPos = BitOperations.TrailingZeroCount(mask); + if (valueLength == 2 || // we already matched two bytes + SequenceEqual( + ref Unsafe.Add(ref searchSpace, offset + bitPos), + ref value, (nuint)(uint)valueLength)) // The (nuint)-cast is necessary to pick the correct overload + { + return (int)(offset + bitPos); + } + mask = BitOperations.ResetLowestSetBit(mask); // Clear the lowest set bit + } while (mask != 0); + goto LOOP_FOOTER; + + } while (true); + } + else if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector256.Count >= 0) { // Find the last unique (which is not equal to ch1) byte // the algorithm is fine if both are equal, just a little bit less efficient @@ -235,7 +296,54 @@ ref Unsafe.Add(ref searchSpace, relativeIndex + 1), // Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula // Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285 SEARCH_TWO_BYTES: - if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector256.Count) + if (Vector512.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector512.Count) + { + offset = searchSpaceMinusValueTailLength - Vector512.Count; + + // Find the last unique (which is not equal to ch1) byte + // the algorithm is fine if both 
are equal, just a little bit less efficient + byte ch2Val = Unsafe.Add(ref value, valueTailLength); + int ch1ch2Distance = valueTailLength; + while (ch2Val == value && ch1ch2Distance > 1) + ch2Val = Unsafe.Add(ref value, --ch1ch2Distance); + + Vector512 ch1 = Vector512.Create(value); + Vector512 ch2 = Vector512.Create(ch2Val); + do + { + Vector512 cmpCh1 = Vector512.Equals(ch1, Vector512.LoadUnsafe(ref searchSpace, (nuint)offset)); + Vector512 cmpCh2 = Vector512.Equals(ch2, Vector512.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance))); + Vector512 cmpAnd = (cmpCh1 & cmpCh2).AsByte(); + + // Early out: cmpAnd is all zeros + if (cmpAnd != Vector512.Zero) + { + ulong mask = cmpAnd.ExtractMostSignificantBits(); + do + { + // unlike IndexOf, here we use LZCNT to process matches starting from the end + int highestSetBitIndex = 63 - BitOperations.LeadingZeroCount(mask); + if (valueLength == 2 || // we already matched two bytes + SequenceEqual( + ref Unsafe.Add(ref searchSpace, offset + highestSetBitIndex), + ref value, (nuint)(uint)valueLength)) // The (nuint)-cast is necessary to pick the correct overload + { + return highestSetBitIndex + offset; + } + // Clear the highest set bit. 
+ mask = BitOperations.FlipBit(mask, highestSetBitIndex); + } while (mask != 0); + } + + offset -= Vector512.Count; + if (offset == -Vector512.Count) + return -1; + // Overlap with the current chunk if there is not enough room for the next one + if (offset < 0) + offset = 0; + } while (true) ; + } + else if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector256.Count) { offset = searchSpaceMinusValueTailLength - Vector256.Count; @@ -345,7 +453,6 @@ private static void ThrowMustBeNullTerminatedString() internal static unsafe int IndexOfNullByte(byte* searchSpace) { const int Length = int.MaxValue; - const uint uValue = 0; // Use uint for comparisons to avoid unnecessary 8->32 extensions nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations nuint lengthToExamine = (nuint)(uint)Length; @@ -416,7 +523,120 @@ internal static unsafe int IndexOfNullByte(byte* searchSpace) // We get past SequentialScan only if IsHardwareAccelerated is true; and remain length is greater than Vector length. // However, we still have the redundant check to allow the JIT to see that the code is unreachable and eliminate it when the platform does not // have hardware accelerated. After processing Vector lengths we return to SequentialScan to finish any remaining. - if (Vector256.IsHardwareAccelerated) + if (Vector512.IsHardwareAccelerated) + { + if (offset < (nuint)(uint)Length) + { + if ((((nuint)(uint)searchSpace + offset) & (nuint)(Vector256.Count - 1)) != 0) + { + // Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches + // with no upper bound e.g. String.strlen. + // Start with a check on Vector128 to align to Vector256, before moving to processing Vector256. + // This ensures we do not fault across memory pages while searching for an end of string. 
+ Vector128 search = Vector128.Load(searchSpace + offset); + + // Same method as below + uint matches = Vector128.Equals(Vector128.Zero, search).ExtractMostSignificantBits(); + if (matches == 0) + { + // Zero flags set so no matches + offset += (nuint)Vector128.Count; + } + else + { + // Find bitflag offset of first match and add to current offset + return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches)); + } + } + + if ((((nuint)(uint)searchSpace + offset) & (nuint)(Vector512.Count - 1)) != 0) + { + // Not currently aligned to Vector512 (is aligned to Vector256); this can cause a problem for searches + // with no upper bound e.g. String.strlen. + // Start with a check on Vector256 to align to Vector512, before moving to processing Vector512. + // This ensures we do not fault across memory pages while searching for an end of string. + Vector256 search = Vector256.Load(searchSpace + offset); + + // Same method as below + uint matches = Vector256.Equals(Vector256.Zero, search).ExtractMostSignificantBits(); + if (matches == 0) + { + // Zero flags set so no matches + offset += (nuint)Vector256.Count; + } + else + { + // Find bitflag offset of first match and add to current offset + return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches)); + } + } + lengthToExamine = GetByteVector512SpanLength(offset, Length); + if (lengthToExamine > offset) + { + do + { + Vector512 search = Vector512.Load(searchSpace + offset); + ulong matches = Vector512.Equals(Vector512.Zero, search).ExtractMostSignificantBits(); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. 
+ if (matches == 0) + { + // Zero flags set so no matches + offset += (nuint)Vector512.Count; + continue; + } + + // Find bitflag offset of first match and add to current offset + return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches)); + } while (lengthToExamine > offset); + } + + lengthToExamine = GetByteVector256SpanLength(offset, Length); + if (lengthToExamine > offset) + { + Vector256 search = Vector256.Load(searchSpace + offset); + + // Same method as above + uint matches = Vector256.Equals(Vector256.Zero, search).ExtractMostSignificantBits(); + if (matches == 0) + { + // Zero flags set so no matches + offset += (nuint)Vector256.Count; + } + else + { + // Find bitflag offset of first match and add to current offset + return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches)); + } + } + + lengthToExamine = GetByteVector128SpanLength(offset, Length); + if (lengthToExamine > offset) + { + Vector128 search = Vector128.Load(searchSpace + offset); + + // Same method as above + uint matches = Vector128.Equals(Vector128.Zero, search).ExtractMostSignificantBits(); + if (matches == 0) + { + // Zero flags set so no matches + offset += (nuint)Vector128.Count; + } + else + { + // Find bitflag offset of first match and add to current offset + return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches)); + } + } + + if (offset < (nuint)(uint)Length) + { + lengthToExamine = ((nuint)(uint)Length - offset); + goto SequentialScan; + } + } + } + else if (Vector256.IsHardwareAccelerated) { if (offset < (nuint)(uint)Length) { @@ -634,7 +854,37 @@ public static unsafe bool SequenceEqual(ref byte first, ref byte second, nuint l Vector: if (Vector128.IsHardwareAccelerated) { - if (Vector256.IsHardwareAccelerated && length >= (nuint)Vector256.Count) + if (Vector512.IsHardwareAccelerated && length >= (nuint)Vector512.Count) + { + nuint offset = 0; + nuint lengthToExamine = length - (nuint)Vector512.Count; + // Unsigned, so it shouldn't have overflowed 
larger than length (rather than negative) + Debug.Assert(lengthToExamine < length); + if (lengthToExamine != 0) + { + do + { + if (Vector512.LoadUnsafe(ref first, offset) != + Vector512.LoadUnsafe(ref second, offset)) + { + goto NotEqual; + } + offset += (nuint)Vector512.Count; + } while (lengthToExamine > offset); + } + + // Do final compare as Vector512.Count from end rather than start + if (Vector512.LoadUnsafe(ref first, lengthToExamine) == + Vector512.LoadUnsafe(ref second, lengthToExamine)) + { + // C# compiler inverts this test, making the outer goto the conditional jmp. + goto Equal; + } + + // This becomes a conditional jmp forward to not favor it. + goto NotEqual; + } + else if (Vector256.IsHardwareAccelerated && length >= (nuint)Vector256.Count) { nuint offset = 0; nuint lengthToExamine = length - (nuint)Vector256.Count; @@ -789,6 +1039,47 @@ public static unsafe int SequenceCompareTo(ref byte first, int firstLength, ref if (Vector256.IsHardwareAccelerated) { + if (Vector512.IsHardwareAccelerated && (lengthToExamine >= (nuint)Vector512.Count)) + { + lengthToExamine -= (nuint)Vector512.Count; + ulong matches; + while (lengthToExamine > offset) + { + matches = Vector512.Equals(Vector512.LoadUnsafe(ref first, offset), Vector512.LoadUnsafe(ref second, offset)).ExtractMostSignificantBits(); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. 
+ + // 64 elements in Vector512 so we compare to ulong.MaxValue to check if everything matched + if (matches == ulong.MaxValue) + { + // All matched + offset += (nuint)Vector512.Count; + continue; + } + + goto Difference; + } + // Move to Vector length from end for final compare + offset = lengthToExamine; + // Same method as above + matches = Vector512.Equals(Vector512.LoadUnsafe(ref first, offset), Vector512.LoadUnsafe(ref second, offset)).ExtractMostSignificantBits(); + if (matches == ulong.MaxValue) + { + // All matched + goto Equal; + } + Difference: + // Invert matches to find differences + ulong differences = ~matches; + // Find bitflag offset of first difference and add to current offset + offset += (uint)BitOperations.TrailingZeroCount(differences); + + int result = Unsafe.AddByteOffset(ref first, offset).CompareTo(Unsafe.AddByteOffset(ref second, offset)); + Debug.Assert(result != 0); + + return result; + } + if (lengthToExamine >= (nuint)Vector256.Count) { lengthToExamine -= (nuint)Vector256.Count; @@ -1139,6 +1430,10 @@ private static nuint GetByteVector128SpanLength(nuint offset, int length) private static nuint GetByteVector256SpanLength(nuint offset, int length) => (nuint)(uint)((length - (int)offset) & ~(Vector256.Count - 1)); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static nuint GetByteVector512SpanLength(nuint offset, int length) + => (nuint)(uint)((length - (int)offset) & ~(Vector512.Count - 1)); + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe nuint UnalignedCountVector128(byte* searchSpace) { @@ -1153,8 +1448,45 @@ public static void Reverse(ref byte buf, nuint length) nint remainder = (nint)length; nint offset = 0; - // overlapping has a positive performance benefit around 48 elements - if (Avx2.IsSupported && remainder >= (nint)(Vector256.Count * 1.5)) + if (Vector512.IsHardwareAccelerated && remainder >= Vector512.Count * 2) + { + nint lastOffset = remainder - Vector512.Count; + do + { + // 
Load the values into vectors + Vector512 tempFirst = Vector512.LoadUnsafe(ref buf, (nuint)offset); + Vector512 tempLast = Vector512.LoadUnsafe(ref buf, (nuint)lastOffset); + + // Shuffle to reverse each vector: + // +---------------------------------------------------------------+ + // | A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | + // +---------------------------------------------------------------+ + // ---> + // +---------------------------------------------------------------+ + // | P | O | N | M | L | K | J | I | H | G | F | E | D | C | B | A | + // +---------------------------------------------------------------+ + tempFirst = Vector512.Shuffle(tempFirst, Vector512.Create( + (byte)63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, + 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + tempLast = Vector512.Shuffle(tempLast, Vector512.Create( + (byte)63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, + 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + + // Store the reversed vectors + tempLast.StoreUnsafe(ref buf, (nuint)offset); + tempFirst.StoreUnsafe(ref buf, (nuint)lastOffset); + + offset += Vector512.Count; + lastOffset -= Vector512.Count; + } while (lastOffset >= offset); + + remainder = lastOffset + Vector512.Count - offset; + } + else if (Avx2.IsSupported && remainder >= (nint)(Vector256.Count * 1.5)) { Vector256 reverseMask = Vector256.Create( (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, // first 128-bit lane diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs index 70216de36fd49d..6637d4d2637cb7 100644 --- 
a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs @@ -68,7 +68,74 @@ ref Unsafe.As(ref Unsafe.Add(ref searchSpace, offset + 1)), // Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula // Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285 SEARCH_TWO_CHARS: - if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector256.Count >= 0) + if (Vector512.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector512.Count >= 0) + { + // Find the last unique (which is not equal to ch1) character + // the algorithm is fine if both are equal, just a little bit less efficient + ushort ch2Val = Unsafe.Add(ref value, valueTailLength); + nint ch1ch2Distance = (nint)(uint)valueTailLength; + while (ch2Val == valueHead && ch1ch2Distance > 1) + ch2Val = Unsafe.Add(ref value, --ch1ch2Distance); + + Vector512 ch1 = Vector512.Create((ushort)valueHead); + Vector512 ch2 = Vector512.Create(ch2Val); + + nint searchSpaceMinusValueTailLengthAndVector = + searchSpaceMinusValueTailLength - (nint)Vector512.Count; + + do + { + // Make sure we don't go out of bounds + Debug.Assert(offset + ch1ch2Distance + Vector512.Count <= searchSpaceLength); + + Vector512 cmpCh2 = Vector512.Equals(ch2, Vector512.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance))); + Vector512 cmpCh1 = Vector512.Equals(ch1, Vector512.LoadUnsafe(ref searchSpace, (nuint)offset)); + Vector512 cmpAnd = (cmpCh1 & cmpCh2).AsByte(); + + // Early out: cmpAnd is all zeros + if (cmpAnd != Vector512.Zero) + { + goto CANDIDATE_FOUND; + } + + LOOP_FOOTER: + offset += Vector512.Count; + + if (offset == searchSpaceMinusValueTailLength) + return -1; + + // Overlap with the current chunk for trailing elements + if (offset > searchSpaceMinusValueTailLengthAndVector) + offset = 
searchSpaceMinusValueTailLengthAndVector; + + continue; + + CANDIDATE_FOUND: + ulong mask = cmpAnd.ExtractMostSignificantBits(); + do + { + int bitPos = BitOperations.TrailingZeroCount(mask); + // div by 2 (shr) because we work with 2-byte chars + nint charPos = (nint)((uint)bitPos / 2); + if (valueLength == 2 || // we already matched two chars + SequenceEqual( + ref Unsafe.As(ref Unsafe.Add(ref searchSpace, offset + charPos)), + ref Unsafe.As(ref value), (nuint)(uint)valueLength * 2)) + { + return (int)(offset + charPos); + } + + // Clear two the lowest set bits + if (Bmi1.X64.IsSupported) + mask = Bmi1.X64.ResetLowestSetBit(Bmi1.X64.ResetLowestSetBit(mask)); + else + mask &= ~(ulong)((ulong)0b11 << bitPos); + } while (mask != 0); + goto LOOP_FOOTER; + + } while (true); + } + else if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector256.Count >= 0) { // Find the last unique (which is not equal to ch1) character // the algorithm is fine if both are equal, just a little bit less efficient @@ -253,7 +320,57 @@ ref Unsafe.As(ref Unsafe.Add(ref searchSpace, relativeIndex + 1)), // Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula // Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285 SEARCH_TWO_CHARS: - if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector256.Count) + if (Vector512.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector512.Count) + { + offset = searchSpaceMinusValueTailLength - Vector512.Count; + + // Find the last unique (which is not equal to ch1) char + // the algorithm is fine if both are equal, just a little bit less efficient + char ch2Val = Unsafe.Add(ref value, valueTailLength); + int ch1ch2Distance = valueTailLength; + while (ch2Val == valueHead && ch1ch2Distance > 1) + ch2Val = Unsafe.Add(ref value, --ch1ch2Distance); + + Vector512 ch1 = 
Vector512.Create((ushort)valueHead); + Vector512 ch2 = Vector512.Create((ushort)ch2Val); + + do + { + + Vector512 cmpCh1 = Vector512.Equals(ch1, Vector512.LoadUnsafe(ref searchSpace, (nuint)offset)); + Vector512 cmpCh2 = Vector512.Equals(ch2, Vector512.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance))); + Vector512 cmpAnd = (cmpCh1 & cmpCh2).AsByte(); + + // Early out: cmpAnd is all zeros + if (cmpAnd != Vector512.Zero) + { + ulong mask = cmpAnd.ExtractMostSignificantBits(); + do + { + // unlike IndexOf, here we use LZCNT to process matches starting from the end + int bitPos = 62 - BitOperations.LeadingZeroCount(mask); + int charPos = (int)((uint)bitPos / 2); + + if (valueLength == 2 || // we already matched two chars + SequenceEqual( + ref Unsafe.As(ref Unsafe.Add(ref searchSpace, offset + charPos)), + ref Unsafe.As(ref value), (nuint)(uint)valueLength * 2)) + { + return charPos + offset; + } + mask &= ~(ulong)((ulong)0b11 << bitPos); // clear two highest set bits. + } while (mask != 0); + } + + offset -= Vector512.Count; + if (offset == -Vector512.Count) + return -1; + // Overlap with the current chunk if there is not enough room for the next one + if (offset < 0) + offset = 0; + } while (true); + } + else if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector256.Count) { offset = searchSpaceMinusValueTailLength - Vector256.Count; @@ -478,7 +595,145 @@ public static unsafe int IndexOfNullCharacter(char* searchSpace) // We get past SequentialScan only if IsHardwareAccelerated is true. However, we still have the redundant check to allow // the JIT to see that the code is unreachable and eliminate it when the platform does not have hardware accelerated. 
- if (Vector256.IsHardwareAccelerated) + if (Vector512.IsHardwareAccelerated) + { + if (offset < length) + { + Debug.Assert(length - offset >= Vector128.Count); + if (((nint)(searchSpace + (nint)offset) & (nint)(Vector256.Count - 1)) != 0) + { + // Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches + // with no upper bound e.g. String.wcslen. Start with a check on Vector128 to align to Vector256, + // before moving to processing Vector256. + + // This ensures we do not fault across memory pages + // while searching for an end of string. Specifically that this assumes that the length is either correct + // or that the data is pinned otherwise it may cause an AccessViolation from crossing a page boundary into an + // unowned page. If the search is unbounded (e.g. null terminator in wcslen) and the search value is not found, + // again this will likely cause an AccessViolation. However, correctly bounded searches will return -1 rather + // than ever causing an AV. + Vector128 search = *(Vector128*)(searchSpace + (nuint)offset); + + // Same method as below + uint matches = Vector128.Equals(Vector128.Zero, search).AsByte().ExtractMostSignificantBits(); + if (matches == 0) + { + // Zero flags set so no matches + offset += Vector128.Count; + } + else + { + // Find bitflag offset of first match and add to current offset + return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char))); + } + } + if (((nint)(searchSpace + (nint)offset) & (nint)(Vector512.Count - 1)) != 0) + { + // Not currently aligned to Vector512 (is aligned to Vector256); this can cause a problem for searches + // with no upper bound e.g. String.wcslen. Start with a check on Vector256 to align to Vector512, + // before moving to processing Vector512. + + // This ensures we do not fault across memory pages + // while searching for an end of string. 
Specifically that this assumes that the length is either correct + // or that the data is pinned otherwise it may cause an AccessViolation from crossing a page boundary into an + // unowned page. If the search is unbounded (e.g. null terminator in wcslen) and the search value is not found, + // again this will likely cause an AccessViolation. However, correctly bounded searches will return -1 rather + // than ever causing an AV. + Vector256 search = *(Vector256*)(searchSpace + (nuint)offset); + + // Same method as below + uint matches = Vector256.Equals(Vector256.Zero, search).AsByte().ExtractMostSignificantBits(); + if (matches == 0) + { + // Zero flags set so no matches + offset += Vector256.Count; + } + else + { + // Find bitflag offset of first match and add to current offset + return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char))); + } + } + + lengthToExamine = GetCharVector512SpanLength(offset, length); + if (lengthToExamine > 0) + { + do + { + Debug.Assert(lengthToExamine >= Vector512.Count); + + Vector512 search = *(Vector512*)(searchSpace + (nuint)offset); + ulong matches = Vector512.Equals(Vector512.Zero, search).AsByte().ExtractMostSignificantBits(); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. 
+ if (matches == 0) + { + // Zero flags set so no matches + offset += Vector512.Count; + lengthToExamine -= Vector512.Count; + continue; + } + + // Find bitflag offset of first match and add to current offset, + // flags are in bytes so divide for chars + return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char))); + } while (lengthToExamine > 0); + } + + lengthToExamine = GetCharVector256SpanLength(offset, length); + if (lengthToExamine > 0) + { + Debug.Assert(lengthToExamine >= Vector256.Count); + + Vector256 search = *(Vector256*)(searchSpace + (nuint)offset); + + // Same method as above + uint matches = Vector256.Equals(Vector256.Zero, search).AsByte().ExtractMostSignificantBits(); + if (matches == 0) + { + // Zero flags set so no matches + offset += Vector256.Count; + // Don't need to change lengthToExamine here as we don't use its current value again. + } + else + { + // Find bitflag offset of first match and add to current offset, + // flags are in bytes so divide for chars + return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char))); + } + } + + lengthToExamine = GetCharVector128SpanLength(offset, length); + if (lengthToExamine > 0) + { + Debug.Assert(lengthToExamine >= Vector128.Count); + + Vector128 search = *(Vector128*)(searchSpace + (nuint)offset); + + // Same method as above + uint matches = Vector128.Equals(Vector128.Zero, search).AsByte().ExtractMostSignificantBits(); + if (matches == 0) + { + // Zero flags set so no matches + offset += Vector128.Count; + // Don't need to change lengthToExamine here as we don't use its current value again. 
+ } + else + { + // Find bitflag offset of first match and add to current offset, + // flags are in bytes so divide for chars + return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char))); + } + } + + if (offset < length) + { + lengthToExamine = length - offset; + goto SequentialScan; + } + } + } + else if (Vector256.IsHardwareAccelerated) { if (offset < length) { @@ -707,6 +962,10 @@ private static nint GetCharVector128SpanLength(nint offset, nint length) private static nint GetCharVector256SpanLength(nint offset, nint length) => (length - offset) & ~(Vector256.Count - 1); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static nint GetCharVector512SpanLength(nint offset, nint length) + => (length - offset) & ~(Vector512.Count - 1); + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe nint UnalignedCountVector128(char* searchSpace) { @@ -721,8 +980,42 @@ public static void Reverse(ref char buf, nuint length) nint remainder = (nint)length; nint offset = 0; + if (Vector512.IsHardwareAccelerated && remainder >= Vector512.Count * 2) + { + nint lastOffset = remainder - Vector512.Count; + do + { + ref ushort first = ref Unsafe.As(ref Unsafe.Add(ref buf, offset)); + ref ushort last = ref Unsafe.As(ref Unsafe.Add(ref buf, lastOffset)); + + Vector512 tempFirst = Vector512.LoadUnsafe(ref first); + Vector512 tempLast = Vector512.LoadUnsafe(ref last); + + // Shuffle to reverse each vector: + // +-------------------------------+ + // | A | B | C | D | E | F | G | H | + // +-------------------------------+ + // ---> + // +-------------------------------+ + // | H | G | F | E | D | C | B | A | + // +-------------------------------+ + tempFirst = Vector512.Shuffle(tempFirst, Vector512.Create((ushort)31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + tempLast = Vector512.Shuffle(tempLast, Vector512.Create((ushort)31, 30, 29, 28, 27, 26, 25, 24, 
23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + + // Store the reversed vectors + tempLast.StoreUnsafe(ref first); + tempFirst.StoreUnsafe(ref last); + + offset += Vector512.Count; + lastOffset -= Vector512.Count; + } while (lastOffset >= offset); + + remainder = (lastOffset + Vector512.Count - offset); + } // overlapping has a positive performance benefit around 24 elements - if (Avx2.IsSupported && remainder >= (nint)(Vector256.Count * 1.5)) + else if (Avx2.IsSupported && remainder >= (nint)(Vector256.Count * 1.5)) { Vector256 reverseMask = Vector256.Create( (byte)14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, // first 128-bit lane diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs index 1851d1e26ffefa..fe90ec14fbb635 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs @@ -113,9 +113,59 @@ public static bool Contains(ref short searchSpace, short value, int length) else { ref short currentSearchSpace = ref searchSpace; +#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code + if (Avx512F.IsSupported && length > Vector512.Count) +#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough + { + Vector512 packedValue = Vector512.Create((byte)value); + + if (length > 2 * Vector512.Count) + { + // Process the input in chunks of 64 characters (2 * Vector512). + // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
+ ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - (2 * Vector512.Count)); + + do + { + Vector512 source0 = Vector512.LoadUnsafe(ref currentSearchSpace); + Vector512 source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512.Count); + Vector512 packedSource = PackSources(source0, source1); + Vector512 result = Vector512.Equals(packedValue, packedSource); + + if (result != Vector512.Zero) + { + return true; + } + + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector512.Count); + } + while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd)); + } + + // We have 1-64 characters remaining. Process the first and last vector in the search space. + // They may overlap, but we're only interested in whether any value matched. + { + ref short oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector512.Count); + + ref short firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd) + ?
ref oneVectorAwayFromEnd + : ref currentSearchSpace; + + Vector512 source0 = Vector512.LoadUnsafe(ref firstVector); + Vector512 source1 = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd); + Vector512 packedSource = PackSources(source0, source1); + Vector512 result = Vector512.Equals(packedValue, packedSource); + if (result != Vector512.Zero) + { + return true; + } + } + } #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code - if (Avx2.IsSupported && length > Vector256.Count) + else if (Avx2.IsSupported && length > Vector256.Count) #pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { Vector256 packedValue = Vector256.Create((byte)value); @@ -264,7 +314,60 @@ private static int IndexOf(ref short searchSpace, short value, int len ref short currentSearchSpace = ref searchSpace; #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code - if (Avx2.IsSupported && length > Vector256.Count) + if (Avx512F.IsSupported && length > Vector512.Count) +#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough + { + Vector512 packedValue = Vector512.Create((byte)value); + + if (length > 2 * Vector512.Count) + { + // Process the input in chunks of 64 characters (2 * Vector512). + // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
+ ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - (2 * Vector512.Count)); + + do + { + Vector512 source0 = Vector512.LoadUnsafe(ref currentSearchSpace); + Vector512 source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512.Count); + Vector512 packedSource = PackSources(source0, source1); + Vector512 result = Vector512.Equals(packedValue, packedSource); + result = NegateIfNeeded(result); + + if (result != Vector512.Zero) + { + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + } + + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector512.Count); + } + while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd)); + } + + // We have 1-64 characters remaining. Process the first and last vector in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + { + ref short oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector512.Count); + + ref short firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd) + ?
ref oneVectorAwayFromEnd + : ref currentSearchSpace; + + Vector512 source0 = Vector512.LoadUnsafe(ref firstVector); + Vector512 source1 = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd); + Vector512 packedSource = PackSources(source0, source1); + Vector512 result = Vector512.Equals(packedValue, packedSource); + result = NegateIfNeeded(result); + + if (result != Vector512.Zero) + { + return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result); + } + } + } +#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code + else if (Avx2.IsSupported && length > Vector256.Count) #pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { Vector256 packedValue = Vector256.Create((byte)value); @@ -422,9 +525,62 @@ private static int IndexOfAny(ref short searchSpace, short value0, sho else { ref short currentSearchSpace = ref searchSpace; +#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code + if (Avx512F.IsSupported && length > Vector512.Count) +#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough + { + Vector512 packedValue0 = Vector512.Create((byte)value0); + Vector512 packedValue1 = Vector512.Create((byte)value1); + + if (length > 2 * Vector512.Count) + { + // Process the input in chunks of 64 characters (2 * Vector512). + // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
+ ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - (2 * Vector512.Count)); + + do + { + Vector512 source0 = Vector512.LoadUnsafe(ref currentSearchSpace); + Vector512 source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512.Count); + Vector512 packedSource = PackSources(source0, source1); + Vector512 result = Vector512.Equals(packedValue0, packedSource) | Vector512.Equals(packedValue1, packedSource); + result = NegateIfNeeded(result); + + if (result != Vector512.Zero) + { + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + } + + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector512.Count); + } + while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd)); + } + // We have 1-64 characters remaining. Process the first and last vector in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + { + ref short oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector512.Count); + + ref short firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd) + ?
ref oneVectorAwayFromEnd + : ref currentSearchSpace; + + Vector512 source0 = Vector512.LoadUnsafe(ref firstVector); + Vector512 source1 = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd); + Vector512 packedSource = PackSources(source0, source1); + Vector512 result = Vector512.Equals(packedValue0, packedSource) | Vector512.Equals(packedValue1, packedSource); + result = NegateIfNeeded(result); + + if (result != Vector512.Zero) + { + return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result); + } + } + } #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code - if (Avx2.IsSupported && length > Vector256.Count) + else if (Avx2.IsSupported && length > Vector256.Count) #pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { Vector256 packedValue0 = Vector256.Create((byte)value0); @@ -587,7 +743,62 @@ private static int IndexOfAny(ref short searchSpace, short value0, sho ref short currentSearchSpace = ref searchSpace; #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code - if (Avx2.IsSupported && length > Vector256.Count) + if (Avx512F.IsSupported && length > Vector512.Count) +#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough + { + Vector512 packedValue0 = Vector512.Create((byte)value0); + Vector512 packedValue1 = Vector512.Create((byte)value1); + Vector512 packedValue2 = Vector512.Create((byte)value2); + + if (length > 2 * Vector512.Count) + { + // Process the input in chunks of 64 characters (2 * Vector512). + // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. + // Let the fallback below handle it instead.
This is why the condition is + // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". + ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - (2 * Vector512.Count)); + + do + { + Vector512 source0 = Vector512.LoadUnsafe(ref currentSearchSpace); + Vector512 source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512.Count); + Vector512 packedSource = PackSources(source0, source1); + Vector512 result = Vector512.Equals(packedValue0, packedSource) | Vector512.Equals(packedValue1, packedSource) | Vector512.Equals(packedValue2, packedSource); + result = NegateIfNeeded(result); + + if (result != Vector512.Zero) + { + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + } + + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector512.Count); + } + while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd)); + } + + // We have 1-64 characters remaining. Process the first and last vector in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + { + ref short oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector512.Count); + + ref short firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd) + ?
ref oneVectorAwayFromEnd + : ref currentSearchSpace; + + Vector512 source0 = Vector512.LoadUnsafe(ref firstVector); + Vector512 source1 = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd); + Vector512 packedSource = PackSources(source0, source1); + Vector512 result = Vector512.Equals(packedValue0, packedSource) | Vector512.Equals(packedValue1, packedSource) | Vector512.Equals(packedValue2, packedSource); + result = NegateIfNeeded(result); + + if (result != Vector512.Zero) + { + return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result); + } + } + } +#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code + else if (Avx2.IsSupported && length > Vector256.Count) #pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { Vector256 packedValue0 = Vector256.Create((byte)value0); @@ -734,7 +945,61 @@ private static int IndexOfAnyInRange(ref short searchSpace, short lowI ref short currentSearchSpace = ref searchSpace; #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code - if (Avx2.IsSupported && length > Vector256.Count) + if (Avx512F.IsSupported && length > Vector512.Count) +#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough + { + Vector512 lowVector = Vector512.Create((byte)lowInclusive); + Vector512 rangeVector = Vector512.Create((byte)rangeInclusive); + + if (length > 2 * Vector512.Count) + { + // Process the input in chunks of 64 characters (2 * Vector512). + // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
+ ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - (2 * Vector512.Count)); + + do + { + Vector512 source0 = Vector512.LoadUnsafe(ref currentSearchSpace); + Vector512 source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512.Count); + Vector512 packedSource = PackSources(source0, source1); + Vector512 result = Vector512.LessThanOrEqual(packedSource - lowVector, rangeVector); + result = NegateIfNeeded(result); + + if (result != Vector512.Zero) + { + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + } + + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector512.Count); + } + while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd)); + } + + // We have 1-64 characters remaining. Process the first and last vector in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + { + ref short oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector512.Count); + + ref short firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd) + ?
ref oneVectorAwayFromEnd + : ref currentSearchSpace; + + Vector512 source0 = Vector512.LoadUnsafe(ref firstVector); + Vector512 source1 = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd); + Vector512 packedSource = PackSources(source0, source1); + Vector512 result = Vector512.LessThanOrEqual(packedSource - lowVector, rangeVector); + result = NegateIfNeeded(result); + + if (result != Vector512.Zero) + { + return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result); + } + } + } +#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code + else if (Avx2.IsSupported && length > Vector256.Count) #pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { Vector256 lowVector = Vector256.Create((byte)lowInclusive); @@ -853,6 +1118,19 @@ private static int IndexOfAnyInRange(ref short searchSpace, short lowI return -1; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + [CompExactlyDependsOn(typeof(Avx512F))] + private static Vector512 PackSources(Vector512 source0, Vector512 source1) + { + Debug.Assert(Vector512.IsHardwareAccelerated); + // Pack two vectors of characters into bytes. While the type is Vector512, these are really UInt16 characters. + // X86: Downcast every character using saturation. NOTE(review): the Vector512.Narrow call below may truncate rather than saturate - confirm. + // - Values <= 32767 result in min(value, 255). + // - Values > 32767 result in 0. Because of this we can't accept needles that contain 0.
+ return Vector512.Narrow(source0, source1).AsByte(); + //return Avx512BW.PackUnsignedSaturate(source0, source1).AsByte(); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(Avx2))] private static Vector256 PackSources(Vector256 source0, Vector256 source1) @@ -887,6 +1165,11 @@ private static Vector256 NegateIfNeeded(Vector256 result) where TNegator : struct, SpanHelpers.INegator => typeof(TNegator) == typeof(SpanHelpers.DontNegate) ? result : ~result; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector512 NegateIfNeeded(Vector512 result) + where TNegator : struct, SpanHelpers.INegator => + typeof(TNegator) == typeof(SpanHelpers.DontNegate) ? result : ~result; + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static int ComputeFirstIndex(ref short searchSpace, ref short current, Vector128 equals) { @@ -904,6 +1187,16 @@ private static int ComputeFirstIndex(ref short searchSpace, ref short current, V return index + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current) / sizeof(short)); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + [CompExactlyDependsOn(typeof(Avx512F))] + private static int ComputeFirstIndex(ref short searchSpace, ref short current, Vector512 equals) + { + //ulong notEqualsElements = FixUpPackedVector512Result(equals).ExtractMostSignificantBits(); + ulong notEqualsElements = equals.ExtractMostSignificantBits(); + int index = BitOperations.TrailingZeroCount(notEqualsElements); + return index + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current) / sizeof(short)); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static int ComputeFirstIndexOverlapped(ref short searchSpace, ref short current0, ref short current1, Vector128 equals) { @@ -933,6 +1226,21 @@ private static int ComputeFirstIndexOverlapped(ref short searchSpace, ref short return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current0) / sizeof(short)); } + 
[MethodImpl(MethodImplOptions.AggressiveInlining)] + [CompExactlyDependsOn(typeof(Avx512F))] + private static int ComputeFirstIndexOverlapped(ref short searchSpace, ref short current0, ref short current1, Vector512 equals) + { + ulong notEqualsElements = equals.ExtractMostSignificantBits(); + int offsetInVector = BitOperations.TrailingZeroCount(notEqualsElements); + if (offsetInVector >= Vector512.Count) + { + // We matched within the second vector + current0 = ref current1; + offsetInVector -= Vector512.Count; + } + return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current0) / sizeof(short)); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(Avx2))] private static Vector256 FixUpPackedVector256Result(Vector256 result) diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.T.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.T.cs index 7db0c81e96027d..834e165e798fb1 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.T.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.T.cs @@ -1367,6 +1367,36 @@ internal static bool NonPackedContainsValueType(ref T searchSpace, T value, i offset += 1; } } + else if (Vector512.IsHardwareAccelerated && length >= Vector512.Count) + { + Vector512 equals, values = Vector512.Create(value); + ref T currentSearchSpace = ref searchSpace; + ref T oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, (uint)(length - Vector512.Count)); + + // Loop until either we've finished all elements or there's less than a vector's-worth remaining. 
+ do + { + equals = Vector512.Equals(values, Vector512.LoadUnsafe(ref currentSearchSpace)); + if (equals == Vector512.Zero) + { + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector512.Count); + continue; + } + + return true; + } + while (!Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd)); + + // If any elements remain, process the last vector in the search space. + if ((uint)length % Vector512.Count != 0) + { + equals = Vector512.Equals(values, Vector512.LoadUnsafe(ref oneVectorAwayFromEnd)); + if (equals != Vector512.Zero) + { + return true; + } + } + } else if (Vector256.IsHardwareAccelerated && length >= Vector256.Count) { Vector256 equals, values = Vector256.Create(value); @@ -1527,6 +1557,36 @@ internal static int NonPackedIndexOfValueType(ref TValue searc Found: return (int)(offset); } + else if (Vector512.IsHardwareAccelerated && length >= Vector512.Count) + { + Vector512 equals, values = Vector512.Create(value); + ref TValue currentSearchSpace = ref searchSpace; + ref TValue oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector512.Count); + + // Loop until either we've finished all elements or there's less than a vector's-worth remaining. + do + { + equals = TNegator.NegateIfNeeded(Vector512.Equals(values, Vector512.LoadUnsafe(ref currentSearchSpace))); + if (equals == Vector512.Zero) + { + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector512.Count); + continue; + } + + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, equals); + } + while (!Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd)); + + // If any elements remain, process the last vector in the search space. 
+ if ((uint)length % Vector512.Count != 0) + { + equals = TNegator.NegateIfNeeded(Vector512.Equals(values, Vector512.LoadUnsafe(ref oneVectorAwayFromEnd))); + if (equals != Vector512.Zero) + { + return ComputeFirstIndex(ref searchSpace, ref oneVectorAwayFromEnd, equals); + } + } + } else if (Vector256.IsHardwareAccelerated && length >= Vector256.Count) { Vector256 equals, values = Vector256.Create(value); @@ -1703,6 +1763,38 @@ internal static int NonPackedIndexOfAnyValueType(ref TValue se Found: return (int)(offset); } + else if (Vector512.IsHardwareAccelerated && length >= Vector512.Count) + { + Vector512 equals, current, values0 = Vector512.Create(value0), values1 = Vector512.Create(value1); + ref TValue currentSearchSpace = ref searchSpace; + ref TValue oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector512.Count); + + // Loop until either we've finished all elements or there's less than a vector's-worth remaining. + do + { + current = Vector512.LoadUnsafe(ref currentSearchSpace); + equals = TNegator.NegateIfNeeded(Vector512.Equals(values0, current) | Vector512.Equals(values1, current)); + if (equals == Vector512.Zero) + { + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector512.Count); + continue; + } + + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, equals); + } + while (!Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd)); + + // If any elements remain, process the last vector in the search space. 
+ if ((uint)length % Vector512.Count != 0) + { + current = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd); + equals = TNegator.NegateIfNeeded(Vector512.Equals(values0, current) | Vector512.Equals(values1, current)); + if (equals != Vector512.Zero) + { + return ComputeFirstIndex(ref searchSpace, ref oneVectorAwayFromEnd, equals); + } + } + } else if (Vector256.IsHardwareAccelerated && length >= Vector256.Count) { Vector256 equals, current, values0 = Vector256.Create(value0), values1 = Vector256.Create(value1); @@ -1878,6 +1970,38 @@ internal static int NonPackedIndexOfAnyValueType(ref TValue se Found: return (int)(offset); } + else if (Vector512.IsHardwareAccelerated && length >= Vector512.Count) + { + Vector512 equals, current, values0 = Vector512.Create(value0), values1 = Vector512.Create(value1), values2 = Vector512.Create(value2); + ref TValue currentSearchSpace = ref searchSpace; + ref TValue oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector512.Count); + + // Loop until either we've finished all elements or there's less than a vector's-worth remaining. + do + { + current = Vector512.LoadUnsafe(ref currentSearchSpace); + equals = TNegator.NegateIfNeeded(Vector512.Equals(values0, current) | Vector512.Equals(values1, current) | Vector512.Equals(values2, current)); + if (equals == Vector512.Zero) + { + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector512.Count); + continue; + } + + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, equals); + } + while (!Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd)); + + // If any elements remain, process the last vector in the search space. 
+ if ((uint)length % Vector512.Count != 0) + { + current = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd); + equals = TNegator.NegateIfNeeded(Vector512.Equals(values0, current) | Vector512.Equals(values1, current) | Vector512.Equals(values2, current)); + if (equals != Vector512.Zero) + { + return ComputeFirstIndex(ref searchSpace, ref oneVectorAwayFromEnd, equals); + } + } + } else if (Vector256.IsHardwareAccelerated && length >= Vector256.Count) { Vector256 equals, current, values0 = Vector256.Create(value0), values1 = Vector256.Create(value1), values2 = Vector256.Create(value2); @@ -2002,6 +2126,40 @@ private static int IndexOfAnyValueType(ref TValue searchSpace, Found: return (int)(offset); } + else if (Vector512.IsHardwareAccelerated && length >= Vector512.Count) + { + Vector512 equals, current, values0 = Vector512.Create(value0), values1 = Vector512.Create(value1), values2 = Vector512.Create(value2), values3 = Vector512.Create(value3); + ref TValue currentSearchSpace = ref searchSpace; + ref TValue oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector512.Count); + + // Loop until either we've finished all elements or there's less than a vector's-worth remaining. + do + { + current = Vector512.LoadUnsafe(ref currentSearchSpace); + equals = TNegator.NegateIfNeeded(Vector512.Equals(values0, current) | Vector512.Equals(values1, current) + | Vector512.Equals(values2, current) | Vector512.Equals(values3, current)); + if (equals == Vector512.Zero) + { + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector512.Count); + continue; + } + + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, equals); + } + while (!Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd)); + + // If any elements remain, process the last vector in the search space. 
+ if ((uint)length % Vector512.Count != 0) + { + current = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd); + equals = TNegator.NegateIfNeeded(Vector512.Equals(values0, current) | Vector512.Equals(values1, current) + | Vector512.Equals(values2, current) | Vector512.Equals(values3, current)); + if (equals != Vector512.Zero) + { + return ComputeFirstIndex(ref searchSpace, ref oneVectorAwayFromEnd, equals); + } + } + } else if (Vector256.IsHardwareAccelerated && length >= Vector256.Count) { Vector256 equals, current, values0 = Vector256.Create(value0), values1 = Vector256.Create(value1), values2 = Vector256.Create(value2), values3 = Vector256.Create(value3); @@ -2131,6 +2289,41 @@ private static int IndexOfAnyValueType(ref TValue searchSpace, Found: return (int)(offset); } + else if (Vector512.IsHardwareAccelerated && length >= Vector512.Count) + { + Vector512 equals, current, values0 = Vector512.Create(value0), values1 = Vector512.Create(value1), + values2 = Vector512.Create(value2), values3 = Vector512.Create(value3), values4 = Vector512.Create(value4); + ref TValue currentSearchSpace = ref searchSpace; + ref TValue oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, (uint)(length - Vector512.Count)); + + // Loop until either we've finished all elements or there's less than a vector's-worth remaining. + do + { + current = Vector512.LoadUnsafe(ref currentSearchSpace); + equals = TNegator.NegateIfNeeded(Vector512.Equals(values0, current) | Vector512.Equals(values1, current) | Vector512.Equals(values2, current) + | Vector512.Equals(values3, current) | Vector512.Equals(values4, current)); + if (equals == Vector512.Zero) + { + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector512.Count); + continue; + } + + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, equals); + } + while (!Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd)); + + // If any elements remain, process the last vector in the search space. 
+ if ((uint)length % Vector512.Count != 0) + { + current = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd); + equals = TNegator.NegateIfNeeded(Vector512.Equals(values0, current) | Vector512.Equals(values1, current) | Vector512.Equals(values2, current) + | Vector512.Equals(values3, current) | Vector512.Equals(values4, current)); + if (equals != Vector512.Zero) + { + return ComputeFirstIndex(ref searchSpace, ref oneVectorAwayFromEnd, equals); + } + } + } else if (Vector256.IsHardwareAccelerated && length >= Vector256.Count) { Vector256 equals, current, values0 = Vector256.Create(value0), values1 = Vector256.Create(value1), @@ -2278,6 +2471,34 @@ private static int LastIndexOfValueType(ref TValue searchSpace Found: return (int)(offset); } + else if (Vector512.IsHardwareAccelerated && length >= Vector512.Count) + { + Vector512 equals, values = Vector512.Create(value); + nint offset = length - Vector512.Count; + + // Loop until either we've finished all elements -or- there's one or less than a vector's-worth remaining. + while (offset > 0) + { + equals = TNegator.NegateIfNeeded(Vector512.Equals(values, Vector512.LoadUnsafe(ref searchSpace, (nuint)(offset)))); + + if (equals == Vector512.Zero) + { + offset -= Vector512.Count; + continue; + } + + return ComputeLastIndex(offset, equals); + } + + // Process the first vector in the search space. 
+ + equals = TNegator.NegateIfNeeded(Vector512.Equals(values, Vector512.LoadUnsafe(ref searchSpace))); + + if (equals != Vector512.Zero) + { + return ComputeLastIndex(offset: 0, equals); + } + } else if (Vector256.IsHardwareAccelerated && length >= Vector256.Count) { Vector256 equals, values = Vector256.Create(value); @@ -2431,6 +2652,36 @@ private static int LastIndexOfAnyValueType(ref TValue searchSp Found: return (int)(offset); } + else if (Vector512.IsHardwareAccelerated && length >= Vector512.Count) + { + Vector512 equals, current, values0 = Vector512.Create(value0), values1 = Vector512.Create(value1); + nint offset = length - Vector512.Count; + + // Loop until either we've finished all elements or there's less than a vector's-worth remaining. + while (offset > 0) + { + current = Vector512.LoadUnsafe(ref searchSpace, (nuint)(offset)); + equals = TNegator.NegateIfNeeded(Vector512.Equals(current, values0) | Vector512.Equals(current, values1)); + + if (equals == Vector512.Zero) + { + offset -= Vector512.Count; + continue; + } + + return ComputeLastIndex(offset, equals); + } + + // Process the first vector in the search space. 
+ + current = Vector512.LoadUnsafe(ref searchSpace); + equals = TNegator.NegateIfNeeded(Vector512.Equals(current, values0) | Vector512.Equals(current, values1)); + + if (equals != Vector512.Zero) + { + return ComputeLastIndex(offset: 0, equals); + } + } else if (Vector256.IsHardwareAccelerated && length >= Vector256.Count) { Vector256 equals, current, values0 = Vector256.Create(value0), values1 = Vector256.Create(value1); @@ -2586,6 +2837,36 @@ private static int LastIndexOfAnyValueType(ref TValue searchSp Found: return (int)(offset); } + else if (Vector512.IsHardwareAccelerated && length >= Vector512.Count) + { + Vector512 equals, current, values0 = Vector512.Create(value0), values1 = Vector512.Create(value1), values2 = Vector512.Create(value2); + nint offset = length - Vector512.Count; + + // Loop until either we've finished all elements or there's less than a vector's-worth remaining. + while (offset > 0) + { + current = Vector512.LoadUnsafe(ref searchSpace, (nuint)(offset)); + equals = TNegator.NegateIfNeeded(Vector512.Equals(current, values0) | Vector512.Equals(current, values1) | Vector512.Equals(current, values2)); + + if (equals == Vector512.Zero) + { + offset -= Vector512.Count; + continue; + } + + return ComputeLastIndex(offset, equals); + } + + // Process the first vector in the search space. 
+ + current = Vector512.LoadUnsafe(ref searchSpace); + equals = TNegator.NegateIfNeeded(Vector512.Equals(current, values0) | Vector512.Equals(current, values1) | Vector512.Equals(current, values2)); + + if (equals != Vector512.Zero) + { + return ComputeLastIndex(offset: 0, equals); + } + } else if (Vector256.IsHardwareAccelerated && length >= Vector256.Count) { Vector256 equals, current, values0 = Vector256.Create(value0), values1 = Vector256.Create(value1), values2 = Vector256.Create(value2); @@ -2706,6 +2987,36 @@ private static int LastIndexOfAnyValueType(ref TValue searchSp Found: return (int)(offset); } + else if (Vector512.IsHardwareAccelerated && length >= Vector512.Count) + { + Vector512 equals, current, values0 = Vector512.Create(value0), values1 = Vector512.Create(value1), values2 = Vector512.Create(value2), values3 = Vector512.Create(value3); + nint offset = length - Vector512.Count; + + // Loop until either we've finished all elements or there's less than a vector's-worth remaining. + while (offset > 0) + { + current = Vector512.LoadUnsafe(ref searchSpace, (nuint)(offset)); + equals = TNegator.NegateIfNeeded(Vector512.Equals(current, values0) | Vector512.Equals(current, values1) + | Vector512.Equals(current, values2) | Vector512.Equals(current, values3)); + if (equals == Vector512.Zero) + { + offset -= Vector512.Count; + continue; + } + + return ComputeLastIndex(offset, equals); + } + + // Process the first vector in the search space. 
+ + current = Vector512.LoadUnsafe(ref searchSpace); + equals = TNegator.NegateIfNeeded(Vector512.Equals(current, values0) | Vector512.Equals(current, values1) | Vector512.Equals(current, values2) | Vector512.Equals(current, values3)); + + if (equals != Vector512.Zero) + { + return ComputeLastIndex(offset: 0, equals); + } + } else if (Vector256.IsHardwareAccelerated && length >= Vector256.Count) { Vector256 equals, current, values0 = Vector256.Create(value0), values1 = Vector256.Create(value1), values2 = Vector256.Create(value2), values3 = Vector256.Create(value3); @@ -2836,10 +3147,8 @@ public static void ReplaceValueType(ref T src, ref T dst, T oldValue, T newVa result = Vector128.ConditionalSelect(mask, newValues, original); result.StoreUnsafe(ref dst, lastVectorIndex); } - else + else if (!Vector512.IsHardwareAccelerated || length < (uint)Vector512.Count) { - Debug.Assert(Vector256.IsHardwareAccelerated && Vector256.IsSupported, "Vector256 is not HW-accelerated or not supported"); - nuint lastVectorIndex = length - (uint)Vector256.Count; Vector256 oldValues = Vector256.Create(oldValue); Vector256 newValues = Vector256.Create(newValue); @@ -2861,6 +3170,31 @@ public static void ReplaceValueType(ref T src, ref T dst, T oldValue, T newVa result = Vector256.ConditionalSelect(mask, newValues, original); result.StoreUnsafe(ref dst, lastVectorIndex); } + else + { + Debug.Assert(Vector512.IsHardwareAccelerated && Vector512.IsSupported, "Vector512 is not HW-accelerated or not supported"); + + nuint lastVectorIndex = length - (uint)Vector512.Count; + Vector512 oldValues = Vector512.Create(oldValue); + Vector512 newValues = Vector512.Create(newValue); + Vector512 original, mask, result; + + do + { + original = Vector512.LoadUnsafe(ref src, idx); + mask = Vector512.Equals(oldValues, original); + result = Vector512.ConditionalSelect(mask, newValues, original); + result.StoreUnsafe(ref dst, idx); + + idx += (uint)Vector512.Count; + } + while (idx < lastVectorIndex); + + 
original = Vector512.LoadUnsafe(ref src, lastVectorIndex); + mask = Vector512.Equals(oldValues, original); + result = Vector512.ConditionalSelect(mask, newValues, original); + result.StoreUnsafe(ref dst, lastVectorIndex); + } } } @@ -2911,6 +3245,38 @@ private static int LastIndexOfAnyValueType(ref TValue searchSp offset -= 1; } } + else if (Vector512.IsHardwareAccelerated && length >= Vector512.Count) + { + Vector512 equals, current, values0 = Vector512.Create(value0), values1 = Vector512.Create(value1), + values2 = Vector512.Create(value2), values3 = Vector512.Create(value3), values4 = Vector512.Create(value4); + nint offset = length - Vector512.Count; + + // Loop until either we've finished all elements or there's less than a vector's-worth remaining. + while (offset > 0) + { + current = Vector512.LoadUnsafe(ref searchSpace, (nuint)(offset)); + equals = TNegator.NegateIfNeeded(Vector512.Equals(current, values0) | Vector512.Equals(current, values1) | Vector512.Equals(current, values2) + | Vector512.Equals(current, values3) | Vector512.Equals(current, values4)); + if (equals == Vector512.Zero) + { + offset -= Vector512.Count; + continue; + } + + return ComputeLastIndex(offset, equals); + } + + // Process the first vector in the search space. 
+ + current = Vector512.LoadUnsafe(ref searchSpace); + equals = TNegator.NegateIfNeeded(Vector512.Equals(current, values0) | Vector512.Equals(current, values1) | Vector512.Equals(current, values2) + | Vector512.Equals(current, values3) | Vector512.Equals(current, values4)); + + if (equals != Vector512.Zero) + { + return ComputeLastIndex(offset: 0, equals); + } + } else if (Vector256.IsHardwareAccelerated && length >= Vector256.Count) { Vector256 equals, current, values0 = Vector256.Create(value0), values1 = Vector256.Create(value1), @@ -2996,6 +3362,14 @@ private static unsafe int ComputeFirstIndex(ref T searchSpace, ref T current, return index + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current) / (nuint)sizeof(T)); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe int ComputeFirstIndex(ref T searchSpace, ref T current, Vector512 equals) where T : struct + { + ulong notEqualsElements = equals.ExtractMostSignificantBits(); + int index = BitOperations.TrailingZeroCount(notEqualsElements); + return index + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current) / (nuint)sizeof(T)); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static int ComputeLastIndex(nint offset, Vector128 equals) where T : struct { @@ -3012,11 +3386,20 @@ private static int ComputeLastIndex(nint offset, Vector256 equals) where T return (int)offset + index; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int ComputeLastIndex(nint offset, Vector512 equals) where T : struct + { + ulong notEqualsElements = equals.ExtractMostSignificantBits(); + int index = 63 - BitOperations.LeadingZeroCount(notEqualsElements); // 63 = 64 (bits in UInt64) - 1 (indexing from zero) + return (int)offset + index; + } + internal interface INegator where T : struct { static abstract bool NegateIfNeeded(bool equals); static abstract Vector128 NegateIfNeeded(Vector128 equals); static abstract Vector256 NegateIfNeeded(Vector256 equals); +
static abstract Vector512 NegateIfNeeded(Vector512 equals); } internal readonly struct DontNegate : INegator where T : struct @@ -3024,6 +3407,7 @@ internal interface INegator where T : struct public static bool NegateIfNeeded(bool equals) => equals; public static Vector128 NegateIfNeeded(Vector128 equals) => equals; public static Vector256 NegateIfNeeded(Vector256 equals) => equals; + public static Vector512 NegateIfNeeded(Vector512 equals) => equals; } internal readonly struct Negate : INegator where T : struct @@ -3031,6 +3415,7 @@ internal interface INegator where T : struct public static bool NegateIfNeeded(bool equals) => !equals; public static Vector128 NegateIfNeeded(Vector128 equals) => ~equals; public static Vector256 NegateIfNeeded(Vector256 equals) => ~equals; + public static Vector512 NegateIfNeeded(Vector512 equals) => ~equals; } internal static int IndexOfAnyInRange(ref T searchSpace, T lowInclusive, T highInclusive, int length) @@ -3137,7 +3522,7 @@ internal static int NonPackedIndexOfAnyInRangeUnsignedNumber(ref T return ComputeFirstIndex(ref searchSpace, ref oneVectorAwayFromEnd, inRangeVector); } } - else + else if (!Vector512.IsHardwareAccelerated || length < (uint)Vector512.Count) { Vector256 lowVector = Vector256.Create(lowInclusive); Vector256 rangeVector = Vector256.Create(highInclusive - lowInclusive); @@ -3166,6 +3551,35 @@ internal static int NonPackedIndexOfAnyInRangeUnsignedNumber(ref T return ComputeFirstIndex(ref searchSpace, ref oneVectorAwayFromEnd, inRangeVector); } } + else + { + Vector512 lowVector = Vector512.Create(lowInclusive); + Vector512 rangeVector = Vector512.Create(highInclusive - lowInclusive); + Vector512 inRangeVector; + + ref T current = ref searchSpace; + ref T oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, (uint)(length - Vector512.Count)); + + // Loop until either we've finished all elements or there's less than a vector's-worth remaining. 
+ do + { + inRangeVector = TNegator.NegateIfNeeded(Vector512.LessThanOrEqual(Vector512.LoadUnsafe(ref current) - lowVector, rangeVector)); + if (inRangeVector != Vector512.Zero) + { + return ComputeFirstIndex(ref searchSpace, ref current, inRangeVector); + } + + current = ref Unsafe.Add(ref current, Vector512.Count); + } + while (Unsafe.IsAddressLessThan(ref current, ref oneVectorAwayFromEnd)); + + // Process the last vector in the search space (which might overlap with already processed elements). + inRangeVector = TNegator.NegateIfNeeded(Vector512.LessThanOrEqual(Vector512.LoadUnsafe(ref oneVectorAwayFromEnd) - lowVector, rangeVector)); + if (inRangeVector != Vector512.Zero) + { + return ComputeFirstIndex(ref searchSpace, ref oneVectorAwayFromEnd, inRangeVector); + } + } return -1; } @@ -3253,7 +3667,7 @@ private static int LastIndexOfAnyInRangeUnsignedNumber(ref T search return ComputeLastIndex(offset: 0, inRangeVector); } } - else + else if (!Vector512.IsHardwareAccelerated || length < Vector512.Count) { Vector256 lowVector = Vector256.Create(lowInclusive); Vector256 rangeVector = Vector256.Create(highInclusive - lowInclusive); @@ -3280,6 +3694,33 @@ private static int LastIndexOfAnyInRangeUnsignedNumber(ref T search return ComputeLastIndex(offset: 0, inRangeVector); } } + else + { + Vector512 lowVector = Vector512.Create(lowInclusive); + Vector512 rangeVector = Vector512.Create(highInclusive - lowInclusive); + Vector512 inRangeVector; + + nint offset = length - Vector512.Count; + + // Loop until either we've finished all elements or there's a vector's-worth or less remaining. + while (offset > 0) + { + inRangeVector = TNegator.NegateIfNeeded(Vector512.LessThanOrEqual(Vector512.LoadUnsafe(ref searchSpace, (nuint)offset) - lowVector, rangeVector)); + if (inRangeVector != Vector512.Zero) + { + return ComputeLastIndex(offset, inRangeVector); + } + + offset -= Vector512.Count; + } + + // Process the first vector in the search space.
+ inRangeVector = TNegator.NegateIfNeeded(Vector512.LessThanOrEqual(Vector512.LoadUnsafe(ref searchSpace) - lowVector, rangeVector)); + if (inRangeVector != Vector512.Zero) + { + return ComputeLastIndex(offset: 0, inRangeVector); + } + } return -1; } @@ -3324,7 +3765,48 @@ public static int CountValueType(ref T current, T value, int length) where T if (Vector128.IsHardwareAccelerated && length >= Vector128.Count) { - if (Vector256.IsHardwareAccelerated && length >= Vector256.Count) + if (Vector512.IsHardwareAccelerated && length >= Vector512.Count) + { + Vector512 targetVector = Vector512.Create(value); + ref T oneVectorAwayFromEnd = ref Unsafe.Subtract(ref end, Vector512.Count); + do + { + count += BitOperations.PopCount(Vector512.Equals(Vector512.LoadUnsafe(ref current), targetVector).ExtractMostSignificantBits()); + current = ref Unsafe.Add(ref current, Vector512.Count); + } + while (!Unsafe.IsAddressGreaterThan(ref current, ref oneVectorAwayFromEnd)); + + // If there are just a few elements remaining, then processing these elements by the scalar loop + // is cheaper than doing bitmask + popcount on the full last vector. To avoid complicated type + // based checks, other remainder-count based logic to determine the correct cut-off, for simplicity + // a half-vector size is chosen (based on benchmarks). + uint remaining = (uint)Unsafe.ByteOffset(ref current, ref end) / (uint)Unsafe.SizeOf(); + if (remaining > Vector512.Count / 2) + { + ulong mask = Vector512.Equals(Vector512.LoadUnsafe(ref oneVectorAwayFromEnd), targetVector).ExtractMostSignificantBits(); + + // The mask contains some elements that may be double-checked, so shift them away in order to get the correct pop-count. 
+ uint overlaps = (uint)Vector512.Count - remaining; + mask >>= (int)overlaps; + count += BitOperations.PopCount(mask); + + return count; + } + // TODO : Verify this makes sense + /*if (remaining > Vector256.Count / 2) + { + Vector256 targetVector256 = Vector256.Create(value); + uint mask = Vector256.Equals(Vector256.LoadUnsafe(ref oneVectorAwayFromEnd), targetVector256).ExtractMostSignificantBits(); + + // The mask contains some elements that may be double-checked, so shift them away in order to get the correct pop-count. + uint overlaps = (uint)Vector256.Count - remaining; + mask >>= (int)overlaps; + count += BitOperations.PopCount(mask); + + return count; + }*/ + } + else if (Vector256.IsHardwareAccelerated && length >= Vector256.Count) { Vector256 targetVector = Vector256.Create(value); ref T oneVectorAwayFromEnd = ref Unsafe.Subtract(ref end, Vector256.Count); diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.cs index 8cff9345627f26..a7e5f48d63180d 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.cs @@ -415,7 +415,37 @@ public static void Reverse(ref int buf, nuint length) nint remainder = (nint)length; nint offset = 0; - if (Avx2.IsSupported && remainder >= Vector256.Count * 2) + if (Vector512.IsHardwareAccelerated && remainder >= Vector512.Count * 2) + { + nint lastOffset = remainder - Vector512.Count; + do + { + // Load in values from beginning and end of the array. 
+ Vector512 tempFirst = Vector512.LoadUnsafe(ref buf, (nuint)offset); + Vector512 tempLast = Vector512.LoadUnsafe(ref buf, (nuint)lastOffset); + + // Shuffle to reverse each vector: + // +---------------+ + // | A | B | C | D | + // +---------------+ + // ---> + // +---------------+ + // | D | C | B | A | + // +---------------+ + tempFirst = Vector512.Shuffle(tempFirst, Vector512.Create(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + tempLast = Vector512.Shuffle(tempLast, Vector512.Create(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + + // Store the reversed vectors + tempLast.StoreUnsafe(ref buf, (nuint)offset); + tempFirst.StoreUnsafe(ref buf, (nuint)lastOffset); + + offset += Vector512.Count; + lastOffset -= Vector512.Count; + } while (lastOffset >= offset); + + remainder = lastOffset + Vector512.Count - offset; + } + else if (Avx2.IsSupported && remainder >= Vector256.Count * 2) { nint lastOffset = remainder - Vector256.Count; do @@ -490,7 +520,37 @@ public static void Reverse(ref long buf, nuint length) nint remainder = (nint)length; nint offset = 0; - if (Avx2.IsSupported && remainder >= Vector256.Count * 2) + if (Vector512.IsHardwareAccelerated && remainder >= Vector512.Count * 2) + { + nint lastOffset = remainder - Vector512.Count; + do + { + // Load in values from beginning and end of the array. 
+ Vector512 tempFirst = Vector512.LoadUnsafe(ref buf, (nuint)offset); + Vector512 tempLast = Vector512.LoadUnsafe(ref buf, (nuint)lastOffset); + + // Shuffle to reverse each vector: + // +-------+ + // | A | B | + // +-------+ + // ---> + // +-------+ + // | B | A | + // +-------+ + tempFirst = Vector512.Shuffle(tempFirst, Vector512.Create(7, 6, 5, 4, 3, 2, 1, 0)); + tempLast = Vector512.Shuffle(tempLast, Vector512.Create(7, 6, 5, 4, 3, 2, 1, 0)); + + // Store the reversed vectors + tempLast.StoreUnsafe(ref buf, (nuint)offset); + tempFirst.StoreUnsafe(ref buf, (nuint)lastOffset); + + offset += Vector512.Count; + lastOffset -= Vector512.Count; + } while (lastOffset >= offset); + + remainder = lastOffset + Vector512.Count - offset; + } + else if (Avx2.IsSupported && remainder >= Vector256.Count * 2) { nint lastOffset = remainder - Vector256.Count; do From 28c80e570f14a44aff017d1795f41909d9b2644d Mon Sep 17 00:00:00 2001 From: Deepak Rajendrakumaran Date: Mon, 12 Jun 2023 13:54:45 -0700 Subject: [PATCH 4/7] Using AVX512 directly in packed implementation --- .../src/System/SpanHelpers.Packed.cs | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs index fe90ec14fbb635..0fd03bc656241c 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs @@ -114,7 +114,7 @@ public static bool Contains(ref short searchSpace, short value, int length) { ref short currentSearchSpace = ref searchSpace; #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code - if (Avx512F.IsSupported && length > Vector512.Count) + if (Avx512BW.IsSupported && Vector512.IsHardwareAccelerated && length > 
Vector512.Count) #pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { Vector512 packedValue = Vector512.Create((byte)value); @@ -314,7 +314,7 @@ private static int IndexOf(ref short searchSpace, short value, int len ref short currentSearchSpace = ref searchSpace; #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code - if (Avx512F.IsSupported && length > Vector512.Count) + if (Avx512BW.IsSupported && Vector512.IsHardwareAccelerated && length > Vector512.Count) #pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { Vector512 packedValue = Vector512.Create((byte)value); @@ -526,7 +526,7 @@ private static int IndexOfAny(ref short searchSpace, short value0, sho { ref short currentSearchSpace = ref searchSpace; #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code - if (Avx512F.IsSupported && length > Vector512.Count) + if (Avx512BW.IsSupported && Vector512.IsHardwareAccelerated && length > Vector512.Count) #pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { Vector512 packedValue0 = Vector512.Create((byte)value0); @@ -743,7 +743,7 @@ private static int IndexOfAny(ref short searchSpace, short value0, sho ref short currentSearchSpace = ref searchSpace; #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code - if (Avx512F.IsSupported && length > Vector512.Count) + if (Avx512BW.IsSupported && Vector512.IsHardwareAccelerated && length > Vector512.Count) #pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { Vector512 packedValue0 = Vector512.Create((byte)value0); @@ -945,7 +945,7 @@ private static 
int IndexOfAnyInRange(ref short searchSpace, short lowI ref short currentSearchSpace = ref searchSpace; #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The else condition for this if statement is identical in semantics to Avx2 specific code - if (Avx512F.IsSupported && length > Vector512.Count) + if (Avx512BW.IsSupported && Vector512.IsHardwareAccelerated && length > Vector512.Count) #pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { Vector512 lowVector = Vector512.Create((byte)lowInclusive); @@ -1119,15 +1119,15 @@ private static int IndexOfAnyInRange(ref short searchSpace, short lowI } [MethodImpl(MethodImplOptions.AggressiveInlining)] - [CompExactlyDependsOn(typeof(Avx512F))] + [CompExactlyDependsOn(typeof(Avx512BW))] private static Vector512 PackSources(Vector512 source0, Vector512 source1) { - Debug.Assert(Vector512.IsHardwareAccelerated); + Debug.Assert(Avx512BW.IsSupported); // Pack two vectors of characters into bytes. While the type is Vector256, these are really UInt16 characters. // X86: Downcast every character using saturation. // - Values <= 32767 result in min(value, 255). // - Values > 32767 result in 0. Because of this we can't accept needles that contain 0. 
- return Vector512.Narrow(source0, source1).AsByte(); + return Avx512BW.PackUnsignedSaturate(source0, source1).AsByte(); //return Avx512BW.PackUnsignedSaturate(source0, source1).AsByte(); } @@ -1191,8 +1191,7 @@ private static int ComputeFirstIndex(ref short searchSpace, ref short current, V [CompExactlyDependsOn(typeof(Avx512F))] private static int ComputeFirstIndex(ref short searchSpace, ref short current, Vector512 equals) { - //ulong notEqualsElements = FixUpPackedVector512Result(equals).ExtractMostSignificantBits(); - ulong notEqualsElements = equals.ExtractMostSignificantBits(); + ulong notEqualsElements = FixUpPackedVector512Result(equals).ExtractMostSignificantBits(); int index = BitOperations.TrailingZeroCount(notEqualsElements); return index + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current) / sizeof(short)); } @@ -1252,5 +1251,17 @@ private static Vector256 FixUpPackedVector256Result(Vector256 result // 1, 1, 1, 1, 1, 1, 1, 1, X, X, X, X, X, X, X, X, Y, Y, Y, Y, Y, Y, Y, Y, 2, 2, 2, 2, 2, 2, 2, 2 return Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte(); } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + [CompExactlyDependsOn(typeof(Avx512F))] + private static Vector512 FixUpPackedVector512Result(Vector512 result) + { + Debug.Assert(Avx512F.IsSupported); + // Avx512BW.PackUnsignedSaturate(Vector512.Create((short)1), Vector512.Create((short)2)) will result in + // 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2 + // We want the 64-bit groups that came from the first source (positions 0, 2, 4, 6) followed by those from the second (positions 1, 3, 5, 7) + // 1 (x32), 2 (x32) + return Avx512F.PermuteVar8x64(result.AsInt64(), Vector512.Create((long)0, 2, 4, 6, 1, 3, 5, 7)).AsByte(); + } } } From 26aac45bb6505f33d0d727aa724e66f9b1a946c1 Mon Sep 17 00:00:00 2001 From: Deepak Rajendrakumaran Date: Tue, 13 Jun 2023
13:58:03 -0700 Subject: [PATCH 5/7] Fixing cpuid tests --- .../HardwareIntrinsics/X86/X86Base/CpuId.cs | 33 +++++++++++++++++++ .../HardwareIntrinsics/X86/CpuId.cs | 33 +++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/src/tests/JIT/HardwareIntrinsics/X86/X86Base/CpuId.cs b/src/tests/JIT/HardwareIntrinsics/X86/X86Base/CpuId.cs index 169b8d8925c9de..d02c7f49c03a4e 100644 --- a/src/tests/JIT/HardwareIntrinsics/X86/X86Base/CpuId.cs +++ b/src/tests/JIT/HardwareIntrinsics/X86/X86Base/CpuId.cs @@ -64,6 +64,8 @@ public unsafe static void CpuId() (eax, ebx, ecx, edx) = X86Base.CpuId(0x00000001, 0x00000000); + int xarchCpuInfo = eax; + if (IsBitIncorrect(edx, 25, typeof(Sse), Sse.IsSupported, "SSE", ref isHierarchyDisabled)) { testResult = Fail; @@ -214,6 +216,37 @@ public unsafe static void CpuId() } bool isAvx512HierarchyDisabled = isHierarchyDisabled; + if (isGenuineIntel && !isAvx512HierarchyDisabled) + { + int steppingId = xarchCpuInfo & (int)0b1111; + int model = (xarchCpuInfo >> 4) & (int)0b1111; + int familyID = (xarchCpuInfo >> 8) & (int)0b1111; + int extendedModelID = (xarchCpuInfo >> 16) & (int)0b1111; + if (familyID == 0x06) + { + if (extendedModelID == 0x05) + { + if (model == 0x05) + { + // * Skylake (Server) + // * Cascade Lake + // * Cooper Lake + + isAvx512HierarchyDisabled = true; + } + } + else if (extendedModelID == 0x06) + { + if (model == 0x06) + { + // * Cannon Lake + + isAvx512HierarchyDisabled = true; + } + } + } + + } if (IsBitIncorrect(ecx, 1, typeof(Avx512Vbmi), Avx512Vbmi.IsSupported, "AVX512VBMI", ref isHierarchyDisabled)) { diff --git a/src/tests/readytorun/HardwareIntrinsics/X86/CpuId.cs b/src/tests/readytorun/HardwareIntrinsics/X86/CpuId.cs index 907dfb89561f0f..3861b4dc0291e4 100644 --- a/src/tests/readytorun/HardwareIntrinsics/X86/CpuId.cs +++ b/src/tests/readytorun/HardwareIntrinsics/X86/CpuId.cs @@ -60,6 +60,8 @@ public unsafe static int Main() (eax, ebx, ecx, edx) = X86Base.CpuId(0x00000001, 0x00000000); + int 
xarchCpuInfo = eax; + if (IsBitIncorrect(edx, 25, typeof(Sse), Sse.IsSupported, "SSE", ref isHierarchyDisabled)) { testResult = Fail; @@ -209,6 +211,37 @@ public unsafe static int Main() } bool isAvx512HierarchyDisabled = isHierarchyDisabled; + if (isGenuineIntel && !isAvx512HierarchyDisabled) + { + int steppingId = xarchCpuInfo & (int)0b1111; + int model = (xarchCpuInfo >> 4) & (int)0b1111; + int familyID = (xarchCpuInfo >> 8) & (int)0b1111; + int extendedModelID = (xarchCpuInfo >> 16) & (int)0b1111; + if (familyID == 0x06) + { + if (extendedModelID == 0x05) + { + if (model == 0x05) + { + // * Skylake (Server) + // * Cascade Lake + // * Cooper Lake + + isAvx512HierarchyDisabled = true; + } + } + else if (extendedModelID == 0x06) + { + if (model == 0x06) + { + // * Cannon Lake + + isAvx512HierarchyDisabled = true; + } + } + } + + } if (IsBitIncorrect(ecx, 1, typeof(Avx512Vbmi), Avx512Vbmi.IsSupported, "AVX512VBMI", ref isHierarchyDisabled)) { From 1f3a533c7dedddb14363d9a9dd26bd968131fca8 Mon Sep 17 00:00:00 2001 From: Deepak Rajendrakumaran Date: Fri, 16 Jun 2023 10:34:33 -0700 Subject: [PATCH 6/7] Cleanup + Addressing review comments --- src/coreclr/jit/compiler.cpp | 2 +- src/coreclr/jit/hwintrinsic.cpp | 11 ++++----- .../src/System/Numerics/BitOperations.cs | 2 +- .../src/System/SpanHelpers.Byte.cs | 4 ++-- .../src/System/SpanHelpers.Char.cs | 4 ++-- .../src/System/SpanHelpers.Packed.cs | 23 +++++++++---------- 6 files changed, 22 insertions(+), 24 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 3dd6a270d08cc5..827f7c02abd5cb 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -2298,7 +2298,7 @@ void Compiler::compSetProcessor() instructionSetFlags.AddInstructionSet(InstructionSet_Vector512); - if ((preferredVectorByteLength == 0) && jitFlags.IsSet(JitFlags::JIT_FLAG_VECTOR512_THROTTLING)) + if ((preferredVectorByteLength == 0) && opts.Vector512Throttling()) { // Some architectures can 
experience frequency throttling when // executing 512-bit width instructions. To account for this we set the diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index 66642a4ce97d02..9e1351a0cbad84 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -515,6 +515,11 @@ NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp, } else if (strcmp(className, "Vector512") == 0) { + // If the JitFlags::JIT_FLAG_VECTOR512_THROTTLING flag is set, we do not need to do any further checks. + if (comp->opts.Vector512Throttling()) + { + return NI_IsSupported_False; + } isa = InstructionSet_AVX512F; } } @@ -550,12 +555,6 @@ NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp, if (isIsaSupported && comp->compSupportsHWIntrinsic(isa)) { -#ifdef TARGET_XARCH - if ((isa == InstructionSet_Vector512) && (comp->opts.Vector512Throttling())) - { - return NI_IsSupported_False; - } -#endif if (!comp->IsTargetAbi(CORINFO_NATIVEAOT_ABI) || comp->compExactlyDependsOn(isa)) { return NI_IsSupported_True; diff --git a/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs b/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs index 47b5e7926fa298..c1e9e3b6d96cb6 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs @@ -980,7 +980,7 @@ internal static uint FlipBit(uint value, int index) [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static ulong FlipBit(ulong value, int index) { - return value ^ (ulong)((ulong)1u << index); + return value ^ (1UL << index); } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs index 40bb130a500a93..7804b74f9ae20d 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs +++ 
b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs @@ -551,9 +551,9 @@ internal static unsafe int IndexOfNullByte(byte* searchSpace) if ((((nuint)(uint)searchSpace + offset) & (nuint)(Vector512.Count - 1)) != 0) { - // Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches + // Not currently aligned to Vector512 (is aligned to Vector256); this can cause a problem for searches // with no upper bound e.g. String.strlen. - // Start with a check on Vector128 to align to Vector256, before moving to processing Vector256. + // Start with a check on Vector256 to align to Vector512, before moving to processing Vector256. // This ensures we do not fault across memory pages while searching for an end of string. Vector256 search = Vector256.Load(searchSpace + offset); diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs index 6637d4d2637cb7..9f7b57e40e6c6f 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs @@ -629,8 +629,8 @@ public static unsafe int IndexOfNullCharacter(char* searchSpace) } if (((nint)(searchSpace + (nint)offset) & (nint)(Vector512.Count - 1)) != 0) { - // Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches - // with no upper bound e.g. String.wcslen. Start with a check on Vector128 to align to Vector256, + // Not currently aligned to Vector512 (is aligned to Vector256); this can cause a problem for searches + // with no upper bound e.g. String.wcslen. Start with a check on Vector256 to align to Vector512, // before moving to processing Vector256. 
// This ensures we do not fault across memory pages diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs index 0fd03bc656241c..2f70f00959c1c4 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs @@ -122,7 +122,7 @@ public static bool Contains(ref short searchSpace, short value, int length) if (length > 2 * Vector512.Count) { // Process the input in chunks of 64 characters (2 * Vector512). - // If the input length is a multiple of 32, don't consume the last 16 characters in this loop. + // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. // Let the fallback below handle it instead. This is why the condition is // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - (2 * Vector512.Count)); @@ -321,8 +321,8 @@ private static int IndexOf(ref short searchSpace, short value, int len if (length > 2 * Vector512.Count) { - // Process the input in chunks of 32 characters (2 * Vector256). - // If the input length is a multiple of 32, don't consume the last 16 characters in this loop. + // Process the input in chunks of 64 characters (2 * Vector512). + // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. // Let the fallback below handle it instead. This is why the condition is // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - (2 * Vector512.Count)); @@ -534,8 +534,8 @@ private static int IndexOfAny(ref short searchSpace, short value0, sho if (length > 2 * Vector512.Count) { - // Process the input in chunks of 32 characters (2 * Vector256).
- // If the input length is a multiple of 32, don't consume the last 16 characters in this loop. + // Process the input in chunks of 64 characters (2 * Vector512). + // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. // Let the fallback below handle it instead. This is why the condition is // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - (2 * Vector512.Count)); @@ -752,8 +752,8 @@ private static int IndexOfAny(ref short searchSpace, short value0, sho if (length > 2 * Vector512.Count) { - // Process the input in chunks of 32 characters (2 * Vector256). - // If the input length is a multiple of 32, don't consume the last 16 characters in this loop. + // Process the input in chunks of 64 characters (2 * Vector512). + // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. // Let the fallback below handle it instead. This is why the condition is // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - (2 * Vector512.Count)); @@ -761,7 +761,7 @@ private static int IndexOfAny(ref short searchSpace, short value0, sho do { Vector512 source0 = Vector512.LoadUnsafe(ref currentSearchSpace); - Vector512 source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector256.Count); + Vector512 source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512.Count); Vector512 packedSource = PackSources(source0, source1); Vector512 result = Vector512.Equals(packedValue0, packedSource) | Vector512.Equals(packedValue1, packedSource) | Vector512.Equals(packedValue2, packedSource); result = NegateIfNeeded(result); @@ -953,8 +953,8 @@ private static int IndexOfAnyInRange(ref short searchSpace, short lowI if (length > 2 * Vector512.Count) { - // Process the input in chunks of 32 characters (2 * Vector256). - // If the input length is a multiple of 32, don't consume the last 16 characters in this loop. + // Process the input in chunks of 64 characters (2 * Vector512). + // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. // Let the fallback below handle it instead. This is why the condition is // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - (2 * Vector512.Count)); @@ -1128,7 +1128,6 @@ private static Vector512 PackSources(Vector512 source0, Vector512 32767 result in 0. Because of this we can't accept needles that contain 0.
return Avx512BW.PackUnsignedSaturate(source0, source1).AsByte(); - //return Avx512BW.PackUnsignedSaturate(source0, source1).AsByte(); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -1229,7 +1228,7 @@ private static int ComputeFirstIndexOverlapped(ref short searchSpace, ref short [CompExactlyDependsOn(typeof(Avx512F))] private static int ComputeFirstIndexOverlapped(ref short searchSpace, ref short current0, ref short current1, Vector512 equals) { - ulong notEqualsElements = equals.ExtractMostSignificantBits(); + ulong notEqualsElements = FixUpPackedVector512Result(equals).ExtractMostSignificantBits(); int offsetInVector = BitOperations.TrailingZeroCount(notEqualsElements); if (offsetInVector >= Vector512.Count) { From 2f6889a646eae77e6556c777e519304054aa6273 Mon Sep 17 00:00:00 2001 From: Deepak Rajendrakumaran Date: Wed, 21 Jun 2023 09:28:30 -0700 Subject: [PATCH 7/7] Address review comments. --- .../src/System/Numerics/BitOperations.cs | 2 +- .../src/System/SpanHelpers.T.cs | 13 ------------- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs b/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs index c1e9e3b6d96cb6..db6a16cd29beb6 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs @@ -980,7 +980,7 @@ internal static uint FlipBit(uint value, int index) [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static ulong FlipBit(ulong value, int index) { - return value ^ (1UL << index); + return value ^ (1ul << index); } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.T.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.T.cs index 834e165e798fb1..0806bbc95daa33 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.T.cs +++ 
b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.T.cs @@ -3792,19 +3792,6 @@ public static int CountValueType(ref T current, T value, int length) where T return count; } - // TODO : Verify this makes sense - /*if (remaining > Vector256.Count / 2) - { - Vector256 targetVector256 = Vector256.Create(value); - uint mask = Vector256.Equals(Vector256.LoadUnsafe(ref oneVectorAwayFromEnd), targetVector256).ExtractMostSignificantBits(); - - // The mask contains some elements that may be double-checked, so shift them away in order to get the correct pop-count. - uint overlaps = (uint)Vector256.Count - remaining; - mask >>= (int)overlaps; - count += BitOperations.PopCount(mask); - - return count; - }*/ } else if (Vector256.IsHardwareAccelerated && length >= Vector256.Count) {