diff --git a/src/ImageSharp/Compression/Zlib/Adler32.cs b/src/ImageSharp/Compression/Zlib/Adler32.cs index 7eb3f4516f..1f3cbbca64 100644 --- a/src/ImageSharp/Compression/Zlib/Adler32.cs +++ b/src/ImageSharp/Compression/Zlib/Adler32.cs @@ -3,6 +3,7 @@ using System; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; #if SUPPORTS_RUNTIME_INTRINSICS using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; @@ -31,6 +32,8 @@ internal static class Adler32 #if SUPPORTS_RUNTIME_INTRINSICS private const int MinBufferSize = 64; + private const int BlockSize = 1 << 5; + // The C# compiler emits this as a compile-time constant embedded in the PE file. private static ReadOnlySpan Tap1Tap2 => new byte[] { @@ -63,6 +66,11 @@ public static uint Calculate(uint adler, ReadOnlySpan buffer) } #if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported && buffer.Length >= MinBufferSize) + { + return CalculateAvx2(adler, buffer); + } + if (Ssse3.IsSupported && buffer.Length >= MinBufferSize) { return CalculateSse(adler, buffer); @@ -83,19 +91,15 @@ private static unsafe uint CalculateSse(uint adler, ReadOnlySpan buffer) uint s2 = (adler >> 16) & 0xFFFF; // Process the data in blocks. - const int BLOCK_SIZE = 1 << 5; - uint length = (uint)buffer.Length; - uint blocks = length / BLOCK_SIZE; - length -= blocks * BLOCK_SIZE; + uint blocks = length / BlockSize; + length -= blocks * BlockSize; - int index = 0; - fixed (byte* bufferPtr = buffer) + fixed (byte* bufferPtr = &MemoryMarshal.GetReference(buffer)) { - fixed (byte* tapPtr = Tap1Tap2) + fixed (byte* tapPtr = &MemoryMarshal.GetReference(Tap1Tap2)) { - index += (int)blocks * BLOCK_SIZE; - var localBufferPtr = bufferPtr; + byte* localBufferPtr = bufferPtr; // _mm_setr_epi8 on x86 Vector128 tap1 = Sse2.LoadVector128((sbyte*)tapPtr); @@ -105,7 +109,7 @@ private static unsafe uint CalculateSse(uint adler, ReadOnlySpan buffer) while (blocks > 0) { - uint n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ + uint n = NMAX / BlockSize; /* The NMAX constraint. */ if (n > blocks) { n = blocks; @@ -138,7 +142,7 @@ private static unsafe uint CalculateSse(uint adler, ReadOnlySpan buffer) Vector128 mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2); v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32()); - localBufferPtr += BLOCK_SIZE; + localBufferPtr += BlockSize; } while (--n > 0); @@ -164,45 +168,127 @@ private static unsafe uint CalculateSse(uint adler, ReadOnlySpan buffer) if (length > 0) { - if (length >= 16) - { - s2 += s1 += localBufferPtr[0]; - s2 += s1 += localBufferPtr[1]; - s2 += s1 += localBufferPtr[2]; - s2 += s1 += localBufferPtr[3]; - s2 += s1 += localBufferPtr[4]; - s2 += s1 += localBufferPtr[5]; - s2 += s1 += localBufferPtr[6]; - s2 += s1 += localBufferPtr[7]; - s2 += s1 += localBufferPtr[8]; - s2 += s1 += localBufferPtr[9]; - s2 += s1 += localBufferPtr[10]; - s2 += s1 += localBufferPtr[11]; - s2 += s1 += localBufferPtr[12]; - s2 += s1 += localBufferPtr[13]; - s2 += s1 += localBufferPtr[14]; - s2 += s1 += localBufferPtr[15]; - - localBufferPtr += 16; - length -= 16; - } + HandleLeftOver(localBufferPtr, length, ref s1, ref s2); + } - while (length-- > 0) - { - s2 += s1 += *localBufferPtr++; - } + return s1 | (s2 << 16); + } + } + } - if (s1 >= BASE) - { - s1 -= BASE; - } + // Based on: https://github.com/zlib-ng/zlib-ng/blob/develop/arch/x86/adler32_avx2.c + [MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)] + public static unsafe uint CalculateAvx2(uint adler, ReadOnlySpan buffer) + { + uint s1 = adler & 0xFFFF; + uint s2 = (adler >> 16) & 0xFFFF; + uint length = (uint)buffer.Length; - s2 %= BASE; + fixed (byte* bufferPtr = &MemoryMarshal.GetReference(buffer)) + { + byte* localBufferPtr = bufferPtr; + + Vector256 zero = Vector256.Zero; + var dot3v = Vector256.Create((short)1); + var dot2v = Vector256.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + + // Process n blocks of data. At most NMAX data bytes can be + // processed before s2 must be reduced modulo BASE. + var vs1 = Vector256.CreateScalar(s1); + var vs2 = Vector256.CreateScalar(s2); + + while (length >= 32) + { + int k = length < NMAX ? (int)length : (int)NMAX; + k -= k % 32; + length -= (uint)k; + + Vector256 vs10 = vs1; + Vector256 vs3 = Vector256.Zero; + + while (k >= 32) + { + // Load 32 input bytes. + Vector256 block = Avx.LoadVector256(localBufferPtr); + + // Sum of abs diff, resulting in 2 x int32's + Vector256 vs1sad = Avx2.SumAbsoluteDifferences(block, zero); + + vs1 = Avx2.Add(vs1, vs1sad.AsUInt32()); + vs3 = Avx2.Add(vs3, vs10); + + // sum 32 uint8s to 16 shorts. + Vector256 vshortsum2 = Avx2.MultiplyAddAdjacent(block, dot2v); + + // sum 16 shorts to 8 uint32s. + Vector256 vsum2 = Avx2.MultiplyAddAdjacent(vshortsum2, dot3v); + + vs2 = Avx2.Add(vsum2.AsUInt32(), vs2); + vs10 = vs1; + + localBufferPtr += BlockSize; + k -= 32; } - return s1 | (s2 << 16); + // Defer the multiplication with 32 to outside of the loop. + vs3 = Avx2.ShiftLeftLogical(vs3, 5); + vs2 = Avx2.Add(vs2, vs3); + + s1 = (uint)Numerics.EvenReduceSum(vs1.AsInt32()); + s2 = (uint)Numerics.ReduceSum(vs2.AsInt32()); + + s1 %= BASE; + s2 %= BASE; + + vs1 = Vector256.CreateScalar(s1); + vs2 = Vector256.CreateScalar(s2); } + + if (length > 0) + { + HandleLeftOver(localBufferPtr, length, ref s1, ref s2); + } + + return s1 | (s2 << 16); + } + } + + private static unsafe void HandleLeftOver(byte* localBufferPtr, uint length, ref uint s1, ref uint s2) + { + if (length >= 16) + { + s2 += s1 += localBufferPtr[0]; + s2 += s1 += localBufferPtr[1]; + s2 += s1 += localBufferPtr[2]; + s2 += s1 += localBufferPtr[3]; + s2 += s1 += localBufferPtr[4]; + s2 += s1 += localBufferPtr[5]; + s2 += s1 += localBufferPtr[6]; + s2 += s1 += localBufferPtr[7]; + s2 += s1 += localBufferPtr[8]; + s2 += s1 += localBufferPtr[9]; + s2 += s1 += localBufferPtr[10]; + s2 += s1 += localBufferPtr[11]; + s2 += s1 += localBufferPtr[12]; + s2 += s1 += localBufferPtr[13]; + s2 += s1 += localBufferPtr[14]; + s2 += s1 += localBufferPtr[15]; + + localBufferPtr += 16; + length -= 16; } + + while (length-- > 0) + { + s2 += s1 += *localBufferPtr++; + } + + if (s1 >= BASE) + { + s1 -= BASE; + } + + s2 %= BASE; } #endif diff --git a/tests/ImageSharp.Tests/Formats/Png/Adler32Tests.cs b/tests/ImageSharp.Tests/Formats/Png/Adler32Tests.cs index 0886bd84dc..77f2b76634 100644 --- a/tests/ImageSharp.Tests/Formats/Png/Adler32Tests.cs +++ b/tests/ImageSharp.Tests/Formats/Png/Adler32Tests.cs @@ -3,6 +3,7 @@ using System; using SixLabors.ImageSharp.Compression.Zlib; +using SixLabors.ImageSharp.Tests.TestUtilities; using Xunit; using SharpAdler32 = ICSharpCode.SharpZipLib.Checksum.Adler32; @@ -15,10 +16,7 @@ public class Adler32Tests [InlineData(0)] [InlineData(1)] [InlineData(2)] - public void ReturnsCorrectWhenEmpty(uint input) - { - Assert.Equal(input, Adler32.Calculate(input, default)); - } + public void CalculateAdler_ReturnsCorrectWhenEmpty(uint input) => Assert.Equal(input, Adler32.Calculate(input, default)); [Theory] [InlineData(0)] @@ -28,24 +26,49 @@ public void ReturnsCorrectWhenEmpty(uint input) [InlineData(1024 + 15)] [InlineData(2034)] [InlineData(4096)] - public void MatchesReference(int length) + public void CalculateAdler_MatchesReference(int length) => CalculateAdlerAndCompareToReference(length); + + private static void CalculateAdlerAndCompareToReference(int length) { - var data = GetBuffer(length); + // arrange + byte[] data = GetBuffer(length); var adler = new SharpAdler32(); adler.Update(data); - long expected = adler.Value; + + // act long actual = Adler32.Calculate(data); + // assert Assert.Equal(expected, actual); } private static byte[] GetBuffer(int length) { - var data = new byte[length]; + byte[] data = new byte[length]; new Random(1).NextBytes(data); return data; } + +#if SUPPORTS_RUNTIME_INTRINSICS + [Fact] + public void RunCalculateAdlerTest_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCalculateAdlerTest, HwIntrinsics.AllowAll); + + [Fact] + public void RunCalculateAdlerTest_WithAvxDisabled_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCalculateAdlerTest, HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); + + [Fact] + public void RunCalculateAdlerTest_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCalculateAdlerTest, HwIntrinsics.DisableHWIntrinsic); + + private static void RunCalculateAdlerTest() + { + int[] testData = { 0, 8, 215, 1024, 1024 + 15, 2034, 4096 }; + for (int i = 0; i < testData.Length; i++) + { + CalculateAdlerAndCompareToReference(testData[i]); + } + } +#endif } }