From 27fc3b01c6678ed5c96e5c137250028b779e449c Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 21 Feb 2022 22:36:47 +0100 Subject: [PATCH 1/5] Add AVX2 version of adler --- src/ImageSharp/Compression/Zlib/Adler32.cs | 155 ++++++++++++++++----- 1 file changed, 121 insertions(+), 34 deletions(-) diff --git a/src/ImageSharp/Compression/Zlib/Adler32.cs b/src/ImageSharp/Compression/Zlib/Adler32.cs index 7eb3f4516f..1f3b7e2a23 100644 --- a/src/ImageSharp/Compression/Zlib/Adler32.cs +++ b/src/ImageSharp/Compression/Zlib/Adler32.cs @@ -31,6 +31,8 @@ internal static class Adler32 #if SUPPORTS_RUNTIME_INTRINSICS private const int MinBufferSize = 64; + private const int BLOCK_SIZE = 1 << 5; + // The C# compiler emits this as a compile-time constant embedded in the PE file. private static ReadOnlySpan Tap1Tap2 => new byte[] { @@ -63,6 +65,11 @@ public static uint Calculate(uint adler, ReadOnlySpan buffer) } #if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported && buffer.Length >= MinBufferSize) + { + return CalculateAvx2(adler, buffer); + } + if (Ssse3.IsSupported && buffer.Length >= MinBufferSize) { return CalculateSse(adler, buffer); @@ -83,8 +90,6 @@ private static unsafe uint CalculateSse(uint adler, ReadOnlySpan buffer) uint s2 = (adler >> 16) & 0xFFFF; // Process the data in blocks. - const int BLOCK_SIZE = 1 << 5; - uint length = (uint)buffer.Length; uint blocks = length / BLOCK_SIZE; length -= blocks * BLOCK_SIZE; @@ -164,45 +169,127 @@ private static unsafe uint CalculateSse(uint adler, ReadOnlySpan buffer) if (length > 0) { - if (length >= 16) - { - s2 += s1 += localBufferPtr[0]; - s2 += s1 += localBufferPtr[1]; - s2 += s1 += localBufferPtr[2]; - s2 += s1 += localBufferPtr[3]; - s2 += s1 += localBufferPtr[4]; - s2 += s1 += localBufferPtr[5]; - s2 += s1 += localBufferPtr[6]; - s2 += s1 += localBufferPtr[7]; - s2 += s1 += localBufferPtr[8]; - s2 += s1 += localBufferPtr[9]; - s2 += s1 += localBufferPtr[10]; - s2 += s1 += localBufferPtr[11]; - s2 += s1 += localBufferPtr[12]; - s2 += s1 += localBufferPtr[13]; - s2 += s1 += localBufferPtr[14]; - s2 += s1 += localBufferPtr[15]; - - localBufferPtr += 16; - length -= 16; - } + HandleLeftOver(localBufferPtr, length, ref s1, ref s2); + } - while (length-- > 0) - { - s2 += s1 += *localBufferPtr++; - } + return s1 | (s2 << 16); + } + } + } - if (s1 >= BASE) - { - s1 -= BASE; - } + // Based on: https://github.com/zlib-ng/zlib-ng/blob/develop/arch/x86/adler32_avx2.c + [MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)] + public static unsafe uint CalculateAvx2(uint adler, ReadOnlySpan buffer) + { + uint s1 = adler & 0xFFFF; + uint s2 = (adler >> 16) & 0xFFFF; + uint length = (uint)buffer.Length; - s2 %= BASE; + fixed (byte* bufferPtr = buffer) + { + byte* localBufferPtr = bufferPtr; + + Vector256 zero = Vector256.Zero; + var dot3v = Vector256.Create((short)1); + var dot2v = Vector256.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + + // Process n blocks of data. At most NMAX data bytes can be + // processed before s2 must be reduced modulo BASE. + var vs1 = Vector256.CreateScalar(s1); + var vs2 = Vector256.CreateScalar(s2); + + while (length >= 32) + { + int k = length < NMAX ? (int)length : (int)NMAX; + k -= k % 32; + length -= (uint)k; + + Vector256 vs10 = vs1; + Vector256 vs3 = Vector256.Zero; + + while (k >= 32) + { + // Load 32 input bytes. + Vector256 block = Avx.LoadVector256(localBufferPtr); + + // Sum of abs diff, resulting in 2 x int32's + Vector256 vs1sad = Avx2.SumAbsoluteDifferences(block, zero); + + vs1 = Avx2.Add(vs1, vs1sad.AsUInt32()); + vs3 = Avx2.Add(vs3, vs10); + + // sum 32 uint8s to 16 shorts. + Vector256 vshortsum2 = Avx2.MultiplyAddAdjacent(block, dot2v); + + // sum 16 shorts to 8 uint32s. + Vector256 vsum2 = Avx2.MultiplyAddAdjacent(vshortsum2, dot3v); + + vs2 = Avx2.Add(vsum2.AsUInt32(), vs2); + vs10 = vs1; + + localBufferPtr += BLOCK_SIZE; + k -= 32; } - return s1 | (s2 << 16); + // Defer the multiplication with 32 to outside of the loop. + vs3 = Avx2.ShiftLeftLogical(vs3, 5); + vs2 = Avx2.Add(vs2, vs3); + + s1 = (uint)Numerics.EvenReduceSum(vs1.AsInt32()); + s2 = (uint)Numerics.ReduceSum(vs2.AsInt32()); + + s1 %= BASE; + s2 %= BASE; + + vs1 = Vector256.CreateScalar(s1); + vs2 = Vector256.CreateScalar(s2); } + + if (length > 0) + { + HandleLeftOver(localBufferPtr, length, ref s1, ref s2); + } + + return s1 | (s2 << 16); + } + } + + private static unsafe void HandleLeftOver(byte* localBufferPtr, uint length, ref uint s1, ref uint s2) + { + if (length >= 16) + { + s2 += s1 += localBufferPtr[0]; + s2 += s1 += localBufferPtr[1]; + s2 += s1 += localBufferPtr[2]; + s2 += s1 += localBufferPtr[3]; + s2 += s1 += localBufferPtr[4]; + s2 += s1 += localBufferPtr[5]; + s2 += s1 += localBufferPtr[6]; + s2 += s1 += localBufferPtr[7]; + s2 += s1 += localBufferPtr[8]; + s2 += s1 += localBufferPtr[9]; + s2 += s1 += localBufferPtr[10]; + s2 += s1 += localBufferPtr[11]; + s2 += s1 += localBufferPtr[12]; + s2 += s1 += localBufferPtr[13]; + s2 += s1 += localBufferPtr[14]; + s2 += s1 += localBufferPtr[15]; + + localBufferPtr += 16; + length -= 16; } + + while (length-- > 0) + { + s2 += s1 += *localBufferPtr++; + } + + if (s1 >= BASE) + { + s1 -= BASE; + } + + s2 %= BASE; } #endif From cbf46759194b228660535b8d21122420337e88ed Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Sun, 20 Feb 2022 14:23:57 +0100 Subject: [PATCH 2/5] Add adler tests with and without intrinsics --- .../Formats/Png/Adler32Tests.cs | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/tests/ImageSharp.Tests/Formats/Png/Adler32Tests.cs b/tests/ImageSharp.Tests/Formats/Png/Adler32Tests.cs index 0886bd84dc..97d9e904e1 100644 --- a/tests/ImageSharp.Tests/Formats/Png/Adler32Tests.cs +++ b/tests/ImageSharp.Tests/Formats/Png/Adler32Tests.cs @@ -3,6 +3,7 @@ using System; using SixLabors.ImageSharp.Compression.Zlib; +using SixLabors.ImageSharp.Tests.TestUtilities; using Xunit; using SharpAdler32 = ICSharpCode.SharpZipLib.Checksum.Adler32; @@ -15,10 +16,7 @@ public class Adler32Tests [InlineData(0)] [InlineData(1)] [InlineData(2)] - public void ReturnsCorrectWhenEmpty(uint input) - { - Assert.Equal(input, Adler32.Calculate(input, default)); - } + public void CalculateAdler_ReturnsCorrectWhenEmpty(uint input) => Assert.Equal(input, Adler32.Calculate(input, default)); [Theory] [InlineData(0)] @@ -28,24 +26,46 @@ public void ReturnsCorrectWhenEmpty(uint input) [InlineData(1024 + 15)] [InlineData(2034)] [InlineData(4096)] - public void MatchesReference(int length) + public void CalculateAdler_MatchesReference(int length) => CalculateAdlerAndCompareToReference(length); + + private static void CalculateAdlerAndCompareToReference(int length) { - var data = GetBuffer(length); + // arrange + byte[] data = GetBuffer(length); var adler = new SharpAdler32(); adler.Update(data); - long expected = adler.Value; + + // act long actual = Adler32.Calculate(data); + // assert Assert.Equal(expected, actual); } private static byte[] GetBuffer(int length) { - var data = new byte[length]; + byte[] data = new byte[length]; new Random(1).NextBytes(data); return data; } + +#if SUPPORTS_RUNTIME_INTRINSICS + [Fact] + public void RunCalculateAdlerTest_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCalculateAdlerTest, HwIntrinsics.AllowAll); + + [Fact] + public void RunCalculateAdlerTest_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCalculateAdlerTest, HwIntrinsics.DisableHWIntrinsic); + + private static void RunCalculateAdlerTest() + { + int[] testData = { 0, 8, 215, 1024, 1024 + 15, 2034, 4096 }; + for (int i = 0; i < testData.Length; i++) + { + CalculateAdlerAndCompareToReference(testData[i]); + } + } +#endif } } From 6e6673fcb9d38586394cd56730bd1da879e1b9d4 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 21 Feb 2022 22:55:21 +0100 Subject: [PATCH 3/5] Add test with AVX disabled --- tests/ImageSharp.Tests/Formats/Png/Adler32Tests.cs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/ImageSharp.Tests/Formats/Png/Adler32Tests.cs b/tests/ImageSharp.Tests/Formats/Png/Adler32Tests.cs index 97d9e904e1..77f2b76634 100644 --- a/tests/ImageSharp.Tests/Formats/Png/Adler32Tests.cs +++ b/tests/ImageSharp.Tests/Formats/Png/Adler32Tests.cs @@ -55,6 +55,9 @@ private static byte[] GetBuffer(int length) [Fact] public void RunCalculateAdlerTest_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCalculateAdlerTest, HwIntrinsics.AllowAll); + [Fact] + public void RunCalculateAdlerTest_WithAvxDisabled_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCalculateAdlerTest, HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); + [Fact] public void RunCalculateAdlerTest_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCalculateAdlerTest, HwIntrinsics.DisableHWIntrinsic); From d4fc063bbc1368d4162c02bdf0121f982959b47e Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 21 Feb 2022 22:57:20 +0100 Subject: [PATCH 4/5] Rename BLOCK_SIZE to BlockSize --- src/ImageSharp/Compression/Zlib/Adler32.cs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/ImageSharp/Compression/Zlib/Adler32.cs b/src/ImageSharp/Compression/Zlib/Adler32.cs index 1f3b7e2a23..b13e2e4211 100644 --- a/src/ImageSharp/Compression/Zlib/Adler32.cs +++ b/src/ImageSharp/Compression/Zlib/Adler32.cs @@ -31,7 +31,7 @@ internal static class Adler32 #if SUPPORTS_RUNTIME_INTRINSICS private const int MinBufferSize = 64; - private const int BLOCK_SIZE = 1 << 5; + private const int BlockSize = 1 << 5; // The C# compiler emits this as a compile-time constant embedded in the PE file. private static ReadOnlySpan Tap1Tap2 => new byte[] @@ -91,15 +91,15 @@ private static unsafe uint CalculateSse(uint adler, ReadOnlySpan buffer) // Process the data in blocks. uint length = (uint)buffer.Length; - uint blocks = length / BLOCK_SIZE; - length -= blocks * BLOCK_SIZE; + uint blocks = length / BlockSize; + length -= blocks * BlockSize; int index = 0; fixed (byte* bufferPtr = buffer) { fixed (byte* tapPtr = Tap1Tap2) { - index += (int)blocks * BLOCK_SIZE; + index += (int)blocks * BlockSize; var localBufferPtr = bufferPtr; // _mm_setr_epi8 on x86 @@ -110,7 +110,7 @@ private static unsafe uint CalculateSse(uint adler, ReadOnlySpan buffer) while (blocks > 0) { - uint n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ + uint n = NMAX / BlockSize; /* The NMAX constraint. */ if (n > blocks) { n = blocks; @@ -143,7 +143,7 @@ private static unsafe uint CalculateSse(uint adler, ReadOnlySpan buffer) Vector128 mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2); v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32()); - localBufferPtr += BLOCK_SIZE; + localBufferPtr += BlockSize; } while (--n > 0); @@ -227,7 +227,7 @@ public static unsafe uint CalculateAvx2(uint adler, ReadOnlySpan buffer) vs2 = Avx2.Add(vsum2.AsUInt32(), vs2); vs10 = vs1; - localBufferPtr += BLOCK_SIZE; + localBufferPtr += BlockSize; k -= 32; } From 09b2cdb83aab0a889dbde8ec2453d605a1be1725 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Tue, 22 Feb 2022 15:12:00 +0100 Subject: [PATCH 5/5] Review changes from gfoidl --- src/ImageSharp/Compression/Zlib/Adler32.cs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/ImageSharp/Compression/Zlib/Adler32.cs b/src/ImageSharp/Compression/Zlib/Adler32.cs index b13e2e4211..1f3cbbca64 100644 --- a/src/ImageSharp/Compression/Zlib/Adler32.cs +++ b/src/ImageSharp/Compression/Zlib/Adler32.cs @@ -3,6 +3,7 @@ using System; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; #if SUPPORTS_RUNTIME_INTRINSICS using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; @@ -94,13 +95,11 @@ private static unsafe uint CalculateSse(uint adler, ReadOnlySpan buffer) uint blocks = length / BlockSize; length -= blocks * BlockSize; - int index = 0; - fixed (byte* bufferPtr = buffer) + fixed (byte* bufferPtr = &MemoryMarshal.GetReference(buffer)) { - fixed (byte* tapPtr = Tap1Tap2) + fixed (byte* tapPtr = &MemoryMarshal.GetReference(Tap1Tap2)) { - index += (int)blocks * BlockSize; - var localBufferPtr = bufferPtr; + byte* localBufferPtr = bufferPtr; // _mm_setr_epi8 on x86 Vector128 tap1 = Sse2.LoadVector128((sbyte*)tapPtr); @@ -185,7 +184,7 @@ public static unsafe uint CalculateAvx2(uint adler, ReadOnlySpan buffer) uint s2 = (adler >> 16) & 0xFFFF; uint length = (uint)buffer.Length; - fixed (byte* bufferPtr = buffer) + fixed (byte* bufferPtr = &MemoryMarshal.GetReference(buffer)) { byte* localBufferPtr = bufferPtr;