From 5416edb6bcac75e5c099e2f2f8496c8949475e68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Thu, 27 Jul 2023 21:22:09 +0200 Subject: [PATCH 1/2] Vectorize TrimTransparentPixels in GifEncoderCore --- src/ImageSharp/Formats/Gif/GifEncoderCore.cs | 133 ++++++++++++++++++- 1 file changed, 126 insertions(+), 7 deletions(-) diff --git a/src/ImageSharp/Formats/Gif/GifEncoderCore.cs b/src/ImageSharp/Formats/Gif/GifEncoderCore.cs index a66bc58960..5d18823274 100644 --- a/src/ImageSharp/Formats/Gif/GifEncoderCore.cs +++ b/src/ImageSharp/Formats/Gif/GifEncoderCore.cs @@ -412,23 +412,142 @@ private static Buffer2DRegion TrimTransparentPixels(Buffer2D buffer, int bottom = int.MaxValue; int left = int.MaxValue; int right = int.MinValue; - - // Run through th buffer in a single pass. Use variables to track the min/max values. int minY = -1; bool isTransparentRow = true; + + // Run through the buffer in a single pass. Use variables to track the min/max values. for (int y = 0; y < buffer.Height; y++) { isTransparentRow = true; Span rowSpan = buffer.DangerousGetRowSpan(y); + ref byte rowPtr = ref MemoryMarshal.GetReference(rowSpan); + nint rowLength = (nint)(uint)rowSpan.Length; + nint x = 0; + +#if NET7_0_OR_GREATER + if (Vector128.IsHardwareAccelerated && rowLength >= Vector128.Count) + { + Vector256 trimmableVec256 = Vector256.Create(trimmableIndex); + + if (Vector256.IsHardwareAccelerated && rowLength >= Vector256.Count) + { + do + { + Vector256 vec = Vector256.LoadUnsafe(ref rowPtr, (nuint)x); + Vector256 notEquals = ~Vector256.Equals(vec, trimmableVec256); + + if (notEquals != Vector256.Zero) + { + isTransparentRow = false; + uint mask = notEquals.ExtractMostSignificantBits(); + nint start = x + (nint)uint.TrailingZeroCount(mask); + nint end = (nint)uint.LeadingZeroCount(mask); + + // end is from the end, but we need the index from the beginning + end = x + Vector256.Count - 1 - end; + + left = Math.Min(left, (int)start); + right = Math.Max(right, 
(int)end); + } + + x += Vector256.Count; + } + while (x <= rowLength - Vector256.Count); + } + + Vector128 trimmableVec = Vector256.IsHardwareAccelerated + ? trimmableVec256.GetLower() + : Vector128.Create(trimmableIndex); + + while (x <= rowLength - Vector128.Count) + { + Vector128 vec = Vector128.LoadUnsafe(ref rowPtr, (nuint)x); + Vector128 notEquals = ~Vector128.Equals(vec, trimmableVec); + + if (notEquals != Vector128.Zero) + { + isTransparentRow = false; + uint mask = notEquals.ExtractMostSignificantBits(); + nint start = x + (nint)uint.TrailingZeroCount(mask); + nint end = (nint)uint.LeadingZeroCount(mask) - Vector128.Count; + + // end is from the end, but we need the index from the beginning + end = x + Vector128.Count - 1 - end; + + left = Math.Min(left, (int)start); + right = Math.Max(right, (int)end); + } + + x += Vector128.Count; + } + } +#else + if (Sse41.IsSupported && rowLength >= Vector128.Count) + { + Vector256 trimmableVec256 = Vector256.Create(trimmableIndex); + + if (Avx2.IsSupported && rowLength >= Vector256.Count) + { + do + { + Vector256 vec = Unsafe.ReadUnaligned>(ref Unsafe.Add(ref rowPtr, x)); + Vector256 notEquals = Avx2.CompareEqual(vec, trimmableVec256); + notEquals = Avx2.Xor(notEquals, Vector256.AllBitsSet); + + if (!Avx.TestZ(notEquals, notEquals)) + { + isTransparentRow = false; + int mask = Avx2.MoveMask(notEquals); + nint start = x + (nint)(uint)BitOperations.TrailingZeroCount(mask); + nint end = (nint)(uint)BitOperations.LeadingZeroCount((uint)mask); + + // end is from the end, but we need the index from the beginning + end = x + Vector256.Count - 1 - end; + + left = Math.Min(left, (int)start); + right = Math.Max(right, (int)end); + } + + x += Vector256.Count; + } + while (x <= rowLength - Vector256.Count); + } + + Vector128 trimmableVec = Sse41.IsSupported + ? 
trimmableVec256.GetLower() + : Vector128.Create(trimmableIndex); + + while (x <= rowLength - Vector128.Count) + { + Vector128 vec = Unsafe.ReadUnaligned>(ref Unsafe.Add(ref rowPtr, x)); + Vector128 notEquals = Sse2.CompareEqual(vec, trimmableVec); + notEquals = Sse2.Xor(notEquals, Vector128.AllBitsSet); + + if (!Sse41.TestZ(notEquals, notEquals)) + { + isTransparentRow = false; + int mask = Sse2.MoveMask(notEquals); + nint start = x + (nint)(uint)BitOperations.TrailingZeroCount(mask); + nint end = (nint)(uint)BitOperations.LeadingZeroCount((uint)mask) - Vector128.Count; - // TODO: It may be possible to optimize this inner loop using SIMD. - for (int x = 0; x < rowSpan.Length; x++) + // end is from the end, but we need the index from the beginning + end = x + Vector128.Count - 1 - end; + + left = Math.Min(left, (int)start); + right = Math.Max(right, (int)end); + } + + x += Vector128.Count; + } + } +#endif + for (; x < rowLength; ++x) { - if (rowSpan[x] != trimmableIndex) + if (Unsafe.Add(ref rowPtr, x) != trimmableIndex) { isTransparentRow = false; - left = Math.Min(left, x); - right = Math.Max(right, x); + left = Math.Min(left, (int)x); + right = Math.Max(right, (int)x); } } From c8f1f2c89df590e5a8804ddce21c03696fd8b468 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Foidl?= Date: Thu, 27 Jul 2023 22:26:20 +0200 Subject: [PATCH 2/2] Simplified check if there are any non-equal bytes I remembered movemask as being slower than ptest (TestZ in .NET terms), but current benchmarks did not confirm this, and Intel's instruction tables show no benefit in latency or throughput either. Thus the check is simplified to test the extracted mask directly. 
--- src/ImageSharp/Formats/Gif/GifEncoderCore.cs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Gif/GifEncoderCore.cs b/src/ImageSharp/Formats/Gif/GifEncoderCore.cs index 5d18823274..be08c0da90 100644 --- a/src/ImageSharp/Formats/Gif/GifEncoderCore.cs +++ b/src/ImageSharp/Formats/Gif/GifEncoderCore.cs @@ -435,11 +435,11 @@ private static Buffer2DRegion TrimTransparentPixels(Buffer2D buffer, { Vector256 vec = Vector256.LoadUnsafe(ref rowPtr, (nuint)x); Vector256 notEquals = ~Vector256.Equals(vec, trimmableVec256); + uint mask = notEquals.ExtractMostSignificantBits(); - if (notEquals != Vector256.Zero) + if (mask != 0) { isTransparentRow = false; - uint mask = notEquals.ExtractMostSignificantBits(); nint start = x + (nint)uint.TrailingZeroCount(mask); nint end = (nint)uint.LeadingZeroCount(mask); @@ -463,11 +463,11 @@ private static Buffer2DRegion TrimTransparentPixels(Buffer2D buffer, { Vector128 vec = Vector128.LoadUnsafe(ref rowPtr, (nuint)x); Vector128 notEquals = ~Vector128.Equals(vec, trimmableVec); + uint mask = notEquals.ExtractMostSignificantBits(); - if (notEquals != Vector128.Zero) + if (mask != 0) { isTransparentRow = false; - uint mask = notEquals.ExtractMostSignificantBits(); nint start = x + (nint)uint.TrailingZeroCount(mask); nint end = (nint)uint.LeadingZeroCount(mask) - Vector128.Count; @@ -493,11 +493,11 @@ private static Buffer2DRegion TrimTransparentPixels(Buffer2D buffer, Vector256 vec = Unsafe.ReadUnaligned>(ref Unsafe.Add(ref rowPtr, x)); Vector256 notEquals = Avx2.CompareEqual(vec, trimmableVec256); notEquals = Avx2.Xor(notEquals, Vector256.AllBitsSet); + int mask = Avx2.MoveMask(notEquals); - if (!Avx.TestZ(notEquals, notEquals)) + if (mask != 0) { isTransparentRow = false; - int mask = Avx2.MoveMask(notEquals); nint start = x + (nint)(uint)BitOperations.TrailingZeroCount(mask); nint end = (nint)(uint)BitOperations.LeadingZeroCount((uint)mask); @@ -522,11 +522,11 @@ private static 
Buffer2DRegion TrimTransparentPixels(Buffer2D buffer, Vector128 vec = Unsafe.ReadUnaligned>(ref Unsafe.Add(ref rowPtr, x)); Vector128 notEquals = Sse2.CompareEqual(vec, trimmableVec); notEquals = Sse2.Xor(notEquals, Vector128.AllBitsSet); + int mask = Sse2.MoveMask(notEquals); - if (!Sse41.TestZ(notEquals, notEquals)) + if (mask != 0) { isTransparentRow = false; - int mask = Sse2.MoveMask(notEquals); nint start = x + (nint)(uint)BitOperations.TrailingZeroCount(mask); nint end = (nint)(uint)BitOperations.LeadingZeroCount((uint)mask) - Vector128.Count;