From c76518b114673cc1e8ff6d6f574978ad2a970b21 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 15 Nov 2021 15:02:51 +0100 Subject: [PATCH 1/7] Add AVX version of TransformColor --- .../Formats/Webp/Lossless/LosslessUtils.cs | 39 ++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index f9b97c6c44..9a6d974bdd 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -42,8 +42,12 @@ internal static unsafe class LosslessUtils private static readonly Vector128 TransformColorAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + private static readonly Vector256 TransformColorAlphaGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + private static readonly Vector128 TransformColorRedBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); + private static readonly Vector256 TransformColorRedBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); + private static readonly byte TransformColorShuffleMask = SimdUtils.Shuffle.MmShuffle(2, 2, 0, 0); private static readonly Vector128 TransformColorInverseAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); @@ -408,7 +412,37 @@ public static void ColorSpaceInverseTransform(Vp8LTransform transform, Span data, int numPixels) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Sse2.IsSupported) + if (Avx2.IsSupported && numPixels >= 8) + { + Vector256 multsrb = MkCst32(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); + Vector256 multsb2 = MkCst32(Cst5b(m.RedToBlue), 0); + fixed (uint* src = data) + { + int idx; + for (idx = 0; idx + 8 <= numPixels; idx += 8) + { + uint* pos = src + idx; + Vector256 input = Avx.LoadVector256(pos); + Vector256 a = Avx2.And(input.AsByte(), TransformColorAlphaGreenMask256); + Vector256 b = Avx2.ShuffleLow(a.AsInt16(), TransformColorShuffleMask); + Vector256 c = Avx2.ShuffleHigh(b.AsInt16(), TransformColorShuffleMask); + Vector256 d = Avx2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); + Vector256 e = Avx2.ShiftLeftLogical(input.AsInt16(), 8); + Vector256 f = Avx2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16()); + Vector256 g = Avx2.ShiftRightLogical(f.AsInt32(), 16); + Vector256 h = Avx2.Add(g.AsByte(), d.AsByte()); + Vector256 i = Avx2.And(h, TransformColorRedBlueMask256); + Vector256 output = Avx2.Subtract(input.AsByte(), i); + Avx.Store((byte*)pos, output); + } + + if (idx != numPixels) + { + TransformColorNoneVectorized(m, data.Slice(idx), numPixels - idx); + } + } + } + else if (Sse2.IsSupported) { Vector128 multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector128 multsb2 = MkCst16(Cst5b(m.RedToBlue), 0); @@ -1288,6 +1322,9 @@ private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2) #if SUPPORTS_RUNTIME_INTRINSICS [MethodImpl(InliningOptions.ShortMethod)] private static Vector128 MkCst16(int hi, int lo) => Vector128.Create((hi << 16) | (lo & 0xffff)); + + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector256 MkCst32(int hi, int lo) => Vector256.Create((hi << 16) | (lo & 0xffff)); #endif private static uint Select(uint a, uint b, uint c, Span scratch) From e67ad60e8d2a42d246d785f0ebe91d92b2183aff Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 15 Nov 2021 15:13:56 +0100 Subject: [PATCH 2/7] Add AVX version of TransformColorInverse --- .../Formats/Webp/Lossless/LosslessUtils.cs | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 9a6d974bdd..94ad343c83 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -52,6 +52,8 @@ internal static unsafe class LosslessUtils private static readonly Vector128 TransformColorInverseAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + private static readonly Vector256 TransformColorInverseAlphaGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + private static readonly byte TransformColorInverseShuffleMask = SimdUtils.Shuffle.MmShuffle(2, 2, 0, 0); #endif @@ -505,7 +507,38 @@ private static void TransformColorNoneVectorized(Vp8LMultipliers m, Span d public static void TransformColorInverse(Vp8LMultipliers m, Span pixelData) { #if SUPPORTS_RUNTIME_INTRINSICS - if (Sse2.IsSupported) + if (Avx2.IsSupported && pixelData.Length >= 8) + { + Vector256 multsrb = MkCst32(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); + Vector256 multsb2 = MkCst32(Cst5b(m.RedToBlue), 0); + fixed (uint* src = pixelData) + { + int idx; + for (idx = 0; idx + 8 <= pixelData.Length; idx += 8) + { + uint* pos = src + idx; + Vector256 input = Avx.LoadVector256(pos); + Vector256 a = Avx2.And(input.AsByte(), TransformColorInverseAlphaGreenMask256); + Vector256 b = Avx2.ShuffleLow(a.AsInt16(), TransformColorInverseShuffleMask); + Vector256 c = Avx2.ShuffleHigh(b.AsInt16(), TransformColorInverseShuffleMask); + Vector256 d = Avx2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); + Vector256 e = Avx2.Add(input.AsByte(), d.AsByte()); + Vector256 f = Avx2.ShiftLeftLogical(e.AsInt16(), 8); + Vector256 g = Avx2.MultiplyHigh(f, multsb2.AsInt16()); + Vector256 h = Avx2.ShiftRightLogical(g.AsInt32(), 8); + Vector256 i = Avx2.Add(h.AsByte(), f.AsByte()); + Vector256 j = Avx2.ShiftRightLogical(i.AsInt16(), 8); + Vector256 output = Avx2.Or(j.AsByte(), a); + Avx.Store((byte*)pos, output); + } + + if (idx != pixelData.Length) + { + TransformColorInverseNoneVectorized(m, pixelData.Slice(idx)); + } + } + } + else if (Sse2.IsSupported) { Vector128 multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector128 multsb2 = MkCst16(Cst5b(m.RedToBlue), 0); From 8e5645912cd0b711e865fa8c47ffdbe5be4f83de Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 15 Nov 2021 15:19:07 +0100 Subject: [PATCH 3/7] Add AVX tests --- tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs index c70f332ef6..97567ba218 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs @@ -257,11 +257,17 @@ private static void RunPredictor13Test() [Fact] public void TransformColor_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformColorTest, HwIntrinsics.DisableSSE2); + [Fact] + public void TransformColor_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformColorTest, HwIntrinsics.DisableAVX2); + [Fact] public void TransformColorInverse_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformColorInverseTest, HwIntrinsics.AllowAll); [Fact] public void TransformColorInverse_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformColorInverseTest, HwIntrinsics.DisableSSE2); + + [Fact] + public void TransformColorInverse_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformColorInverseTest, HwIntrinsics.DisableAVX2); #endif } } From b15a021fac71d9643855e4c52e19d955bfd54daa Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Mon, 15 Nov 2021 16:14:51 +0100 Subject: [PATCH 4/7] Avoid pinning --- .../Formats/Webp/Lossless/LosslessUtils.cs | 341 ++++++++---------- 1 file changed, 156 insertions(+), 185 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 94ad343c83..c202ad4a8b 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -128,66 +128,57 @@ public static void AddGreenToBlueAndRed(Span pixelData) if (Avx2.IsSupported) { int numPixels = pixelData.Length; - fixed (uint* p = pixelData) + int i; + for (i = 0; i + 8 <= numPixels; i += 8) { - int i; - for (i = 0; i + 8 <= numPixels; i += 8) - { - uint* idx = p + i; - Vector256 input = Avx.LoadVector256((ushort*)idx).AsByte(); - Vector256 in0g0g = Avx2.Shuffle(input, AddGreenToBlueAndRedMaskAvx2); - Vector256 output = Avx2.Add(input, in0g0g); - Avx.Store((byte*)idx, output); - } + ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); + Vector256 input = Unsafe.As>(ref pos).AsByte(); + Vector256 in0g0g = Avx2.Shuffle(input, AddGreenToBlueAndRedMaskAvx2); + Vector256 output = Avx2.Add(input, in0g0g); + Unsafe.As>(ref pos) = output.AsUInt32(); + } - if (i != numPixels) - { - AddGreenToBlueAndRedNoneVectorized(pixelData.Slice(i)); - } + if (i != numPixels) + { + AddGreenToBlueAndRedNoneVectorized(pixelData.Slice(i)); } } else if (Ssse3.IsSupported) { int numPixels = pixelData.Length; - fixed (uint* p = pixelData) + int i; + for (i = 0; i + 4 <= numPixels; i += 4) { - int i; - for (i = 0; i + 4 <= numPixels; i += 4) - { - uint* idx = p + i; - Vector128 input = Sse2.LoadVector128((ushort*)idx).AsByte(); - Vector128 in0g0g = Ssse3.Shuffle(input, AddGreenToBlueAndRedMaskSsse3); - Vector128 output = Sse2.Add(input, in0g0g); - Sse2.Store((byte*)idx, output.AsByte()); - } + ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); + Vector128 input = Unsafe.As>(ref pos).AsByte(); + Vector128 in0g0g = Ssse3.Shuffle(input, AddGreenToBlueAndRedMaskSsse3); + Vector128 output = Sse2.Add(input, in0g0g); + Unsafe.As>(ref pos) = output.AsUInt32(); + } - if (i != numPixels) - { - AddGreenToBlueAndRedNoneVectorized(pixelData.Slice(i)); - } + if (i != numPixels) + { + AddGreenToBlueAndRedNoneVectorized(pixelData.Slice(i)); } } else if (Sse2.IsSupported) { int numPixels = pixelData.Length; - fixed (uint* p = pixelData) + int i; + for (i = 0; i + 4 <= numPixels; i += 4) { - int i; - for (i = 0; i + 4 <= numPixels; i += 4) - { - uint* idx = p + i; - Vector128 input = Sse2.LoadVector128((ushort*)idx); - Vector128 a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g - Vector128 b = Sse2.ShuffleLow(a, AddGreenToBlueAndRedShuffleMask); - Vector128 c = Sse2.ShuffleHigh(b, AddGreenToBlueAndRedShuffleMask); // 0g0g - Vector128 output = Sse2.Add(input.AsByte(), c.AsByte()); - Sse2.Store((byte*)idx, output); - } + ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); + Vector128 input = Unsafe.As>(ref pos).AsByte(); + Vector128 a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g + Vector128 b = Sse2.ShuffleLow(a, AddGreenToBlueAndRedShuffleMask); + Vector128 c = Sse2.ShuffleHigh(b, AddGreenToBlueAndRedShuffleMask); // 0g0g + Vector128 output = Sse2.Add(input.AsByte(), c.AsByte()); + Unsafe.As>(ref pos) = output.AsUInt32(); + } - if (i != numPixels) - { - AddGreenToBlueAndRedNoneVectorized(pixelData.Slice(i)); - } + if (i != numPixels) + { + AddGreenToBlueAndRedNoneVectorized(pixelData.Slice(i)); } } else @@ -217,66 +208,57 @@ public static void SubtractGreenFromBlueAndRed(Span pixelData) if (Avx2.IsSupported) { int numPixels = pixelData.Length; - fixed (uint* p = pixelData) + int i; + for (i = 0; i + 8 <= numPixels; i += 8) { - int i; - for (i = 0; i + 8 <= numPixels; i += 8) - { - uint* idx = p + i; - Vector256 input = Avx.LoadVector256((ushort*)idx).AsByte(); - Vector256 in0g0g = Avx2.Shuffle(input, SubtractGreenFromBlueAndRedMaskAvx2); - Vector256 output = Avx2.Subtract(input, in0g0g); - Avx.Store((byte*)idx, output); - } + ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); + Vector256 input = Unsafe.As>(ref pos).AsByte(); + Vector256 in0g0g = Avx2.Shuffle(input, SubtractGreenFromBlueAndRedMaskAvx2); + Vector256 output = Avx2.Subtract(input, in0g0g); + Unsafe.As>(ref pos) = output.AsUInt32(); + } - if (i != numPixels) - { - SubtractGreenFromBlueAndRedNoneVectorized(pixelData.Slice(i)); - } + if (i != numPixels) + { + SubtractGreenFromBlueAndRedNoneVectorized(pixelData.Slice(i)); } } else if (Ssse3.IsSupported) { int numPixels = pixelData.Length; - fixed (uint* p = pixelData) + int i; + for (i = 0; i + 4 <= numPixels; i += 4) { - int i; - for (i = 0; i + 4 <= numPixels; i += 4) - { - uint* idx = p + i; - Vector128 input = Sse2.LoadVector128((ushort*)idx).AsByte(); - Vector128 in0g0g = Ssse3.Shuffle(input, SubtractGreenFromBlueAndRedMaskSsse3); - Vector128 output = Sse2.Subtract(input, in0g0g); - Sse2.Store((byte*)idx, output.AsByte()); - } + ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); + Vector128 input = Unsafe.As>(ref pos).AsByte(); + Vector128 in0g0g = Ssse3.Shuffle(input, SubtractGreenFromBlueAndRedMaskSsse3); + Vector128 output = Sse2.Subtract(input, in0g0g); + Unsafe.As>(ref pos) = output.AsUInt32(); + } - if (i != numPixels) - { - SubtractGreenFromBlueAndRedNoneVectorized(pixelData.Slice(i)); - } + if (i != numPixels) + { + SubtractGreenFromBlueAndRedNoneVectorized(pixelData.Slice(i)); } } else if (Sse2.IsSupported) { int numPixels = pixelData.Length; - fixed (uint* p = pixelData) + int i; + for (i = 0; i + 4 <= numPixels; i += 4) { - int i; - for (i = 0; i + 4 <= numPixels; i += 4) - { - uint* idx = p + i; - Vector128 input = Sse2.LoadVector128((ushort*)idx); - Vector128 a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g - Vector128 b = Sse2.ShuffleLow(a, SubtractGreenFromBlueAndRedShuffleMask); - Vector128 c = Sse2.ShuffleHigh(b, SubtractGreenFromBlueAndRedShuffleMask); // 0g0g - Vector128 output = Sse2.Subtract(input.AsByte(), c.AsByte()); - Sse2.Store((byte*)idx, output); - } + ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); + Vector128 input = Unsafe.As>(ref pos).AsByte(); + Vector128 a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g + Vector128 b = Sse2.ShuffleLow(a, SubtractGreenFromBlueAndRedShuffleMask); + Vector128 c = Sse2.ShuffleHigh(b, SubtractGreenFromBlueAndRedShuffleMask); // 0g0g + Vector128 output = Sse2.Subtract(input.AsByte(), c.AsByte()); + Unsafe.As>(ref pos) = output.AsUInt32(); + } - if (i != numPixels) - { - SubtractGreenFromBlueAndRedNoneVectorized(pixelData.Slice(i)); - } + if (i != numPixels) + { + SubtractGreenFromBlueAndRedNoneVectorized(pixelData.Slice(i)); } } else @@ -409,75 +391,70 @@ public static void ColorSpaceInverseTransform(Vp8LTransform transform, Span /// The Vp8LMultipliers. - /// The pixel data to transform. + /// The pixel data to transform. /// The number of pixels to process. - public static void TransformColor(Vp8LMultipliers m, Span data, int numPixels) + public static void TransformColor(Vp8LMultipliers m, Span pixelData, int numPixels) { #if SUPPORTS_RUNTIME_INTRINSICS if (Avx2.IsSupported && numPixels >= 8) { Vector256 multsrb = MkCst32(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector256 multsb2 = MkCst32(Cst5b(m.RedToBlue), 0); - fixed (uint* src = data) + + int idx; + for (idx = 0; idx + 8 <= numPixels; idx += 8) { - int idx; - for (idx = 0; idx + 8 <= numPixels; idx += 8) - { - uint* pos = src + idx; - Vector256 input = Avx.LoadVector256(pos); - Vector256 a = Avx2.And(input.AsByte(), TransformColorAlphaGreenMask256); - Vector256 b = Avx2.ShuffleLow(a.AsInt16(), TransformColorShuffleMask); - Vector256 c = Avx2.ShuffleHigh(b.AsInt16(), TransformColorShuffleMask); - Vector256 d = Avx2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); - Vector256 e = Avx2.ShiftLeftLogical(input.AsInt16(), 8); - Vector256 f = Avx2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16()); - Vector256 g = Avx2.ShiftRightLogical(f.AsInt32(), 16); - Vector256 h = Avx2.Add(g.AsByte(), d.AsByte()); - Vector256 i = Avx2.And(h, TransformColorRedBlueMask256); - Vector256 output = Avx2.Subtract(input.AsByte(), i); - Avx.Store((byte*)pos, output); - } + ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); + Vector256 input = Unsafe.As>(ref pos); + Vector256 a = Avx2.And(input.AsByte(), TransformColorAlphaGreenMask256); + Vector256 b = Avx2.ShuffleLow(a.AsInt16(), TransformColorShuffleMask); + Vector256 c = Avx2.ShuffleHigh(b.AsInt16(), TransformColorShuffleMask); + Vector256 d = Avx2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); + Vector256 e = Avx2.ShiftLeftLogical(input.AsInt16(), 8); + Vector256 f = Avx2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16()); + Vector256 g = Avx2.ShiftRightLogical(f.AsInt32(), 16); + Vector256 h = Avx2.Add(g.AsByte(), d.AsByte()); + Vector256 i = Avx2.And(h, TransformColorRedBlueMask256); + Vector256 output = Avx2.Subtract(input.AsByte(), i); + Unsafe.As>(ref pos) = output.AsUInt32(); + } - if (idx != numPixels) - { - TransformColorNoneVectorized(m, data.Slice(idx), numPixels - idx); - } + if (idx != numPixels) + { + TransformColorNoneVectorized(m, pixelData.Slice(idx), numPixels - idx); } } else if (Sse2.IsSupported) { Vector128 multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector128 multsb2 = MkCst16(Cst5b(m.RedToBlue), 0); - fixed (uint* src = data) + int idx; + for (idx = 0; idx + 4 <= numPixels; idx += 4) { - int idx; - for (idx = 0; idx + 4 <= numPixels; idx += 4) - { - uint* pos = src + idx; - Vector128 input = Sse2.LoadVector128(pos); - Vector128 a = Sse2.And(input.AsByte(), TransformColorAlphaGreenMask); - Vector128 b = Sse2.ShuffleLow(a.AsInt16(), TransformColorShuffleMask); - Vector128 c = Sse2.ShuffleHigh(b.AsInt16(), TransformColorShuffleMask); - Vector128 d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); - Vector128 e = Sse2.ShiftLeftLogical(input.AsInt16(), 8); - Vector128 f = Sse2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16()); - Vector128 g = Sse2.ShiftRightLogical(f.AsInt32(), 16); - Vector128 h = Sse2.Add(g.AsByte(), d.AsByte()); - Vector128 i = Sse2.And(h, TransformColorRedBlueMask); - Vector128 output = Sse2.Subtract(input.AsByte(), i); - Sse2.Store((byte*)pos, output); - } + ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); + Vector128 input = Unsafe.As>(ref pos); + Vector128 a = Sse2.And(input.AsByte(), TransformColorAlphaGreenMask); + Vector128 b = Sse2.ShuffleLow(a.AsInt16(), TransformColorShuffleMask); + Vector128 c = Sse2.ShuffleHigh(b.AsInt16(), TransformColorShuffleMask); + Vector128 d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); + Vector128 e = Sse2.ShiftLeftLogical(input.AsInt16(), 8); + Vector128 f = Sse2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16()); + Vector128 g = Sse2.ShiftRightLogical(f.AsInt32(), 16); + Vector128 h = Sse2.Add(g.AsByte(), d.AsByte()); + Vector128 i = Sse2.And(h, TransformColorRedBlueMask); + Vector128 output = Sse2.Subtract(input.AsByte(), i); + Unsafe.As>(ref pos) = output.AsUInt32(); + } - if (idx != numPixels) - { - TransformColorNoneVectorized(m, data.Slice(idx), numPixels - idx); - } + if (idx != numPixels) + { + TransformColorNoneVectorized(m, pixelData.Slice(idx), numPixels - idx); } } else #endif { - TransformColorNoneVectorized(m, data, numPixels); + TransformColorNoneVectorized(m, pixelData, numPixels); } } @@ -511,62 +488,57 @@ public static void TransformColorInverse(Vp8LMultipliers m, Span pixelData { Vector256 multsrb = MkCst32(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector256 multsb2 = MkCst32(Cst5b(m.RedToBlue), 0); - fixed (uint* src = pixelData) + int idx; + for (idx = 0; idx + 8 <= pixelData.Length; idx += 8) { - int idx; - for (idx = 0; idx + 8 <= pixelData.Length; idx += 8) - { - uint* pos = src + idx; - Vector256 input = Avx.LoadVector256(pos); - Vector256 a = Avx2.And(input.AsByte(), TransformColorInverseAlphaGreenMask256); - Vector256 b = Avx2.ShuffleLow(a.AsInt16(), TransformColorInverseShuffleMask); - Vector256 c = Avx2.ShuffleHigh(b.AsInt16(), TransformColorInverseShuffleMask); - Vector256 d = Avx2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); - Vector256 e = Avx2.Add(input.AsByte(), d.AsByte()); - Vector256 f = Avx2.ShiftLeftLogical(e.AsInt16(), 8); - Vector256 g = Avx2.MultiplyHigh(f, multsb2.AsInt16()); - Vector256 h = Avx2.ShiftRightLogical(g.AsInt32(), 8); - Vector256 i = Avx2.Add(h.AsByte(), f.AsByte()); - Vector256 j = Avx2.ShiftRightLogical(i.AsInt16(), 8); - Vector256 output = Avx2.Or(j.AsByte(), a); - Avx.Store((byte*)pos, output); - } + ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); + Vector256 input = Unsafe.As>(ref pos); + Vector256 a = Avx2.And(input.AsByte(), TransformColorInverseAlphaGreenMask256); + Vector256 b = Avx2.ShuffleLow(a.AsInt16(), TransformColorInverseShuffleMask); + Vector256 c = Avx2.ShuffleHigh(b.AsInt16(), TransformColorInverseShuffleMask); + Vector256 d = Avx2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); + Vector256 e = Avx2.Add(input.AsByte(), d.AsByte()); + Vector256 f = Avx2.ShiftLeftLogical(e.AsInt16(), 8); + Vector256 g = Avx2.MultiplyHigh(f, multsb2.AsInt16()); + Vector256 h = Avx2.ShiftRightLogical(g.AsInt32(), 8); + Vector256 i = Avx2.Add(h.AsByte(), f.AsByte()); + Vector256 j = Avx2.ShiftRightLogical(i.AsInt16(), 8); + Vector256 output = Avx2.Or(j.AsByte(), a); + Unsafe.As>(ref pos) = output.AsUInt32(); + } - if (idx != pixelData.Length) - { - TransformColorInverseNoneVectorized(m, pixelData.Slice(idx)); - } + if (idx != pixelData.Length) + { + TransformColorInverseNoneVectorized(m, pixelData.Slice(idx)); } } else if (Sse2.IsSupported) { Vector128 multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector128 multsb2 = MkCst16(Cst5b(m.RedToBlue), 0); - fixed (uint* src = pixelData) + + int idx; + for (idx = 0; idx + 4 <= pixelData.Length; idx += 4) { - int idx; - for (idx = 0; idx + 4 <= pixelData.Length; idx += 4) - { - uint* pos = src + idx; - Vector128 input = Sse2.LoadVector128(pos); - Vector128 a = Sse2.And(input.AsByte(), TransformColorInverseAlphaGreenMask); - Vector128 b = Sse2.ShuffleLow(a.AsInt16(), TransformColorInverseShuffleMask); - Vector128 c = Sse2.ShuffleHigh(b.AsInt16(), TransformColorInverseShuffleMask); - Vector128 d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); - Vector128 e = Sse2.Add(input.AsByte(), d.AsByte()); - Vector128 f = Sse2.ShiftLeftLogical(e.AsInt16(), 8); - Vector128 g = Sse2.MultiplyHigh(f, multsb2.AsInt16()); - Vector128 h = Sse2.ShiftRightLogical(g.AsInt32(), 8); - Vector128 i = Sse2.Add(h.AsByte(), f.AsByte()); - Vector128 j = Sse2.ShiftRightLogical(i.AsInt16(), 8); - Vector128 output = Sse2.Or(j.AsByte(), a); - Sse2.Store((byte*)pos, output); - } + ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); + Vector128 input = Unsafe.As>(ref pos); + Vector128 a = Sse2.And(input.AsByte(), TransformColorInverseAlphaGreenMask); + Vector128 b = Sse2.ShuffleLow(a.AsInt16(), TransformColorInverseShuffleMask); + Vector128 c = Sse2.ShuffleHigh(b.AsInt16(), TransformColorInverseShuffleMask); + Vector128 d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); + Vector128 e = Sse2.Add(input.AsByte(), d.AsByte()); + Vector128 f = Sse2.ShiftLeftLogical(e.AsInt16(), 8); + Vector128 g = Sse2.MultiplyHigh(f, multsb2.AsInt16()); + Vector128 h = Sse2.ShiftRightLogical(g.AsInt32(), 8); + Vector128 i = Sse2.Add(h.AsByte(), f.AsByte()); + Vector128 j = Sse2.ShiftRightLogical(i.AsInt16(), 8); + Vector128 output = Sse2.Or(j.AsByte(), a); + Unsafe.As>(ref pos) = output.AsUInt32(); + } - if (idx != pixelData.Length) - { - TransformColorInverseNoneVectorized(m, pixelData.Slice(idx)); - } + if (idx != pixelData.Length) + { + TransformColorInverseNoneVectorized(m, pixelData.Slice(idx)); } } else @@ -885,15 +857,14 @@ private static float FastSLog2Slow(uint v) int correction = (int)((23 * (origV & (y - 1))) >> 4); return (vF * (WebpLookupTables.Log2Table[v] + logCnt)) + correction; } - else - { - return (float)(Log2Reciprocal * v * Math.Log(v)); - } + + return (float)(Log2Reciprocal * v * Math.Log(v)); } private static float FastLog2Slow(uint v) { Guard.MustBeGreaterThanOrEqualTo(v, LogLookupIdxMax, nameof(v)); + if (v < ApproxLogWithCorrectionMax) { int logCnt = 0; From ff77361e7c8277c5eddd71614dcbd808e22360cf Mon Sep 17 00:00:00 2001 From: Brian Popow <38701097+brianpopow@users.noreply.github.com> Date: Mon, 15 Nov 2021 20:00:25 +0100 Subject: [PATCH 5/7] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Günther Foidl --- .../Formats/Webp/Lossless/LosslessUtils.cs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index c202ad4a8b..5903ba9a29 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -147,7 +147,7 @@ public static void AddGreenToBlueAndRed(Span pixelData) { int numPixels = pixelData.Length; int i; - for (i = 0; i + 4 <= numPixels; i += 4) + for (i = 0; i <= numPixels - 4; i += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector128 input = Unsafe.As>(ref pos).AsByte(); @@ -165,7 +165,7 @@ public static void AddGreenToBlueAndRed(Span pixelData) { int numPixels = pixelData.Length; int i; - for (i = 0; i + 4 <= numPixels; i += 4) + for (i = 0; i <= numPixels - 4; i += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector128 input = Unsafe.As>(ref pos).AsByte(); @@ -209,7 +209,7 @@ public static void SubtractGreenFromBlueAndRed(Span pixelData) { int numPixels = pixelData.Length; int i; - for (i = 0; i + 8 <= numPixels; i += 8) + for (i = 0; i <= numPixels - 8; i += 8) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector256 input = Unsafe.As>(ref pos).AsByte(); @@ -227,7 +227,7 @@ public static void SubtractGreenFromBlueAndRed(Span pixelData) { int numPixels = pixelData.Length; int i; - for (i = 0; i + 4 <= numPixels; i += 4) + for (i = 0; i <= numPixels - 4; i += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector128 input = Unsafe.As>(ref pos).AsByte(); @@ -245,7 +245,7 @@ public static void SubtractGreenFromBlueAndRed(Span pixelData) { int numPixels = pixelData.Length; int i; - for (i = 0; i + 4 <= numPixels; i += 4) + for (i = 0; i <= numPixels - 4; i += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector128 input = Unsafe.As>(ref pos).AsByte(); @@ -402,7 +402,7 @@ public static void TransformColor(Vp8LMultipliers m, Span pixelData, int n Vector256 multsb2 = MkCst32(Cst5b(m.RedToBlue), 0); int idx; - for (idx = 0; idx + 8 <= numPixels; idx += 8) + for (idx = 0; idx <= numPixels - 8; idx += 8) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); Vector256 input = Unsafe.As>(ref pos); @@ -429,7 +429,7 @@ public static void TransformColor(Vp8LMultipliers m, Span pixelData, int n Vector128 multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector128 multsb2 = MkCst16(Cst5b(m.RedToBlue), 0); int idx; - for (idx = 0; idx + 4 <= numPixels; idx += 4) + for (idx = 0; idx <= numPixels - 4; idx += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); Vector128 input = Unsafe.As>(ref pos); @@ -489,7 +489,7 @@ public static void TransformColorInverse(Vp8LMultipliers m, Span pixelData Vector256 multsrb = MkCst32(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector256 multsb2 = MkCst32(Cst5b(m.RedToBlue), 0); int idx; - for (idx = 0; idx + 8 <= pixelData.Length; idx += 8) + for (idx = 0; idx <= pixelData.Length - 8; idx += 8) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); Vector256 input = Unsafe.As>(ref pos); @@ -518,7 +518,7 @@ public static void TransformColorInverse(Vp8LMultipliers m, Span pixelData Vector128 multsb2 = MkCst16(Cst5b(m.RedToBlue), 0); int idx; - for (idx = 0; idx + 4 <= pixelData.Length; idx += 4) + for (idx = 0; idx <= pixelData.Length - 4; idx += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); Vector128 input = Unsafe.As>(ref pos); From b53aab44b36a1d9d6c90457c724d3e53d39d90ba Mon Sep 17 00:00:00 2001 From: Brian Popow <38701097+brianpopow@users.noreply.github.com> Date: Mon, 15 Nov 2021 20:03:57 +0100 Subject: [PATCH 6/7] Change loop condition to i <= numPixels - 8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Günther Foidl --- src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 5903ba9a29..ca021ba9d2 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -129,7 +129,7 @@ public static void AddGreenToBlueAndRed(Span pixelData) { int numPixels = pixelData.Length; int i; - for (i = 0; i + 8 <= numPixels; i += 8) + for (i = 0; i <= numPixels - 8; i += 8) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector256 input = Unsafe.As>(ref pos).AsByte(); From 00d20b8ee55b0eb01297b36cbb9dee1f697df27c Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Tue, 16 Nov 2021 22:06:47 +1100 Subject: [PATCH 7/7] Use nint and rename scalar fallback --- .../Formats/Webp/Lossless/LosslessUtils.cs | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index ca021ba9d2..84b01846ba 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -128,7 +128,7 @@ public static void AddGreenToBlueAndRed(Span pixelData) if (Avx2.IsSupported) { int numPixels = pixelData.Length; - int i; + nint i; for (i = 0; i <= numPixels - 8; i += 8) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); @@ -140,13 +140,13 @@ public static void AddGreenToBlueAndRed(Span pixelData) if (i != numPixels) { - AddGreenToBlueAndRedNoneVectorized(pixelData.Slice(i)); + AddGreenToBlueAndRedScalar(pixelData.Slice((int)i)); } } else if (Ssse3.IsSupported) { int numPixels = pixelData.Length; - int i; + nint i; for (i = 0; i <= numPixels - 4; i += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); @@ -158,13 +158,13 @@ public static void AddGreenToBlueAndRed(Span pixelData) if (i != numPixels) { - AddGreenToBlueAndRedNoneVectorized(pixelData.Slice(i)); + AddGreenToBlueAndRedScalar(pixelData.Slice((int)i)); } } else if (Sse2.IsSupported) { int numPixels = pixelData.Length; - int i; + nint i; for (i = 0; i <= numPixels - 4; i += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); @@ -178,17 +178,17 @@ public static void AddGreenToBlueAndRed(Span pixelData) if (i != numPixels) { - AddGreenToBlueAndRedNoneVectorized(pixelData.Slice(i)); + AddGreenToBlueAndRedScalar(pixelData.Slice((int)i)); } } else #endif { - AddGreenToBlueAndRedNoneVectorized(pixelData); + AddGreenToBlueAndRedScalar(pixelData); } } - private static void AddGreenToBlueAndRedNoneVectorized(Span pixelData) + private static void AddGreenToBlueAndRedScalar(Span pixelData) { int numPixels = pixelData.Length; for (int i = 0; i < numPixels; i++) @@ -208,7 +208,7 @@ public static void SubtractGreenFromBlueAndRed(Span pixelData) if (Avx2.IsSupported) { int numPixels = pixelData.Length; - int i; + nint i; for (i = 0; i <= numPixels - 8; i += 8) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); @@ -220,13 +220,13 @@ public static void SubtractGreenFromBlueAndRed(Span pixelData) if (i != numPixels) { - SubtractGreenFromBlueAndRedNoneVectorized(pixelData.Slice(i)); + SubtractGreenFromBlueAndRedScalar(pixelData.Slice((int)i)); } } else if (Ssse3.IsSupported) { int numPixels = pixelData.Length; - int i; + nint i; for (i = 0; i <= numPixels - 4; i += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); @@ -238,13 +238,13 @@ public static void SubtractGreenFromBlueAndRed(Span pixelData) if (i != numPixels) { - SubtractGreenFromBlueAndRedNoneVectorized(pixelData.Slice(i)); + SubtractGreenFromBlueAndRedScalar(pixelData.Slice((int)i)); } } else if (Sse2.IsSupported) { int numPixels = pixelData.Length; - int i; + nint i; for (i = 0; i <= numPixels - 4; i += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); @@ -258,17 +258,17 @@ public static void SubtractGreenFromBlueAndRed(Span pixelData) if (i != numPixels) { - SubtractGreenFromBlueAndRedNoneVectorized(pixelData.Slice(i)); + SubtractGreenFromBlueAndRedScalar(pixelData.Slice((int)i)); } } else #endif { - SubtractGreenFromBlueAndRedNoneVectorized(pixelData); + SubtractGreenFromBlueAndRedScalar(pixelData); } } - private static void SubtractGreenFromBlueAndRedNoneVectorized(Span pixelData) + private static void SubtractGreenFromBlueAndRedScalar(Span pixelData) { int numPixels = pixelData.Length; for (int i = 0; i < numPixels; i++) @@ -401,7 +401,7 @@ public static void TransformColor(Vp8LMultipliers m, Span pixelData, int n Vector256 multsrb = MkCst32(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector256 multsb2 = MkCst32(Cst5b(m.RedToBlue), 0); - int idx; + nint idx; for (idx = 0; idx <= numPixels - 8; idx += 8) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); @@ -421,14 +421,14 @@ public static void TransformColor(Vp8LMultipliers m, Span pixelData, int n if (idx != numPixels) { - TransformColorNoneVectorized(m, pixelData.Slice(idx), numPixels - idx); + TransformColorScalar(m, pixelData.Slice((int)idx), numPixels - (int)idx); } } else if (Sse2.IsSupported) { Vector128 multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector128 multsb2 = MkCst16(Cst5b(m.RedToBlue), 0); - int idx; + nint idx; for (idx = 0; idx <= numPixels - 4; idx += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); @@ -448,17 +448,17 @@ public static void TransformColor(Vp8LMultipliers m, Span pixelData, int n if (idx != numPixels) { - TransformColorNoneVectorized(m, pixelData.Slice(idx), numPixels - idx); + TransformColorScalar(m, pixelData.Slice((int)idx), numPixels - (int)idx); } } else #endif { - TransformColorNoneVectorized(m, pixelData, numPixels); + TransformColorScalar(m, pixelData, numPixels); } } - private static void TransformColorNoneVectorized(Vp8LMultipliers m, Span data, int numPixels) + private static void TransformColorScalar(Vp8LMultipliers m, Span data, int numPixels) { for (int i = 0; i < numPixels; i++) { @@ -488,7 +488,7 @@ public static void TransformColorInverse(Vp8LMultipliers m, Span pixelData { Vector256 multsrb = MkCst32(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector256 multsb2 = MkCst32(Cst5b(m.RedToBlue), 0); - int idx; + nint idx; for (idx = 0; idx <= pixelData.Length - 8; idx += 8) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); @@ -509,7 +509,7 @@ public static void TransformColorInverse(Vp8LMultipliers m, Span pixelData if (idx != pixelData.Length) { - TransformColorInverseNoneVectorized(m, pixelData.Slice(idx)); + TransformColorInverseScalar(m, pixelData.Slice((int)idx)); } } else if (Sse2.IsSupported) @@ -517,7 +517,7 @@ public static void TransformColorInverse(Vp8LMultipliers m, Span pixelData Vector128 multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); Vector128 multsb2 = MkCst16(Cst5b(m.RedToBlue), 0); - int idx; + nint idx; for (idx = 0; idx <= pixelData.Length - 4; idx += 4) { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); @@ -538,17 +538,17 @@ public static void TransformColorInverse(Vp8LMultipliers m, Span pixelData if (idx != pixelData.Length) { - TransformColorInverseNoneVectorized(m, pixelData.Slice(idx)); + TransformColorInverseScalar(m, pixelData.Slice((int)idx)); } } else #endif { - TransformColorInverseNoneVectorized(m, pixelData); + TransformColorInverseScalar(m, pixelData); } } - private static void TransformColorInverseNoneVectorized(Vp8LMultipliers m, Span pixelData) + private static void TransformColorInverseScalar(Vp8LMultipliers m, Span pixelData) { for (int i = 0; i < pixelData.Length; i++) {