From 02d0a808c3d3ed406db7f7e7a7412e3567cce67b Mon Sep 17 00:00:00 2001 From: TechPizza Date: Sun, 9 May 2021 02:26:24 +0200 Subject: [PATCH 01/12] Vectorized PaethFilter --- .../Formats/Png/Filters/PaethFilter.cs | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs b/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs index fab6788061..7562c47558 100644 --- a/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs +++ b/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs @@ -2,6 +2,7 @@ // Licensed under the Apache License, Version 2.0. using System; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; @@ -82,6 +83,43 @@ public static void Encode(Span scanline, Span previousScanline, Span sum += Numerics.Abs(unchecked((sbyte)res)); } +#if SUPPORTS_RUNTIME_INTRINSICS + if (Vector.IsHardwareAccelerated) + { + Vector sumAccumulator = Vector.Zero; + + for (int xLeft = x - bytesPerPixel; x + Vector.Count <= scanline.Length; xLeft += Vector.Count) + { + var scan = new Vector(scanline.Slice(x)); + var left = new Vector(scanline.Slice(xLeft)); + var above = new Vector(previousScanline.Slice(x)); + var upperLeft = new Vector(previousScanline.Slice(xLeft)); + + Vector res = scan - PaethPredictor(left, above, upperLeft); + res.CopyTo(result.Slice(x + 1)); // + 1 to skip filter type + x += Vector.Count; + + Vector.Widen( + Vector.Abs(Vector.AsVectorSByte(res)), + out Vector shortLow, + out Vector shortHigh); + + Vector.Widen(shortLow, out Vector intLow, out Vector intHigh); + sumAccumulator += intLow; + sumAccumulator += intHigh; + + Vector.Widen(shortHigh, out intLow, out intHigh); + sumAccumulator += intLow; + sumAccumulator += intHigh; + } + + for (int i = 0; i < Vector.Count; i++) + { + sum += sumAccumulator[i]; + } + } +#endif + for (int xLeft = x - bytesPerPixel; x < scanline.Length; ++xLeft /* Note: ++x happens in the body to avoid one add operation */) { byte scan = Unsafe.Add(ref scanBaseRef, x); @@ -127,5 +165,36 @@ private static byte PaethPredictor(byte left, byte above, byte upperLeft) return upperLeft; } + + private static Vector PaethPredictor(Vector left, Vector above, Vector upperLeft) + { + Vector.Widen(left, out Vector a1, out Vector a2); + Vector.Widen(above, out Vector b1, out Vector b2); + Vector.Widen(upperLeft, out Vector c1, out Vector c2); + + Vector p1 = PaethPredictor(Vector.AsVectorInt16(a1), Vector.AsVectorInt16(b1), Vector.AsVectorInt16(c1)); + Vector p2 = PaethPredictor(Vector.AsVectorInt16(a2), Vector.AsVectorInt16(b2), Vector.AsVectorInt16(c2)); + return Vector.AsVectorByte(Vector.Narrow(p1, p2)); + } + + private static Vector PaethPredictor(Vector left, Vector above, Vector upperLeft) + { + Vector p = left + above - upperLeft; + var pa = Vector.Abs(p - left); + var pb = Vector.Abs(p - above); + var pc = Vector.Abs(p - upperLeft); + + var pa_pb = Vector.LessThanOrEqual(pa, pb); + var pa_pc = Vector.LessThanOrEqual(pa, pc); + var pb_pc = Vector.LessThanOrEqual(pb, pc); + + return Vector.ConditionalSelect( + condition: Vector.BitwiseAnd(pa_pb, pa_pc), + left: left, + right: Vector.ConditionalSelect( + condition: pb_pc, + left: above, + right: upperLeft)); + } } } From 78b6d78058f78ab9d5a10cf0fcef8685aac7dc93 Mon Sep 17 00:00:00 2001 From: TechPizza Date: Sat, 15 May 2021 19:10:46 +0200 Subject: [PATCH 02/12] Moved Accumulate to Numerics --- src/ImageSharp/Common/Helpers/Numerics.cs | 14 ++++++++++++++ .../Formats/Png/Filters/PaethFilter.cs | 19 ++++--------------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index 6105422372..0147689117 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -748,5 +748,19 @@ public static Vector256 Lerp( [MethodImpl(MethodImplOptions.AggressiveInlining)] public static float Lerp(float value1, float value2, float amount) => ((value2 - value1) * amount) + value1; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void Accumulate(ref Vector accumulator, Vector values) + { + Vector.Widen(values, out Vector shortLow, out Vector shortHigh); + + Vector.Widen(shortLow, out Vector intLow, out Vector intHigh); + accumulator += intLow; + accumulator += intHigh; + + Vector.Widen(shortHigh, out intLow, out intHigh); + accumulator += intLow; + accumulator += intHigh; + } } } diff --git a/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs b/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs index 7562c47558..6e7bb8fb1f 100644 --- a/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs +++ b/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs @@ -86,7 +86,7 @@ public static void Encode(Span scanline, Span previousScanline, Span #if SUPPORTS_RUNTIME_INTRINSICS if (Vector.IsHardwareAccelerated) { - Vector sumAccumulator = Vector.Zero; + Vector sumAccumulator = Vector.Zero; for (int xLeft = x - bytesPerPixel; x + Vector.Count <= scanline.Length; xLeft += Vector.Count) { @@ -99,23 +99,12 @@ public static void Encode(Span scanline, Span previousScanline, Span res.CopyTo(result.Slice(x + 1)); // + 1 to skip filter type x += Vector.Count; - Vector.Widen( - Vector.Abs(Vector.AsVectorSByte(res)), - out Vector shortLow, - out Vector shortHigh); - - Vector.Widen(shortLow, out Vector intLow, out Vector intHigh); - sumAccumulator += intLow; - sumAccumulator += intHigh; - - Vector.Widen(shortHigh, out intLow, out intHigh); - sumAccumulator += intLow; - sumAccumulator += intHigh; + Numerics.Accumulate(ref sumAccumulator, Vector.AsVectorByte(Vector.Abs(Vector.AsVectorSByte(res)))); } - for (int i = 0; i < Vector.Count; i++) + for (int i = 0; i < Vector.Count; i++) { - sum += sumAccumulator[i]; + sum += (int)sumAccumulator[i]; } } #endif From c16af90f79c1d2f6dbf610006f0a674557fedb0b Mon Sep 17 00:00:00 2001 From: TechPizza Date: Sat, 15 May 2021 19:11:09 +0200 Subject: [PATCH 03/12] Vectorized AverageFilter --- .../Formats/Png/Filters/AverageFilter.cs | 120 ++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/src/ImageSharp/Formats/Png/Filters/AverageFilter.cs b/src/ImageSharp/Formats/Png/Filters/AverageFilter.cs index d1c214e3d6..57416a737b 100644 --- a/src/ImageSharp/Formats/Png/Filters/AverageFilter.cs +++ b/src/ImageSharp/Formats/Png/Filters/AverageFilter.cs @@ -5,6 +5,11 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif + namespace SixLabors.ImageSharp.Formats.Png.Filters { /// @@ -79,6 +84,89 @@ public static void Encode(Span scanline, Span previousScanline, Span sum += Numerics.Abs(unchecked((sbyte)res)); } +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported) + { + Vector256 sumAccumulator = Vector256.Zero; + + for (int xLeft = x - bytesPerPixel; x + Vector256.Count <= scanline.Length; xLeft += Vector256.Count) + { + Vector256 scan = Unsafe.As>(ref Unsafe.Add(ref scanBaseRef, x)); + Vector256 left = Unsafe.As>(ref Unsafe.Add(ref scanBaseRef, xLeft)); + Vector256 above = Unsafe.As>(ref Unsafe.Add(ref prevBaseRef, x)); + + Vector256 res = Avx2.Subtract(scan, Average(left, above)); + Unsafe.As>(ref Unsafe.Add(ref resultBaseRef, x + 1)) = res; // +1 to skip filter type + x += Vector256.Count; + + Vector256 absRes = Avx2.Abs(res.AsSByte()).AsSByte(); + Vector256 loRes16 = Avx2.UnpackLow(absRes, Vector256.Zero).AsInt16(); + Vector256 hiRes16 = Avx2.UnpackHigh(absRes, Vector256.Zero).AsInt16(); + + Vector256 loRes32 = Avx2.UnpackLow(loRes16, Vector256.Zero).AsInt32(); + Vector256 hiRes32 = Avx2.UnpackHigh(loRes16, Vector256.Zero).AsInt32(); + sumAccumulator = Avx2.Add(sumAccumulator, loRes32); + sumAccumulator = Avx2.Add(sumAccumulator, hiRes32); + + loRes32 = Avx2.UnpackLow(hiRes16, Vector256.Zero).AsInt32(); + hiRes32 = Avx2.UnpackHigh(hiRes16, Vector256.Zero).AsInt32(); + sumAccumulator = Avx2.Add(sumAccumulator, loRes32); + sumAccumulator = Avx2.Add(sumAccumulator, hiRes32); + } + + for (int i = 0; i < Vector256.Count; i++) + { + sum += sumAccumulator.GetElement(i); + } + } + else if (Sse2.IsSupported) + { + var allBitsSet = Vector128.Create((sbyte)-1); + Vector128 sumAccumulator = Vector128.Zero; + + for (int xLeft = x - bytesPerPixel; x + Vector128.Count <= scanline.Length; xLeft += Vector128.Count) + { + Vector128 scan = Unsafe.As>(ref Unsafe.Add(ref scanBaseRef, x)); + Vector128 left = Unsafe.As>(ref Unsafe.Add(ref scanBaseRef, xLeft)); + Vector128 above = Unsafe.As>(ref Unsafe.Add(ref prevBaseRef, x)); + + Vector128 res = Sse2.Subtract(scan, Average(left, above)); + Unsafe.As>(ref Unsafe.Add(ref resultBaseRef, x + 1)) = res; // +1 to skip filter type + x += Vector128.Count; + + Vector128 absRes; + if (Ssse3.IsSupported) + { + absRes = Ssse3.Abs(res.AsSByte()).AsSByte(); + } + else + { + Vector128 mask = Sse2.CompareGreaterThan(res.AsSByte(), Vector128.Zero); + mask = Sse2.Xor(mask, allBitsSet); + absRes = Sse2.Xor(Sse2.Add(res.AsSByte(), mask), mask); + } + + Vector128 loRes16 = Sse2.UnpackLow(absRes, Vector128.Zero).AsInt16(); + Vector128 hiRes16 = Sse2.UnpackHigh(absRes, Vector128.Zero).AsInt16(); + + Vector128 loRes32 = Sse2.UnpackLow(loRes16, Vector128.Zero).AsInt32(); + Vector128 hiRes32 = Sse2.UnpackHigh(loRes16, Vector128.Zero).AsInt32(); + sumAccumulator = Sse2.Add(sumAccumulator, loRes32); + sumAccumulator = Sse2.Add(sumAccumulator, hiRes32); + + loRes32 = Sse2.UnpackLow(hiRes16, Vector128.Zero).AsInt32(); + hiRes32 = Sse2.UnpackHigh(hiRes16, Vector128.Zero).AsInt32(); + sumAccumulator = Sse2.Add(sumAccumulator, loRes32); + sumAccumulator = Sse2.Add(sumAccumulator, hiRes32); + } + + for (int i = 0; i < Vector128.Count; i++) + { + sum += sumAccumulator.GetElement(i); + } + } +#endif + for (int xLeft = x - bytesPerPixel; x < scanline.Length; ++xLeft /* Note: ++x happens in the body to avoid one add operation */) { byte scan = Unsafe.Add(ref scanBaseRef, x); @@ -101,5 +189,37 @@ public static void Encode(Span scanline, Span previousScanline, Span /// The [MethodImpl(MethodImplOptions.AggressiveInlining)] private static int Average(byte left, byte above) => (left + above) >> 1; + +#if SUPPORTS_RUNTIME_INTRINSICS + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 Average(Vector128 left, Vector128 above) + { + Vector128 loLeft16 = Sse2.UnpackLow(left, Vector128.Zero).AsUInt16(); + Vector128 hiLeft16 = Sse2.UnpackHigh(left, Vector128.Zero).AsUInt16(); + + Vector128 loAbove16 = Sse2.UnpackLow(above, Vector128.Zero).AsUInt16(); + Vector128 hiAbove16 = Sse2.UnpackHigh(above, Vector128.Zero).AsUInt16(); + + Vector128 div1 = Sse2.ShiftRightLogical(Sse2.Add(loLeft16, loAbove16), 1); + Vector128 div2 = Sse2.ShiftRightLogical(Sse2.Add(hiLeft16, hiAbove16), 1); + + return Sse2.PackUnsignedSaturate(div1.AsInt16(), div2.AsInt16()); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 Average(Vector256 left, Vector256 above) + { + Vector256 loLeft16 = Avx2.UnpackLow(left, Vector256.Zero).AsUInt16(); + Vector256 hiLeft16 = Avx2.UnpackHigh(left, Vector256.Zero).AsUInt16(); + + Vector256 loAbove16 = Avx2.UnpackLow(above, Vector256.Zero).AsUInt16(); + Vector256 hiAbove16 = Avx2.UnpackHigh(above, Vector256.Zero).AsUInt16(); + + Vector256 div1 = Avx2.ShiftRightLogical(Avx2.Add(loLeft16, loAbove16), 1); + Vector256 div2 = Avx2.ShiftRightLogical(Avx2.Add(hiLeft16, hiAbove16), 1); + + return Avx2.PackUnsignedSaturate(div1.AsInt16(), div2.AsInt16()); + } +#endif } } From 514d23098276b48847e8658aeb57c5ffb195c1fc Mon Sep 17 00:00:00 2001 From: TechPizza Date: Sat, 15 May 2021 19:11:21 +0200 Subject: [PATCH 04/12] Vectorized SubFilter --- .../Formats/Png/Filters/SubFilter.cs | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/ImageSharp/Formats/Png/Filters/SubFilter.cs b/src/ImageSharp/Formats/Png/Filters/SubFilter.cs index cb4cfb471f..31d65995a0 100644 --- a/src/ImageSharp/Formats/Png/Filters/SubFilter.cs +++ b/src/ImageSharp/Formats/Png/Filters/SubFilter.cs @@ -2,6 +2,7 @@ // Licensed under the Apache License, Version 2.0. using System; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; @@ -64,6 +65,30 @@ public static void Encode(Span scanline, Span result, int bytesPerPi sum += Numerics.Abs(unchecked((sbyte)res)); } +#if SUPPORTS_RUNTIME_INTRINSICS + if (Vector.IsHardwareAccelerated) + { + Vector sumAccumulator = Vector.Zero; + + for (int xLeft = x - bytesPerPixel; x + Vector.Count <= scanline.Length; xLeft += Vector.Count) + { + Vector scan = Unsafe.As>(ref Unsafe.Add(ref scanBaseRef, x)); + Vector prev = Unsafe.As>(ref Unsafe.Add(ref scanBaseRef, xLeft)); + + Vector res = scan - prev; + Unsafe.As>(ref Unsafe.Add(ref resultBaseRef, x + 1)) = res; // +1 to skip filter type + x += Vector.Count; + + Numerics.Accumulate(ref sumAccumulator, Vector.AsVectorByte(Vector.Abs(Vector.AsVectorSByte(res)))); + } + + for (int i = 0; i < Vector.Count; i++) + { + sum += (int)sumAccumulator[i]; + } + } +#endif + for (int xLeft = x - bytesPerPixel; x < scanline.Length; ++xLeft /* Note: ++x happens in the body to avoid one add operation */) { byte scan = Unsafe.Add(ref scanBaseRef, x); From 425e4876fa9a876ffebc8640d7cc75977a682c98 Mon Sep 17 00:00:00 2001 From: TechPizza Date: Sat, 15 May 2021 19:11:28 +0200 Subject: [PATCH 05/12] Vectorized UpFilter --- .../Formats/Png/Filters/UpFilter.cs | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Formats/Png/Filters/UpFilter.cs b/src/ImageSharp/Formats/Png/Filters/UpFilter.cs index cf553cbb68..f119c2fbae 100644 --- a/src/ImageSharp/Formats/Png/Filters/UpFilter.cs +++ b/src/ImageSharp/Formats/Png/Filters/UpFilter.cs @@ -1,7 +1,8 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. using System; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; @@ -57,7 +58,33 @@ public static void Encode(Span scanline, Span previousScanline, Span // Up(x) = Raw(x) - Prior(x) resultBaseRef = 2; - for (int x = 0; x < scanline.Length; /* Note: ++x happens in the body to avoid one add operation */) + int x = 0; + +#if SUPPORTS_RUNTIME_INTRINSICS + if (Vector.IsHardwareAccelerated) + { + Vector sumAccumulator = Vector.Zero; + + for (; x + Vector.Count <= scanline.Length;) + { + Vector scan = Unsafe.As>(ref Unsafe.Add(ref scanBaseRef, x)); + Vector above = Unsafe.As>(ref Unsafe.Add(ref prevBaseRef, x)); + + Vector res = scan - above; + Unsafe.As>(ref Unsafe.Add(ref resultBaseRef, x + 1)) = res; // +1 to skip filter type + x += Vector.Count; + + Numerics.Accumulate(ref sumAccumulator, Vector.AsVectorByte(Vector.Abs(Vector.AsVectorSByte(res)))); + } + + for (int i = 0; i < Vector.Count; i++) + { + sum += (int)sumAccumulator[i]; + } + } +#endif + + for (; x < scanline.Length; /* Note: ++x happens in the body to avoid one add operation */) { byte scan = Unsafe.Add(ref scanBaseRef, x); byte above = Unsafe.Add(ref prevBaseRef, x); From 41b773ac0211fda9991a0d06ccc79b80d10d10f3 Mon Sep 17 00:00:00 2001 From: TechPizza Date: Sat, 15 May 2021 19:14:04 +0200 Subject: [PATCH 06/12] Made PaethFilter use unsafe loads --- src/ImageSharp/Formats/Png/Filters/PaethFilter.cs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs b/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs index 6e7bb8fb1f..05ecc74a7d 100644 --- a/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs +++ b/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs @@ -90,13 +90,13 @@ public static void Encode(Span scanline, Span previousScanline, Span for (int xLeft = x - bytesPerPixel; x + Vector.Count <= scanline.Length; xLeft += Vector.Count) { - var scan = new Vector(scanline.Slice(x)); - var left = new Vector(scanline.Slice(xLeft)); - var above = new Vector(previousScanline.Slice(x)); - var upperLeft = new Vector(previousScanline.Slice(xLeft)); + Vector scan = Unsafe.As>(ref Unsafe.Add(ref scanBaseRef, x)); + Vector left = Unsafe.As>(ref Unsafe.Add(ref scanBaseRef, xLeft)); + Vector above = Unsafe.As>(ref Unsafe.Add(ref prevBaseRef, x)); + Vector upperLeft = Unsafe.As>(ref Unsafe.Add(ref prevBaseRef, xLeft)); Vector res = scan - PaethPredictor(left, above, upperLeft); - res.CopyTo(result.Slice(x + 1)); // + 1 to skip filter type + Unsafe.As>(ref Unsafe.Add(ref resultBaseRef, x + 1)) = res; // +1 to skip filter type x += Vector.Count; Numerics.Accumulate(ref sumAccumulator, Vector.AsVectorByte(Vector.Abs(Vector.AsVectorSByte(res)))); From 29250ffbec1cd7b62857fae51a00658c5ac32652 Mon Sep 17 00:00:00 2001 From: TechPizza Date: Sun, 16 May 2021 16:50:16 +0200 Subject: [PATCH 07/12] Added Png filter tests --- .../Formats/Png/PngFilterTests.cs | 270 ++++++++++++++++++ .../Formats/Png/ReferenceImplementations.cs | 229 +++++++++++++++ 2 files changed, 499 insertions(+) create mode 100644 tests/ImageSharp.Tests/Formats/Png/PngFilterTests.cs create mode 100644 tests/ImageSharp.Tests/Formats/Png/ReferenceImplementations.cs diff --git a/tests/ImageSharp.Tests/Formats/Png/PngFilterTests.cs b/tests/ImageSharp.Tests/Formats/Png/PngFilterTests.cs new file mode 100644 index 0000000000..dae8f25e58 --- /dev/null +++ b/tests/ImageSharp.Tests/Formats/Png/PngFilterTests.cs @@ -0,0 +1,270 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +// Uncomment this to turn unit tests into benchmarks: +// #define BENCHMARKING +using System; + +using SixLabors.ImageSharp.Formats.Png; +using SixLabors.ImageSharp.Formats.Png.Filters; +using SixLabors.ImageSharp.Tests.Formats.Png.Utils; +using SixLabors.ImageSharp.Tests.TestUtilities; +using Xunit; +using Xunit.Abstractions; + +namespace SixLabors.ImageSharp.Tests.Formats.Png +{ + [Trait("Format", "Png")] + public partial class PngFilterTests : MeasureFixture + { +#if BENCHMARKING + public const int Times = 1000000; +#else + public const int Times = 1; +#endif + + public PngFilterTests(ITestOutputHelper output) + : base(output) + { + } + + public const int Size = 64; + + [Fact] + public void Average() + { + static void RunTest() + { + var data = new TestData(PngFilterMethod.Average, Size); + data.TestFilter(); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + HwIntrinsics.DisableSIMD); + } + + [Fact] + public void AverageSse2() + { + static void RunTest() + { + var data = new TestData(PngFilterMethod.Average, Size); + data.TestFilter(); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSSE3); + } + + [Fact] + public void AverageSsse3() + { + static void RunTest() + { + var data = new TestData(PngFilterMethod.Average, Size); + data.TestFilter(); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); + } + + [Fact] + public void AverageAvx2() + { + static void RunTest() + { + var data = new TestData(PngFilterMethod.Average, Size); + data.TestFilter(); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + HwIntrinsics.AllowAll); + } + + [Fact] + public void Paeth() + { + static void RunTest() + { + var data = new TestData(PngFilterMethod.Paeth, Size); + data.TestFilter(); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + HwIntrinsics.DisableSIMD); + } + + [Fact] + public void PaethSimd() + { + static void RunTest() + { + var data = new TestData(PngFilterMethod.Paeth, Size); + data.TestFilter(); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + HwIntrinsics.AllowAll); + } + + [Fact] + public void Up() + { + static void RunTest() + { + var data = new TestData(PngFilterMethod.Up, Size); + data.TestFilter(); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + HwIntrinsics.DisableSIMD); + } + + [Fact] + public void UpSimd() + { + static void RunTest() + { + var data = new TestData(PngFilterMethod.Up, Size); + data.TestFilter(); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + HwIntrinsics.AllowAll); + } + + [Fact] + public void Sub() + { + static void RunTest() + { + var data = new TestData(PngFilterMethod.Sub, Size); + data.TestFilter(); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + HwIntrinsics.DisableSIMD); + } + + [Fact] + public void SubSimd() + { + static void RunTest() + { + var data = new TestData(PngFilterMethod.Sub, Size); + data.TestFilter(); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + HwIntrinsics.AllowAll); + } + + public class TestData + { + private readonly PngFilterMethod filter; + private readonly int bpp; + private readonly byte[] previousScanline; + private readonly byte[] scanline; + private readonly byte[] expectedResult; + private readonly int expectedSum; + private readonly byte[] resultBuffer; + + public TestData(PngFilterMethod filter, int size, int bpp = 4) + { + this.filter = filter; + this.bpp = bpp; + this.previousScanline = new byte[size * size * bpp]; + this.scanline = new byte[size * size * bpp]; + this.expectedResult = new byte[1 + (size * size * bpp)]; + this.resultBuffer = new byte[1 + (size * size * bpp)]; + + var rng = new Random(12345678); + byte[] tmp = new byte[6]; + for (int i = 0; i < this.previousScanline.Length; i += bpp) + { + rng.NextBytes(tmp); + + this.previousScanline[i + 0] = tmp[0]; + this.previousScanline[i + 1] = tmp[1]; + this.previousScanline[i + 2] = tmp[2]; + this.previousScanline[i + 3] = 255; + + this.scanline[i + 0] = tmp[3]; + this.scanline[i + 1] = tmp[4]; + this.scanline[i + 2] = tmp[5]; + this.scanline[i + 3] = 255; + } + + switch (this.filter) + { + case PngFilterMethod.Sub: + ReferenceImplementations.EncodeSubFilter( + this.scanline, this.expectedResult, this.bpp, out this.expectedSum); + break; + + case PngFilterMethod.Up: + ReferenceImplementations.EncodeUpFilter( + this.previousScanline, this.scanline, this.expectedResult, out this.expectedSum); + break; + + case PngFilterMethod.Average: + ReferenceImplementations.EncodeAverageFilter( + this.previousScanline, this.scanline, this.expectedResult, this.bpp, out this.expectedSum); + break; + + case PngFilterMethod.Paeth: + ReferenceImplementations.EncodePaethFilter( + this.previousScanline, this.scanline, this.expectedResult, this.bpp, out this.expectedSum); + break; + + case PngFilterMethod.None: + case PngFilterMethod.Adaptive: + default: + throw new InvalidOperationException(); + } + } + + public void TestFilter() + { + int sum; + switch (this.filter) + { + case PngFilterMethod.Sub: + SubFilter.Encode(this.scanline, this.resultBuffer, this.bpp, out sum); + break; + + case PngFilterMethod.Up: + UpFilter.Encode(this.previousScanline, this.scanline, this.resultBuffer, out sum); + break; + + case PngFilterMethod.Average: + AverageFilter.Encode(this.previousScanline, this.scanline, this.resultBuffer, this.bpp, out sum); + break; + + case PngFilterMethod.Paeth: + PaethFilter.Encode(this.previousScanline, this.scanline, this.resultBuffer, this.bpp, out sum); + break; + + case PngFilterMethod.None: + case PngFilterMethod.Adaptive: + default: + throw new InvalidOperationException(); + } + + Assert.Equal(this.expectedSum, sum); + Assert.Equal(this.expectedResult, this.resultBuffer); + } + } + } +} diff --git a/tests/ImageSharp.Tests/Formats/Png/ReferenceImplementations.cs b/tests/ImageSharp.Tests/Formats/Png/ReferenceImplementations.cs new file mode 100644 index 0000000000..dd8ecc096d --- /dev/null +++ b/tests/ImageSharp.Tests/Formats/Png/ReferenceImplementations.cs @@ -0,0 +1,229 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +// ReSharper disable InconsistentNaming +namespace SixLabors.ImageSharp.Tests.Formats.Png.Utils +{ + /// + /// This class contains reference implementations to produce verification data for unit tests + /// + internal static partial class ReferenceImplementations + { + /// + /// Encodes the scanline + /// + /// The scanline to encode + /// The previous scanline. + /// The filtered scanline result. + /// The bytes per pixel. + /// The sum of the total variance of the filtered row + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void EncodePaethFilter(Span scanline, Span previousScanline, Span result, int bytesPerPixel, out int sum) + { + DebugGuard.MustBeSameSized(scanline, previousScanline, nameof(scanline)); + DebugGuard.MustBeSizedAtLeast(result, scanline, nameof(result)); + + ref byte scanBaseRef = ref MemoryMarshal.GetReference(scanline); + ref byte prevBaseRef = ref MemoryMarshal.GetReference(previousScanline); + ref byte resultBaseRef = ref MemoryMarshal.GetReference(result); + sum = 0; + + // Paeth(x) = Raw(x) - PaethPredictor(Raw(x-bpp), Prior(x), Prior(x - bpp)) + resultBaseRef = 4; + + int x = 0; + for (; x < bytesPerPixel; /* Note: ++x happens in the body to avoid one add operation */) + { + byte scan = Unsafe.Add(ref scanBaseRef, x); + byte above = Unsafe.Add(ref prevBaseRef, x); + ++x; + ref byte res = ref Unsafe.Add(ref resultBaseRef, x); + res = (byte)(scan - PaethPredictor(0, above, 0)); + sum += Numerics.Abs(unchecked((sbyte)res)); + } + + for (int xLeft = x - bytesPerPixel; x < scanline.Length; ++xLeft /* Note: ++x happens in the body to avoid one add operation */) + { + byte scan = Unsafe.Add(ref scanBaseRef, x); + byte left = Unsafe.Add(ref scanBaseRef, xLeft); + byte above = Unsafe.Add(ref prevBaseRef, x); + byte upperLeft = Unsafe.Add(ref prevBaseRef, xLeft); + ++x; + ref byte res = ref Unsafe.Add(ref resultBaseRef, x); + res = (byte)(scan - PaethPredictor(left, above, upperLeft)); + sum += Numerics.Abs(unchecked((sbyte)res)); + } + + sum -= 4; + } + + /// + /// Encodes the scanline + /// + /// The scanline to encode + /// The filtered scanline result. + /// The bytes per pixel. + /// The sum of the total variance of the filtered row + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void EncodeSubFilter(Span scanline, Span result, int bytesPerPixel, out int sum) + { + DebugGuard.MustBeSizedAtLeast(result, scanline, nameof(result)); + + ref byte scanBaseRef = ref MemoryMarshal.GetReference(scanline); + ref byte resultBaseRef = ref MemoryMarshal.GetReference(result); + sum = 0; + + // Sub(x) = Raw(x) - Raw(x-bpp) + resultBaseRef = 1; + + int x = 0; + for (; x < bytesPerPixel; /* Note: ++x happens in the body to avoid one add operation */) + { + byte scan = Unsafe.Add(ref scanBaseRef, x); + ++x; + ref byte res = ref Unsafe.Add(ref resultBaseRef, x); + res = scan; + sum += Numerics.Abs(unchecked((sbyte)res)); + } + + for (int xLeft = x - bytesPerPixel; x < scanline.Length; ++xLeft /* Note: ++x happens in the body to avoid one add operation */) + { + byte scan = Unsafe.Add(ref scanBaseRef, x); + byte prev = Unsafe.Add(ref scanBaseRef, xLeft); + ++x; + ref byte res = ref Unsafe.Add(ref resultBaseRef, x); + res = (byte)(scan - prev); + sum += Numerics.Abs(unchecked((sbyte)res)); + } + + sum -= 1; + } + + /// + /// Encodes the scanline + /// + /// The scanline to encode + /// The previous scanline. + /// The filtered scanline result. + /// The sum of the total variance of the filtered row + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void EncodeUpFilter(Span scanline, Span previousScanline, Span result, out int sum) + { + DebugGuard.MustBeSameSized(scanline, previousScanline, nameof(scanline)); + DebugGuard.MustBeSizedAtLeast(result, scanline, nameof(result)); + + ref byte scanBaseRef = ref MemoryMarshal.GetReference(scanline); + ref byte prevBaseRef = ref MemoryMarshal.GetReference(previousScanline); + ref byte resultBaseRef = ref MemoryMarshal.GetReference(result); + sum = 0; + + // Up(x) = Raw(x) - Prior(x) + resultBaseRef = 2; + + int x = 0; + + for (; x < scanline.Length; /* Note: ++x happens in the body to avoid one add operation */) + { + byte scan = Unsafe.Add(ref scanBaseRef, x); + byte above = Unsafe.Add(ref prevBaseRef, x); + ++x; + ref byte res = ref Unsafe.Add(ref resultBaseRef, x); + res = (byte)(scan - above); + sum += Numerics.Abs(unchecked((sbyte)res)); + } + + sum -= 2; + } + + /// + /// Encodes the scanline + /// + /// The scanline to encode + /// The previous scanline. + /// The filtered scanline result. + /// The bytes per pixel. + /// The sum of the total variance of the filtered row + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void EncodeAverageFilter(Span scanline, Span previousScanline, Span result, int bytesPerPixel, out int sum) + { + DebugGuard.MustBeSameSized(scanline, previousScanline, nameof(scanline)); + DebugGuard.MustBeSizedAtLeast(result, scanline, nameof(result)); + + ref byte scanBaseRef = ref MemoryMarshal.GetReference(scanline); + ref byte prevBaseRef = ref MemoryMarshal.GetReference(previousScanline); + ref byte resultBaseRef = ref MemoryMarshal.GetReference(result); + sum = 0; + + // Average(x) = Raw(x) - floor((Raw(x-bpp)+Prior(x))/2) + resultBaseRef = 3; + + int x = 0; + for (; x < bytesPerPixel; /* Note: ++x happens in the body to avoid one add operation */) + { + byte scan = Unsafe.Add(ref scanBaseRef, x); + byte above = Unsafe.Add(ref prevBaseRef, x); + ++x; + ref byte res = ref Unsafe.Add(ref resultBaseRef, x); + res = (byte)(scan - (above >> 1)); + sum += Numerics.Abs(unchecked((sbyte)res)); + } + + for (int xLeft = x - bytesPerPixel; x < scanline.Length; ++xLeft /* Note: ++x happens in the body to avoid one add operation */) + { + byte scan = Unsafe.Add(ref scanBaseRef, x); + byte left = Unsafe.Add(ref scanBaseRef, xLeft); + byte above = Unsafe.Add(ref prevBaseRef, x); + ++x; + ref byte res = ref Unsafe.Add(ref resultBaseRef, x); + res = (byte)(scan - Average(left, above)); + sum += Numerics.Abs(unchecked((sbyte)res)); + } + + sum -= 3; + } + + /// + /// Calculates the average value of two bytes + /// + /// The left byte + /// The above byte + /// The + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int Average(byte left, byte above) => (left + above) >> 1; + + /// + /// Computes a simple linear function of the three neighboring pixels (left, above, upper left), then chooses + /// as predictor the neighboring pixel closest to the computed value. + /// + /// The left neighbor pixel. + /// The above neighbor pixel. + /// The upper left neighbor pixel. + /// + /// The . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static byte PaethPredictor(byte left, byte above, byte upperLeft) + { + int p = left + above - upperLeft; + int pa = Numerics.Abs(p - left); + int pb = Numerics.Abs(p - above); + int pc = Numerics.Abs(p - upperLeft); + + if (pa <= pb && pa <= pc) + { + return left; + } + + if (pb <= pc) + { + return above; + } + + return upperLeft; + } + } +} From 64e082615a4ad3e1ca2a8b591b793e52e6e6b8f8 Mon Sep 17 00:00:00 2001 From: TechPizza Date: Tue, 18 May 2021 09:50:26 +0200 Subject: [PATCH 08/12] Optimized AverageFilter --- .../Formats/Png/Filters/AverageFilter.cs | 45 ++++--------------- 1 file changed, 9 insertions(+), 36 deletions(-) diff --git a/src/ImageSharp/Formats/Png/Filters/AverageFilter.cs b/src/ImageSharp/Formats/Png/Filters/AverageFilter.cs index 57416a737b..b596643622 100644 --- a/src/ImageSharp/Formats/Png/Filters/AverageFilter.cs +++ b/src/ImageSharp/Formats/Png/Filters/AverageFilter.cs @@ -88,6 +88,7 @@ public static void Encode(Span scanline, Span previousScanline, Span if (Avx2.IsSupported) { Vector256 sumAccumulator = Vector256.Zero; + Vector256 allBitsSet = Avx2.CompareEqual(sumAccumulator, sumAccumulator).AsByte(); for (int xLeft = x - bytesPerPixel; x + Vector256.Count <= scanline.Length; xLeft += Vector256.Count) { @@ -95,7 +96,9 @@ public static void Encode(Span scanline, Span previousScanline, Span Vector256 left = Unsafe.As>(ref Unsafe.Add(ref scanBaseRef, xLeft)); Vector256 above = Unsafe.As>(ref Unsafe.Add(ref prevBaseRef, x)); - Vector256 res = Avx2.Subtract(scan, Average(left, above)); + Vector256 avg = Avx2.Xor(Avx2.Average(Avx2.Xor(left, allBitsSet), Avx2.Xor(above, allBitsSet)), allBitsSet); + Vector256 res = Avx2.Subtract(scan, avg); + Unsafe.As>(ref Unsafe.Add(ref resultBaseRef, x + 1)) = res; // +1 to skip filter type x += Vector256.Count; @@ -121,8 +124,8 @@ public static void Encode(Span scanline, Span previousScanline, Span } else if (Sse2.IsSupported) { - var allBitsSet = Vector128.Create((sbyte)-1); Vector128 sumAccumulator = Vector128.Zero; + Vector128 allBitsSet = Sse2.CompareEqual(sumAccumulator, sumAccumulator).AsByte(); for (int xLeft = x - bytesPerPixel; x + Vector128.Count <= scanline.Length; xLeft += Vector128.Count) { @@ -130,7 +133,9 @@ public static void Encode(Span scanline, Span previousScanline, Span Vector128 left = Unsafe.As>(ref Unsafe.Add(ref scanBaseRef, xLeft)); Vector128 above = Unsafe.As>(ref Unsafe.Add(ref prevBaseRef, x)); - Vector128 res = Sse2.Subtract(scan, Average(left, above)); + Vector128 avg = Sse2.Xor(Sse2.Average(Sse2.Xor(left, allBitsSet), Sse2.Xor(above, allBitsSet)), allBitsSet); + Vector128 res = Sse2.Subtract(scan, avg); + Unsafe.As>(ref Unsafe.Add(ref resultBaseRef, x + 1)) = res; // +1 to skip filter type x += Vector128.Count; @@ -142,7 +147,7 @@ public static void Encode(Span scanline, Span previousScanline, Span else { Vector128 mask = Sse2.CompareGreaterThan(res.AsSByte(), Vector128.Zero); - mask = Sse2.Xor(mask, allBitsSet); + mask = Sse2.Xor(mask, allBitsSet.AsSByte()); absRes = Sse2.Xor(Sse2.Add(res.AsSByte(), mask), mask); } @@ -189,37 +194,5 @@ public static void Encode(Span scanline, Span previousScanline, Span /// The [MethodImpl(MethodImplOptions.AggressiveInlining)] private static int Average(byte left, byte above) => (left + above) >> 1; - -#if SUPPORTS_RUNTIME_INTRINSICS - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector128 Average(Vector128 left, Vector128 above) - { - Vector128 loLeft16 = Sse2.UnpackLow(left, Vector128.Zero).AsUInt16(); - Vector128 hiLeft16 = Sse2.UnpackHigh(left, Vector128.Zero).AsUInt16(); - - Vector128 loAbove16 = Sse2.UnpackLow(above, Vector128.Zero).AsUInt16(); - Vector128 hiAbove16 = Sse2.UnpackHigh(above, Vector128.Zero).AsUInt16(); - - Vector128 div1 = Sse2.ShiftRightLogical(Sse2.Add(loLeft16, loAbove16), 1); - Vector128 div2 = Sse2.ShiftRightLogical(Sse2.Add(hiLeft16, hiAbove16), 1); - - return Sse2.PackUnsignedSaturate(div1.AsInt16(), div2.AsInt16()); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector256 Average(Vector256 left, Vector256 above) - { - Vector256 loLeft16 = Avx2.UnpackLow(left, Vector256.Zero).AsUInt16(); - Vector256 hiLeft16 = Avx2.UnpackHigh(left, Vector256.Zero).AsUInt16(); - - Vector256 loAbove16 = Avx2.UnpackLow(above, Vector256.Zero).AsUInt16(); - Vector256 hiAbove16 = Avx2.UnpackHigh(above, Vector256.Zero).AsUInt16(); - - Vector256 div1 = Avx2.ShiftRightLogical(Avx2.Add(loLeft16, loAbove16), 1); - Vector256 div2 = Avx2.ShiftRightLogical(Avx2.Add(hiLeft16, hiAbove16), 1); - - return Avx2.PackUnsignedSaturate(div1.AsInt16(), div2.AsInt16()); - } -#endif } } From d7f02bc23cdd5a0e6fb43ab89a074180d6aa8719 Mon Sep 17 00:00:00 2001 From: TechPizza Date: Tue, 18 May 2021 09:50:39 +0200 Subject: [PATCH 09/12] Greatly optimized PaethFilter --- .../Formats/Png/Filters/PaethFilter.cs | 64 ++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs b/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs index 05ecc74a7d..7fa8a6b745 100644 --- a/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs +++ b/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs @@ -6,6 +6,11 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif + namespace SixLabors.ImageSharp.Formats.Png.Filters { /// @@ -84,7 +89,30 @@ public static void Encode(Span scanline, Span previousScanline, Span } #if SUPPORTS_RUNTIME_INTRINSICS - if (Vector.IsHardwareAccelerated) + if (Avx2.IsSupported) + { + Vector256 sumAccumulator = Vector256.Zero; + + for (int xLeft = x - bytesPerPixel; x + Vector256.Count <= scanline.Length; xLeft += Vector256.Count) + { + Vector256 scan = Unsafe.As>(ref Unsafe.Add(ref scanBaseRef, x)); + Vector256 left = Unsafe.As>(ref Unsafe.Add(ref scanBaseRef, xLeft)); + Vector256 above = Unsafe.As>(ref Unsafe.Add(ref prevBaseRef, x)); + Vector256 upperLeft = Unsafe.As>(ref Unsafe.Add(ref prevBaseRef, xLeft)); + + Vector256 res = Avx2.Subtract(scan, PaethPredictor(left, above, upperLeft)); + Unsafe.As>(ref Unsafe.Add(ref resultBaseRef, x + 1)) = res; // +1 to skip filter type + x += Vector256.Count; + + sumAccumulator = Avx2.Add(sumAccumulator, Avx2.SumAbsoluteDifferences(Avx2.Abs(res.AsSByte()), Vector256.Zero).AsInt32()); + } + + for (int i = 0; i < Vector256.Count; i++) + { + sum += sumAccumulator.GetElement(i); + } + } + else if (Vector.IsHardwareAccelerated) { Vector sumAccumulator = Vector.Zero; @@ -155,6 +183,39 @@ private static byte PaethPredictor(byte left, byte above, byte upperLeft) return upperLeft; } +#if SUPPORTS_RUNTIME_INTRINSICS + private static Vector256 PaethPredictor(Vector256 left, Vector256 above, Vector256 upleft) + { + Vector256 zero = Vector256.Zero; + + // Here, we refactor pa = abs(p - left) = abs(left + above - upleft - left) + // to pa = abs(above - upleft). Same deal for pb. + // Using saturated subtraction, if the result is negative, the output is zero. + // If we subtract in both directions and `or` the results, only one can be + // non-zero, so we end up with the absolute value. + Vector256 sac = Avx2.SubtractSaturate(above, upleft); + Vector256 sbc = Avx2.SubtractSaturate(left, upleft); + Vector256 pa = Avx2.Or(Avx2.SubtractSaturate(upleft, above), sac); + Vector256 pb = Avx2.Or(Avx2.SubtractSaturate(upleft, left), sbc); + + // pc = abs(left + above - upleft - upleft), or abs(left - upleft + above - upleft). + // We've already calculated left - upleft and above - upleft in `sac` and `sbc`. + // If they are both negative or both positive, the absolute value of their + // sum can't possibly be less than `pa` or `pb`, so we'll never use the value. + // We make a mask that sets the value to 255 if they either both got + // saturated to zero or both didn't. Then we calculate the absolute value + // of their difference using saturated subtract and `or`, same as before, + // keeping the value only where the mask isn't set. + Vector256 pm = Avx2.CompareEqual(Avx2.CompareEqual(sac, zero), Avx2.CompareEqual(sbc, zero)); + Vector256 pc = Avx2.Or(pm, Avx2.Or(Avx2.SubtractSaturate(pb, pa), Avx2.SubtractSaturate(pa, pb))); + + // Finally, blend the values together. We start with `upleft` and overwrite on + // tied values so that the `left`, `above`, `upleft` precedence is preserved. + Vector256 minbc = Avx2.Min(pc, pb); + Vector256 resbc = Avx2.BlendVariable(upleft, above, Avx2.CompareEqual(minbc, pb)); + return Avx2.BlendVariable(resbc, left, Avx2.CompareEqual(Avx2.Min(minbc, pa), pa)); + } + private static Vector PaethPredictor(Vector left, Vector above, Vector upperLeft) { Vector.Widen(left, out Vector a1, out Vector a2); @@ -185,5 +246,6 @@ private static Vector PaethPredictor(Vector left, Vector ab left: above, right: upperLeft)); } +#endif } } From 9d04ec8274f5cec35aa0c12d8e741652e7fbd341 Mon Sep 17 00:00:00 2001 From: TechPizza Date: Tue, 18 May 2021 10:02:28 +0200 Subject: [PATCH 10/12] Small intrinsics cleanup --- .../Formats/Png/Filters/AverageFilter.cs | 31 +++++++------------ .../Formats/Png/Filters/PaethFilter.cs | 3 +- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/src/ImageSharp/Formats/Png/Filters/AverageFilter.cs b/src/ImageSharp/Formats/Png/Filters/AverageFilter.cs index b596643622..818119f331 100644 --- a/src/ImageSharp/Formats/Png/Filters/AverageFilter.cs +++ b/src/ImageSharp/Formats/Png/Filters/AverageFilter.cs @@ -87,6 +87,7 @@ public static void Encode(Span scanline, Span previousScanline, Span #if SUPPORTS_RUNTIME_INTRINSICS if (Avx2.IsSupported) { + Vector256 zero = Vector256.Zero; Vector256 sumAccumulator = Vector256.Zero; Vector256 allBitsSet = Avx2.CompareEqual(sumAccumulator, sumAccumulator).AsByte(); @@ -102,19 +103,7 @@ public static void Encode(Span scanline, Span previousScanline, Span Unsafe.As>(ref Unsafe.Add(ref resultBaseRef, x + 1)) = res; // +1 to skip filter type x += Vector256.Count; - Vector256 absRes = Avx2.Abs(res.AsSByte()).AsSByte(); - Vector256 loRes16 = Avx2.UnpackLow(absRes, Vector256.Zero).AsInt16(); - Vector256 hiRes16 = Avx2.UnpackHigh(absRes, Vector256.Zero).AsInt16(); - - Vector256 loRes32 = Avx2.UnpackLow(loRes16, Vector256.Zero).AsInt32(); - Vector256 hiRes32 = Avx2.UnpackHigh(loRes16, Vector256.Zero).AsInt32(); - sumAccumulator = Avx2.Add(sumAccumulator, loRes32); - sumAccumulator = Avx2.Add(sumAccumulator, hiRes32); - - loRes32 = Avx2.UnpackLow(hiRes16, Vector256.Zero).AsInt32(); - hiRes32 = Avx2.UnpackHigh(hiRes16, Vector256.Zero).AsInt32(); - sumAccumulator = Avx2.Add(sumAccumulator, loRes32); - sumAccumulator = Avx2.Add(sumAccumulator, hiRes32); + sumAccumulator = Avx2.Add(sumAccumulator, Avx2.SumAbsoluteDifferences(Avx2.Abs(res.AsSByte()), zero).AsInt32()); } for (int i = 0; i < Vector256.Count; i++) @@ -124,6 +113,8 @@ public static void Encode(Span scanline, Span previousScanline, Span } else if (Sse2.IsSupported) { + Vector128 zero8 = Vector128.Zero; + Vector128 zero16 = Vector128.Zero; Vector128 sumAccumulator = Vector128.Zero; Vector128 allBitsSet = Sse2.CompareEqual(sumAccumulator, sumAccumulator).AsByte(); @@ -146,21 +137,21 @@ public static void Encode(Span scanline, Span previousScanline, Span } else { - Vector128 mask = Sse2.CompareGreaterThan(res.AsSByte(), Vector128.Zero); + Vector128 mask = Sse2.CompareGreaterThan(res.AsSByte(), zero8); mask = Sse2.Xor(mask, allBitsSet.AsSByte()); absRes = Sse2.Xor(Sse2.Add(res.AsSByte(), mask), mask); } - Vector128 loRes16 = Sse2.UnpackLow(absRes, Vector128.Zero).AsInt16(); - Vector128 hiRes16 = Sse2.UnpackHigh(absRes, Vector128.Zero).AsInt16(); + Vector128 loRes16 = Sse2.UnpackLow(absRes, zero8).AsInt16(); + Vector128 hiRes16 = Sse2.UnpackHigh(absRes, zero8).AsInt16(); - Vector128 loRes32 = Sse2.UnpackLow(loRes16, Vector128.Zero).AsInt32(); - Vector128 hiRes32 = Sse2.UnpackHigh(loRes16, Vector128.Zero).AsInt32(); + Vector128 loRes32 = Sse2.UnpackLow(loRes16, zero16).AsInt32(); + Vector128 hiRes32 = Sse2.UnpackHigh(loRes16, zero16).AsInt32(); sumAccumulator = Sse2.Add(sumAccumulator, loRes32); sumAccumulator = Sse2.Add(sumAccumulator, hiRes32); - loRes32 = Sse2.UnpackLow(hiRes16, Vector128.Zero).AsInt32(); - hiRes32 = Sse2.UnpackHigh(hiRes16, Vector128.Zero).AsInt32(); + loRes32 = Sse2.UnpackLow(hiRes16, zero16).AsInt32(); + hiRes32 = Sse2.UnpackHigh(hiRes16, zero16).AsInt32(); sumAccumulator = Sse2.Add(sumAccumulator, loRes32); sumAccumulator = Sse2.Add(sumAccumulator, hiRes32); } diff --git a/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs b/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs index 7fa8a6b745..f48010dba6 100644 --- a/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs +++ b/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs @@ -91,6 +91,7 @@ public static void Encode(Span scanline, Span previousScanline, Span #if SUPPORTS_RUNTIME_INTRINSICS if (Avx2.IsSupported) { + Vector256 zero = Vector256.Zero; Vector256 sumAccumulator = Vector256.Zero; for (int xLeft = x - bytesPerPixel; x + Vector256.Count <= scanline.Length; xLeft += Vector256.Count) @@ -104,7 +105,7 @@ public static void Encode(Span scanline, Span previousScanline, Span Unsafe.As>(ref Unsafe.Add(ref resultBaseRef, x + 1)) = res; // +1 to skip filter type x += Vector256.Count; - sumAccumulator = Avx2.Add(sumAccumulator, Avx2.SumAbsoluteDifferences(Avx2.Abs(res.AsSByte()), Vector256.Zero).AsInt32()); + sumAccumulator = Avx2.Add(sumAccumulator, Avx2.SumAbsoluteDifferences(Avx2.Abs(res.AsSByte()), zero).AsInt32()); } for (int i = 0; i < Vector256.Count; i++) From 5c7f4a9ab37798a512f917f7df8d155dc180254c Mon Sep 17 00:00:00 2001 From: TechPizza Date: Tue, 18 May 2021 10:52:59 +0200 Subject: [PATCH 11/12] Added more specialized Png filter code Modified tests accordingly --- src/ImageSharp/Common/Helpers/Numerics.cs | 46 +++++++++++++++++ .../Formats/Png/Filters/AverageFilter.cs | 10 +--- .../Formats/Png/Filters/PaethFilter.cs | 5 +- .../Formats/Png/Filters/SubFilter.cs | 26 +++++++++- .../Formats/Png/Filters/UpFilter.cs | 26 +++++++++- .../Formats/Png/PngFilterTests.cs | 49 +++++++++++++++++-- 6 files changed, 145 insertions(+), 17 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index 0147689117..f9969b27a5 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -749,6 +749,7 @@ public static Vector256 Lerp( public static float Lerp(float value1, float value2, float amount) => ((value2 - value1) * amount) + value1; +#if SUPPORTS_RUNTIME_INTRINSICS [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void Accumulate(ref Vector accumulator, Vector values) { @@ -762,5 +763,50 @@ public static void Accumulate(ref Vector accumulator, Vector values) accumulator += intLow; accumulator += intHigh; } + + /// + /// Reduces elements of the vector into one sum. + /// + /// The accumulator to reduce. + /// The sum of all elements. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int ReduceSum(Vector128 accumulator) + { + if (Ssse3.IsSupported) + { + Vector128 hadd = Ssse3.HorizontalAdd(accumulator, accumulator); + Vector128 swapped = Sse2.Shuffle(hadd, 0x1); + Vector128 tmp = Sse2.Add(hadd, swapped); + + // Vector128.ToScalar() isn't optimized pre-net5.0 https://github.com/dotnet/runtime/pull/37882 + return Sse2.ConvertToInt32(tmp); + } + else + { + int sum = 0; + for (int i = 0; i < Vector128.Count; i++) + { + sum += accumulator.GetElement(i); + } + + return sum; + } + } + + /// + /// Reduces even elements of the vector into one sum. + /// + /// The accumulator to reduce. + /// The sum of even elements. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int EvenReduceSum(Vector256 accumulator) + { + Vector128 vsum = Sse2.Add(accumulator.GetLower(), accumulator.GetUpper()); // add upper lane to lower lane + vsum = Sse2.Add(vsum, Sse2.Shuffle(vsum, 0b_11_10_11_10)); // add high to low + + // Vector128.ToScalar() isn't optimized pre-net5.0 https://github.com/dotnet/runtime/pull/37882 + return Sse2.ConvertToInt32(vsum); + } +#endif } } diff --git a/src/ImageSharp/Formats/Png/Filters/AverageFilter.cs b/src/ImageSharp/Formats/Png/Filters/AverageFilter.cs index 818119f331..0ab1413974 100644 --- a/src/ImageSharp/Formats/Png/Filters/AverageFilter.cs +++ b/src/ImageSharp/Formats/Png/Filters/AverageFilter.cs @@ -106,10 +106,7 @@ public static void Encode(Span scanline, Span previousScanline, Span sumAccumulator = Avx2.Add(sumAccumulator, Avx2.SumAbsoluteDifferences(Avx2.Abs(res.AsSByte()), zero).AsInt32()); } - for (int i = 0; i < Vector256.Count; i++) - { - sum += sumAccumulator.GetElement(i); - } + sum += Numerics.EvenReduceSum(sumAccumulator); } else if (Sse2.IsSupported) { @@ -156,10 +153,7 @@ public static void Encode(Span scanline, Span previousScanline, Span sumAccumulator = Sse2.Add(sumAccumulator, hiRes32); } - for (int i = 0; i < Vector128.Count; i++) - { - sum += sumAccumulator.GetElement(i); - } + sum += Numerics.ReduceSum(sumAccumulator); } #endif diff --git a/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs b/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs index f48010dba6..e8e0aa7043 100644 --- a/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs +++ b/src/ImageSharp/Formats/Png/Filters/PaethFilter.cs @@ -108,10 +108,7 @@ public static void Encode(Span scanline, Span previousScanline, Span sumAccumulator = Avx2.Add(sumAccumulator, Avx2.SumAbsoluteDifferences(Avx2.Abs(res.AsSByte()), zero).AsInt32()); } - for (int i = 0; i < Vector256.Count; i++) - { - sum += sumAccumulator.GetElement(i); - } + sum += Numerics.EvenReduceSum(sumAccumulator); } else if (Vector.IsHardwareAccelerated) { diff --git a/src/ImageSharp/Formats/Png/Filters/SubFilter.cs b/src/ImageSharp/Formats/Png/Filters/SubFilter.cs index 31d65995a0..116154836e 100644 --- a/src/ImageSharp/Formats/Png/Filters/SubFilter.cs +++ b/src/ImageSharp/Formats/Png/Filters/SubFilter.cs @@ -6,6 +6,11 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif + namespace SixLabors.ImageSharp.Formats.Png.Filters { /// @@ -66,7 +71,26 @@ public static void Encode(Span scanline, Span result, int bytesPerPi } #if SUPPORTS_RUNTIME_INTRINSICS - if (Vector.IsHardwareAccelerated) + if (Avx2.IsSupported) + { + Vector256 zero = Vector256.Zero; + Vector256 sumAccumulator = Vector256.Zero; + + for (int xLeft = x - bytesPerPixel; x + Vector256.Count <= scanline.Length; xLeft += Vector256.Count) + { + Vector256 scan = Unsafe.As>(ref Unsafe.Add(ref scanBaseRef, x)); + Vector256 prev = Unsafe.As>(ref Unsafe.Add(ref scanBaseRef, xLeft)); + + Vector256 res = Avx2.Subtract(scan, prev); + Unsafe.As>(ref Unsafe.Add(ref resultBaseRef, x + 1)) = res; // +1 to skip filter type + x += Vector256.Count; + + sumAccumulator = Avx2.Add(sumAccumulator, Avx2.SumAbsoluteDifferences(Avx2.Abs(res.AsSByte()), zero).AsInt32()); + } + + sum += Numerics.EvenReduceSum(sumAccumulator); + } + else if (Vector.IsHardwareAccelerated) { Vector sumAccumulator = Vector.Zero; diff --git a/src/ImageSharp/Formats/Png/Filters/UpFilter.cs b/src/ImageSharp/Formats/Png/Filters/UpFilter.cs index f119c2fbae..e0f35293a4 100644 --- a/src/ImageSharp/Formats/Png/Filters/UpFilter.cs +++ b/src/ImageSharp/Formats/Png/Filters/UpFilter.cs @@ -6,6 +6,11 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif + namespace SixLabors.ImageSharp.Formats.Png.Filters { /// @@ -61,7 +66,26 @@ public static void Encode(Span scanline, Span previousScanline, Span int x = 0; #if SUPPORTS_RUNTIME_INTRINSICS - if (Vector.IsHardwareAccelerated) + if (Avx2.IsSupported) + { + Vector256 zero = Vector256.Zero; + Vector256 sumAccumulator = Vector256.Zero; + + for (; x + Vector256.Count <= scanline.Length;) + { + Vector256 scan = Unsafe.As>(ref Unsafe.Add(ref scanBaseRef, x)); + Vector256 above = Unsafe.As>(ref Unsafe.Add(ref prevBaseRef, x)); + + Vector256 res = Avx2.Subtract(scan, above); + Unsafe.As>(ref Unsafe.Add(ref resultBaseRef, x + 1)) = res; // +1 to skip filter type + x += Vector256.Count; + + sumAccumulator = Avx2.Add(sumAccumulator, Avx2.SumAbsoluteDifferences(Avx2.Abs(res.AsSByte()), zero).AsInt32()); + } + + sum += Numerics.EvenReduceSum(sumAccumulator); + } + else if (Vector.IsHardwareAccelerated) { Vector sumAccumulator = Vector.Zero; diff --git a/tests/ImageSharp.Tests/Formats/Png/PngFilterTests.cs b/tests/ImageSharp.Tests/Formats/Png/PngFilterTests.cs index dae8f25e58..5f7b4f8327 100644 --- a/tests/ImageSharp.Tests/Formats/Png/PngFilterTests.cs +++ b/tests/ImageSharp.Tests/Formats/Png/PngFilterTests.cs @@ -101,7 +101,7 @@ static void RunTest() } [Fact] - public void PaethSimd() + public void PaethAvx2() { static void RunTest() { @@ -114,6 +114,20 @@ static void RunTest() HwIntrinsics.AllowAll); } + [Fact] + public void PaethVector() + { + static void RunTest() + { + var data = new TestData(PngFilterMethod.Paeth, Size); + data.TestFilter(); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); + } + [Fact] public void Up() { @@ -128,8 +142,9 @@ static void RunTest() HwIntrinsics.DisableSIMD); } + [Fact] - public void UpSimd() + public void UpAvx2() { static void RunTest() { @@ -142,6 +157,20 @@ static void RunTest() HwIntrinsics.AllowAll); } + [Fact] + public void UpVector() + { + static void RunTest() + { + var data = new TestData(PngFilterMethod.Up, Size); + data.TestFilter(); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); + } + [Fact] public void Sub() { @@ -157,7 +186,7 @@ static void RunTest() } [Fact] - public void SubSimd() + public void SubAvx2() { static void RunTest() { @@ -170,6 +199,20 @@ static void RunTest() HwIntrinsics.AllowAll); } + [Fact] + public void SubVector() + { + static void RunTest() + { + var data = new TestData(PngFilterMethod.Sub, Size); + data.TestFilter(); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); + } + public class TestData { private readonly PngFilterMethod filter; From 4cfc7016ec711a6f2a16bf47a2f020ed65685b50 Mon Sep 17 00:00:00 2001 From: TechPizza Date: Tue, 18 May 2021 11:55:57 +0200 Subject: [PATCH 12/12] Added comment on Accumulate --- src/ImageSharp/Common/Helpers/Numerics.cs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index f9969b27a5..0581993014 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -750,6 +750,23 @@ public static float Lerp(float value1, float value2, float amount) => ((value2 - value1) * amount) + value1; #if SUPPORTS_RUNTIME_INTRINSICS + + /// + /// Accumulates 8-bit integers into by + /// widening them to 32-bit integers and performing four additions. + /// + /// + /// byte(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) + /// is widened and added onto as such: + /// + /// accumulator += i32(1, 2, 3, 4); + /// accumulator += i32(5, 6, 7, 8); + /// accumulator += i32(9, 10, 11, 12); + /// accumulator += i32(13, 14, 15, 16); + /// + /// + /// The accumulator destination. + /// The values to accumulate. [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void Accumulate(ref Vector accumulator, Vector values) {