SixLabors · john-h-k · Jun 19, 2020 · Jun 19, 2020 · Jun 21, 2020 · Jul 7, 2020
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs
@@ -1,4 +1,4 @@
-// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
 #if SUPPORTS_RUNTIME_INTRINSICS
@@ -30,7 +30,7 @@ internal static void NormalizedFloatToByteSaturateReduce(
 
                 if (Avx2.IsSupported)
                 {
-                    int remainder = ImageMaths.ModuloP2(source.Length, Vector<byte>.Count);
+                    int remainder = ImageMaths.ModuloP2(source.Length, Vector256<byte>.Count);
                     int adjustedCount = source.Length - remainder;
 
                     if (adjustedCount > 0)
@@ -91,6 +91,190 @@ internal static void NormalizedFloatToByteSaturate(
                 }
             }
 
+            /// <summary>
+            /// Packs the largest prefix of the three channels that fits into whole
+            /// <see cref="Vector256{T}"/> blocks into <paramref name="dest"/> (4 bytes per element, the 4th byte is 0xFF)
+            /// and advances all spans past the processed part, leaving only the remainder for the caller.
+            /// Does nothing when AVX2 is not supported or fewer than one full vector of input is available.
+            /// </summary>
+            /// <param name="channel0">Bytes written to offset 0 of every packed element; sliced to the unprocessed tail on return.</param>
+            /// <param name="channel1">Bytes written to offset 1 of every packed element; sliced to the unprocessed tail on return.</param>
+            /// <param name="channel2">Bytes written to offset 2 of every packed element; sliced to the unprocessed tail on return.</param>
+            /// <param name="dest">Destination holding 4 bytes per input element; sliced to the unprocessed tail on return.</param>
+            internal static void PackBytesToUInt32SaturateChannel4Reduce(
+                ref ReadOnlySpan<byte> channel0,
+                ref ReadOnlySpan<byte> channel1,
+                ref ReadOnlySpan<byte> channel2,
+                ref Span<byte> dest)
+            {
+                DebugGuard.IsTrue(channel1.Length == channel0.Length, nameof(channel1), "Input spans must be of same length!");
+                DebugGuard.IsTrue(channel2.Length == channel0.Length, nameof(channel2), "Input spans must be of same length!");
+                DebugGuard.MustBeGreaterThanOrEqualTo(dest.Length, channel0.Length * 4, nameof(dest));
+
+                if (Avx2.IsSupported)
+                {
+                    int remainder = ImageMaths.ModuloP2(channel0.Length, Vector256<byte>.Count);
+                    int adjustedCount = channel0.Length - remainder;
+
+                    if (adjustedCount > 0)
+                    {
+                        // Convert the vectorizable head BEFORE moving the spans forward;
+                        // every input element expands to 4 destination bytes.
+                        PackBytesToUInt32SaturateChannel4(
+                            channel0.Slice(0, adjustedCount),
+                            channel1.Slice(0, adjustedCount),
+                            channel2.Slice(0, adjustedCount),
+                            dest.Slice(0, adjustedCount * 4));
+
+                        channel0 = channel0.Slice(adjustedCount);
+                        channel1 = channel1.Slice(adjustedCount);
+                        channel2 = channel2.Slice(adjustedCount);
+                        dest = dest.Slice(adjustedCount * 4);
+                    }
+                }
+            }
+
+            /// <summary>
+            /// AVX2 kernel that interleaves three byte channels plus a constant 0xFF byte into
+            /// 4-byte elements: c0[0], c1[0], c2[0], 0xFF, c0[1], c1[1], c2[1], 0xFF, ...
+            /// </summary>
+            /// <remarks>
+            /// Each loop iteration reads one <see cref="Vector256{T}"/> from every channel and writes four
+            /// consecutive vectors to <paramref name="dest"/>.
+            /// NOTE(review): no lengths are validated here; the loop bound assumes that the number of vectors in
+            /// <paramref name="dest"/> is a multiple of 4 and that every channel holds a quarter of that many vectors.
+            /// </remarks>
+            /// <param name="channel0">Source of byte 0 of every output element.</param>
+            /// <param name="channel1">Source of byte 1 of every output element.</param>
+            /// <param name="channel2">Source of byte 2 of every output element.</param>
+            /// <param name="dest">Receives the interleaved 4-byte elements.</param>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            internal static void PackBytesToUInt32SaturateChannel4(
+                ReadOnlySpan<byte> channel0,
+                ReadOnlySpan<byte> channel1,
+                ReadOnlySpan<byte> channel2,
+                Span<byte> dest)
+            {
+                // Number of 32-byte vectors in the destination (not in the sources).
+                int n = dest.Length / Vector256<byte>.Count;
+
+                ref Vector256<byte> source0Base =
+                    ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel0));
+                ref Vector256<byte> source1Base =
+                    ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel1));
+                ref Vector256<byte> source2Base =
+                    ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel2));
+
+                ref Vector256<byte> destBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));
+
+                // Comparing a vector with itself sets every bit: this is the constant 0xFF fourth channel.
+                Vector256<byte> allOnes = Avx2.CompareEqual(Vector256<byte>.Zero, Vector256<byte>.Zero);
+
+                // i indexes source vectors, j indexes destination vectors (4 per source vector).
+                for (int i = 0, j = 0; j < n; i += 1, j += 4)
+                {
+                    Vector256<byte> s0 = Unsafe.Add(ref source0Base, i);
+                    Vector256<byte> s1 = Unsafe.Add(ref source1Base, i);
+                    Vector256<byte> s2 = Unsafe.Add(ref source2Base, i);
+
+                    // The unpack instructions below work on each 128-bit lane separately. Swapping the two
+                    // middle 64-bit quarters beforehand (control 0b_11_01_10_00) makes the unpacked results
+                    // come out in sequential element order.
+                    s0 = Avx2.Permute4x64(s0.AsUInt64(), 0b_11_01_10_00).AsByte();
+                    s1 = Avx2.Permute4x64(s1.AsUInt64(), 0b_11_01_10_00).AsByte();
+                    s2 = Avx2.Permute4x64(s2.AsUInt64(), 0b_11_01_10_00).AsByte();
+
+                    // Byte interleave of channel0/channel1: 16-bit (c0, c1) pairs for elements 0-15 (Lo) and 16-31 (Hi).
+                    Vector256<ushort> s01Lo = Avx2.UnpackLow(s0, s1).AsUInt16();
+                    Vector256<ushort> s01Hi = Avx2.UnpackHigh(s0, s1).AsUInt16();
+
+                    // Same quarter swap again, preparing for the 16-bit unpack further down.
+                    s01Lo = Avx2.Permute4x64(s01Lo.AsUInt64(), 0b_11_01_10_00).AsUInt16();
+                    s01Hi = Avx2.Permute4x64(s01Hi.AsUInt64(), 0b_11_01_10_00).AsUInt16();
+
+                    // Byte interleave of channel2 with 0xFF: 16-bit (c2, 0xFF) pairs.
+                    Vector256<ushort> s23Lo = Avx2.UnpackLow(s2, allOnes).AsUInt16();
+                    Vector256<ushort> s23Hi = Avx2.UnpackHigh(s2, allOnes).AsUInt16();
+
+                    s23Lo = Avx2.Permute4x64(s23Lo.AsUInt64(), 0b_11_01_10_00).AsUInt16();
+                    s23Hi = Avx2.Permute4x64(s23Hi.AsUInt64(), 0b_11_01_10_00).AsUInt16();
+
+                    // 16-bit interleave of the (c0, c1) and (c2, 0xFF) pairs yields complete 4-byte elements;
+                    // b0..b3 hold elements 0-7, 8-15, 16-23 and 24-31 respectively.
+                    Vector256<byte> b0 = Avx2.UnpackLow(s01Lo, s23Lo).AsByte();
+                    Vector256<byte> b1 = Avx2.UnpackHigh(s01Lo, s23Lo).AsByte();
+                    Vector256<byte> b2 = Avx2.UnpackLow(s01Hi, s23Hi).AsByte();
+                    Vector256<byte> b3 = Avx2.UnpackHigh(s01Hi, s23Hi).AsByte();
+
+                    Unsafe.Add(ref destBase, j) = b0;
+                    Unsafe.Add(ref destBase, j + 1) = b1;
+                    Unsafe.Add(ref destBase, j + 2) = b2;
+                    Unsafe.Add(ref destBase, j + 3) = b3;
+                }
+            }
+
+            /// <summary>
+            /// Packs the largest prefix of the three channels that fits into whole
+            /// <see cref="Vector256{T}"/> blocks into <paramref name="dest"/> (3 bytes per element)
+            /// and advances all spans past the processed part, leaving only the remainder for the caller.
+            /// Does nothing when AVX2 is not supported or fewer than one full vector of input is available.
+            /// </summary>
+            /// <param name="channel0">Bytes written to offset 0 of every packed element; sliced to the unprocessed tail on return.</param>
+            /// <param name="channel1">Bytes written to offset 1 of every packed element; sliced to the unprocessed tail on return.</param>
+            /// <param name="channel2">Bytes written to offset 2 of every packed element; sliced to the unprocessed tail on return.</param>
+            /// <param name="dest">Destination holding 3 bytes per input element; sliced to the unprocessed tail on return.</param>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            internal static void PackBytesToUInt24Reduce(
+                    ref ReadOnlySpan<byte> channel0,
+                    ref ReadOnlySpan<byte> channel1,
+                    ref ReadOnlySpan<byte> channel2,
+                    ref Span<byte> dest)
+            {
+                DebugGuard.IsTrue(channel1.Length == channel0.Length, nameof(channel1), "Input spans must be of same length!");
+                DebugGuard.IsTrue(channel2.Length == channel0.Length, nameof(channel2), "Input spans must be of same length!");
+                DebugGuard.MustBeGreaterThanOrEqualTo(dest.Length, channel0.Length * 3, nameof(dest));
+
+                if (Avx2.IsSupported)
+                {
+                    int remainder = ImageMaths.ModuloP2(channel0.Length, Vector256<byte>.Count);
+                    int adjustedCount = channel0.Length - remainder;
+
+                    if (adjustedCount > 0)
+                    {
+                        // Convert the vectorizable head BEFORE moving the spans forward;
+                        // every input element expands to 3 destination bytes.
+                        PackBytesToUInt24(
+                            channel0.Slice(0, adjustedCount),
+                            channel1.Slice(0, adjustedCount),
+                            channel2.Slice(0, adjustedCount),
+                            dest.Slice(0, adjustedCount * 3));
+
+                        channel0 = channel0.Slice(adjustedCount);
+                        channel1 = channel1.Slice(adjustedCount);
+                        channel2 = channel2.Slice(adjustedCount);
+                        dest = dest.Slice(adjustedCount * 3);
+                    }
+                }
+            }
+
+            /// <summary>
+            /// AVX2 kernel that interleaves three byte channels into packed 3-byte elements:
+            /// c0[0], c1[0], c2[0], c0[1], c1[1], c2[1], ...
+            /// </summary>
+            /// <remarks>
+            /// Each loop iteration reads one <see cref="Vector256{T}"/> (32 bytes) from every channel and writes
+            /// three consecutive vectors (96 bytes) to <paramref name="dest"/>. The destination therefore has to be
+            /// exactly three times as long as a channel, and the channel length must be a multiple of 32.
+            /// </remarks>
+            /// <param name="channel0">Source of byte 0 of every output element.</param>
+            /// <param name="channel1">Source of byte 1 of every output element.</param>
+            /// <param name="channel2">Source of byte 2 of every output element.</param>
+            /// <param name="dest">Receives the interleaved 3-byte elements.</param>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            internal static void PackBytesToUInt24(
+                    ReadOnlySpan<byte> channel0,
+                    ReadOnlySpan<byte> channel1,
+                    ReadOnlySpan<byte> channel2,
+                    Span<byte> dest)
+            {
+                // Debug-only validation. The destination is 3x the channel length, so it must not be
+                // compared for equality with the channels.
+                DebugGuard.IsTrue(channel1.Length == channel0.Length, nameof(channel1), "Input spans must be of same length!");
+                DebugGuard.IsTrue(channel2.Length == channel0.Length, nameof(channel2), "Input spans must be of same length!");
+                DebugGuard.IsTrue(dest.Length == channel0.Length * 3, nameof(dest), "Destination must hold 3 bytes per input element!");
+                DebugGuard.IsTrue(
+                    ImageMaths.ModuloP2(channel0.Length, Vector256<byte>.Count) == 0,
+                    nameof(channel0),
+                    $"length should be divisible by {Vector256<byte>.Count}!");
+
+                // Number of 32-byte vectors in the destination (three per source vector).
+                int n = dest.Length / Vector256<byte>.Count;
+
+                ref Vector256<byte> source0Base =
+                    ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel0));
+                ref Vector256<byte> source1Base =
+                    ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel1));
+                ref Vector256<byte> source2Base =
+                    ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel2));
+
+                ref Vector256<byte> destBase =
+                    ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));
+
+                // Shuffle masks, named s<channel>Mask<output vector>. Avx2.Shuffle works on each 128-bit lane
+                // separately: a mask byte of -1 yields zero, any other value selects that byte index within the
+                // same lane. Each mask places one channel's bytes at every third output position, so OR-ing the
+                // three shuffled channels gives one fully interleaved output vector.
+                Vector256<byte> s0Mask0 = Vector256.Create(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1).AsByte();
+                Vector256<byte> s0Mask1 = Vector256.Create(-1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5).AsByte();
+                Vector256<byte> s0Mask2 = Vector256.Create(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1).AsByte();
+
+                Vector256<byte> s1Mask0 = Vector256.Create(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10).AsByte();
+                Vector256<byte> s1Mask1 = Vector256.Create(-1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1).AsByte();
+                Vector256<byte> s1Mask2 = Vector256.Create(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1).AsByte();
+
+                Vector256<byte> s2Mask0 = Vector256.Create(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1).AsByte();
+                Vector256<byte> s2Mask1 = Vector256.Create(10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1).AsByte();
+                Vector256<byte> s2Mask2 = Vector256.Create(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15).AsByte();
+
+                // i indexes source vectors, j indexes destination vectors (3 per source vector).
+                for (int i = 0, j = 0; j < n; i += 1, j += 3)
+                {
+                    Vector256<byte> s0 = Unsafe.Add(ref source0Base, i);
+                    Vector256<byte> s1 = Unsafe.Add(ref source1Base, i);
+                    Vector256<byte> s2 = Unsafe.Add(ref source2Base, i);
+
+                    // Output vector 0 only needs source bytes 0-10, all from the low lane:
+                    // duplicate the low lane into both halves so the per-lane shuffle can reach them.
+                    Vector256<byte> loS0 = Avx2.Permute2x128(s0, s0, 0);
+                    Vector256<byte> loS1 = Avx2.Permute2x128(s1, s1, 0);
+                    Vector256<byte> loS2 = Avx2.Permute2x128(s2, s2, 0);
+
+                    Vector256<byte> b0 = Avx2.Shuffle(loS0, s0Mask0);
+                    b0 = Avx2.Or(b0, Avx2.Shuffle(loS1, s1Mask0));
+                    b0 = Avx2.Or(b0, Avx2.Shuffle(loS2, s2Mask0));
+
+                    // Output vector 1 takes the end of the low lane and the start of the high lane,
+                    // which matches the natural lane layout, so no permute is required.
+                    Vector256<byte> b1 = Avx2.Shuffle(s0, s0Mask1);
+                    b1 = Avx2.Or(b1, Avx2.Shuffle(s1, s1Mask1));
+                    b1 = Avx2.Or(b1, Avx2.Shuffle(s2, s2Mask1));
+
+                    // Output vector 2 only needs bytes from the high lane: duplicate it into both halves.
+                    Vector256<byte> hiS0 = Avx2.Permute2x128(s0, s0, 0b_0001_0001);
+                    Vector256<byte> hiS1 = Avx2.Permute2x128(s1, s1, 0b_0001_0001);
+                    Vector256<byte> hiS2 = Avx2.Permute2x128(s2, s2, 0b_0001_0001);
+
+                    Vector256<byte> b2 = Avx2.Shuffle(hiS0, s0Mask2);
+                    b2 = Avx2.Or(b2, Avx2.Shuffle(hiS1, s1Mask2));
+                    b2 = Avx2.Or(b2, Avx2.Shuffle(hiS2, s2Mask2));
+
+                    Unsafe.Add(ref destBase, j + 0) = b0;
+                    Unsafe.Add(ref destBase, j + 1) = b1;
+                    Unsafe.Add(ref destBase, j + 2) = b2;
+                }
+            }
+
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             private static Vector256<int> ConvertToInt32(Vector256<float> vf, Vector256<float> scale)
             {

diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
@@ -62,7 +62,9 @@ internal static void ByteToNormalizedFloatReduce(
 
                 if (adjustedCount > 0)
                 {
-                    ByteToNormalizedFloat(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));
+                    ByteToNormalizedFloat(
+                        source.Slice(0, adjustedCount),
+                        dest.Slice(0, adjustedCount));
 
                     source = source.Slice(adjustedCount);
                     dest = dest.Slice(adjustedCount);

diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.cs b/src/ImageSharp/Common/Helpers/SimdUtils.cs
@@ -112,6 +112,103 @@ internal static void NormalizedFloatToByteSaturate(ReadOnlySpan<float> source, S
             }
         }
 
+        /// <summary>
+        /// Interleaves three byte channels into 4-byte elements in <paramref name="dest"/>,
+        /// filling the 4th byte of every element with 0xFF.
+        /// </summary>
+        /// <param name="channel0">Source of byte 0 of every output element.</param>
+        /// <param name="channel1">Source of byte 1 of every output element. Must be as long as <paramref name="channel0"/>.</param>
+        /// <param name="channel2">Source of byte 2 of every output element. Must be as long as <paramref name="channel0"/>.</param>
+        /// <param name="dest">Destination buffer; must provide at least 4 bytes per input element.</param>
+        internal static void PackBytesToUInt32SaturateChannel4(
+                ReadOnlySpan<byte> channel0,
+                ReadOnlySpan<byte> channel1,
+                ReadOnlySpan<byte> channel2,
+                Span<byte> dest)
+        {
+            DebugGuard.IsTrue(channel1.Length == channel0.Length, nameof(channel1), "Input spans must be of same length!");
+            DebugGuard.IsTrue(channel2.Length == channel0.Length, nameof(channel2), "Input spans must be of same length!");
+            DebugGuard.MustBeGreaterThanOrEqualTo(dest.Length, channel0.Length * 4, nameof(dest));
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+            // Handles the vectorizable head and advances all spans to the unprocessed tail.
+            Avx2Intrinsics.PackBytesToUInt32SaturateChannel4Reduce(ref channel0, ref channel1, ref channel2, ref dest);
+#endif
+
+            // TODO: There is no Vector<T> / Vector4 based implementation yet;
+            // targets without runtime intrinsics use the scalar loop for the whole input.
+
+            // Deal with the remainder:
+            if (channel0.Length > 0)
+            {
+                PackBytesToUInt32SaturateChannel4Remainder(channel0, channel1, channel2, dest);
+            }
+        }
+
+        /// <summary>
+        /// Scalar fallback that interleaves three byte channels into 4-byte elements,
+        /// writing 0xFF as the 4th byte of every element.
+        /// </summary>
+        /// <param name="channel0">Source of byte 0 of every output element; its length determines the element count.</param>
+        /// <param name="channel1">Source of byte 1 of every output element.</param>
+        /// <param name="channel2">Source of byte 2 of every output element.</param>
+        /// <param name="dest">Destination buffer; must provide at least 4 bytes per input element.</param>
+        private static void PackBytesToUInt32SaturateChannel4Remainder(
+                ReadOnlySpan<byte> channel0,
+                ReadOnlySpan<byte> channel1,
+                ReadOnlySpan<byte> channel2,
+                Span<byte> dest)
+        {
+            DebugGuard.MustBeGreaterThanOrEqualTo(dest.Length, channel0.Length * 4, nameof(dest));
+
+            ref byte s0Base = ref MemoryMarshal.GetReference(channel0);
+            ref byte s1Base = ref MemoryMarshal.GetReference(channel1);
+            ref byte s2Base = ref MemoryMarshal.GetReference(channel2);
+            ref byte dBase = ref MemoryMarshal.GetReference(dest);
+
+            // Iterate over the input elements (i); j is the matching byte offset in dest.
+            // Bounding the loop by dest.Length would read and write past the end of the buffers.
+            for (int i = 0, j = 0; i < channel0.Length; i += 1, j += 4)
+            {
+                Unsafe.Add(ref dBase, j) = Unsafe.Add(ref s0Base, i);
+                Unsafe.Add(ref dBase, j + 1) = Unsafe.Add(ref s1Base, i);
+                Unsafe.Add(ref dBase, j + 2) = Unsafe.Add(ref s2Base, i);
+
+                // The saturated 4th channel goes into its own slot; it must not overwrite channel2.
+                Unsafe.Add(ref dBase, j + 3) = 0xFF;
+            }
+        }
+
+        /// <summary>
+        /// Interleaves three byte channels into packed 3-byte elements in <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="channel0">Source of byte 0 of every output element.</param>
+        /// <param name="channel1">Source of byte 1 of every output element. Must be as long as <paramref name="channel0"/>.</param>
+        /// <param name="channel2">Source of byte 2 of every output element. Must be as long as <paramref name="channel0"/>.</param>
+        /// <param name="dest">Destination buffer; must provide at least 3 bytes per input element.</param>
+        internal static void PackBytesToUInt24(
+                ReadOnlySpan<byte> channel0,
+                ReadOnlySpan<byte> channel1,
+                ReadOnlySpan<byte> channel2,
+                Span<byte> dest)
+        {
+            DebugGuard.IsTrue(channel1.Length == channel0.Length, nameof(channel1), "Input spans must be of same length!");
+            DebugGuard.IsTrue(channel2.Length == channel0.Length, nameof(channel2), "Input spans must be of same length!");
+            DebugGuard.MustBeGreaterThanOrEqualTo(dest.Length, channel0.Length * 3, nameof(dest));
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+            // Handles the vectorizable head and advances all spans to the unprocessed tail.
+            Avx2Intrinsics.PackBytesToUInt24Reduce(ref channel0, ref channel1, ref channel2, ref dest);
+#endif
+
+            // TODO: There is no Vector<T> / Vector4 based implementation yet;
+            // targets without runtime intrinsics use the scalar loop for the whole input.
+
+            // Deal with the remainder using the scalar loop. Calling PackBytesToUInt24 here
+            // would recurse without end, because the tail never becomes shorter.
+            if (channel0.Length > 0)
+            {
+                PackBytesToUInt24Remainder(channel0, channel1, channel2, dest);
+            }
+        }
+
+        /// <summary>
+        /// Scalar fallback that interleaves three byte channels into packed 3-byte elements.
+        /// </summary>
+        /// <param name="channel0">Source of byte 0 of every output element; its length determines the element count.</param>
+        /// <param name="channel1">Source of byte 1 of every output element.</param>
+        /// <param name="channel2">Source of byte 2 of every output element.</param>
+        /// <param name="dest">Destination buffer; must provide at least 3 bytes per input element.</param>
+        private static void PackBytesToUInt24Remainder(
+                ReadOnlySpan<byte> channel0,
+                ReadOnlySpan<byte> channel1,
+                ReadOnlySpan<byte> channel2,
+                Span<byte> dest)
+        {
+            DebugGuard.MustBeGreaterThanOrEqualTo(dest.Length, channel0.Length * 3, nameof(dest));
+
+            ref byte s0Base = ref MemoryMarshal.GetReference(channel0);
+            ref byte s1Base = ref MemoryMarshal.GetReference(channel1);
+            ref byte s2Base = ref MemoryMarshal.GetReference(channel2);
+            ref byte dBase = ref MemoryMarshal.GetReference(dest);
+
+            // Iterate over the input elements (i); j is the matching byte offset in dest.
+            // Bounding the loop by dest.Length would read and write past the end of the buffers.
+            for (int i = 0, j = 0; i < channel0.Length; i += 1, j += 3)
+            {
+                Unsafe.Add(ref dBase, j) = Unsafe.Add(ref s0Base, i);
+                Unsafe.Add(ref dBase, j + 1) = Unsafe.Add(ref s1Base, i);
+                Unsafe.Add(ref dBase, j + 2) = Unsafe.Add(ref s2Base, i);
+            }
+        }
+
         [MethodImpl(InliningOptions.ColdPath)]
         private static void ConvertByteToNormalizedFloatRemainder(ReadOnlySpan<byte> source, Span<float> dest)
         {
@@ -176,6 +273,16 @@ private static void VerifySpanInput(ReadOnlySpan<byte> source, Span<float> dest,
                 $"length should be divisible by {shouldBeDivisibleBy}!");
         }
 
+        /// <summary>
+        /// Debug-only check that <paramref name="source"/> and <paramref name="dest"/> have equal lengths
+        /// and that this length leaves no remainder for <paramref name="shouldBeDivisibleBy"/>.
+        /// </summary>
+        /// <param name="source">The input span.</param>
+        /// <param name="dest">The output span.</param>
+        /// <param name="shouldBeDivisibleBy">Required divisor of the length; passed to <c>ImageMaths.ModuloP2</c>.</param>
+        [Conditional("DEBUG")]
+        private static void VerifySpanInput(ReadOnlySpan<byte> source, Span<byte> dest, int shouldBeDivisibleBy)
+        {
+            bool lengthsMatch = source.Length == dest.Length;
+            DebugGuard.IsTrue(lengthsMatch, nameof(source), "Input spans must be of same length!");
+
+            int leftover = ImageMaths.ModuloP2(dest.Length, shouldBeDivisibleBy);
+            DebugGuard.IsTrue(leftover == 0, nameof(source), $"length should be divisible by {shouldBeDivisibleBy}!");
+        }
+
         [Conditional("DEBUG")]
         private static void VerifySpanInput(ReadOnlySpan<float> source, Span<byte> dest, int shouldBeDivisibleBy)
         {

diff --git a/...ImageSharp/PixelFormats/PixelImplementations/Generated/Rgb24.PixelOperations.Generated.cs b/...ImageSharp/PixelFormats/PixelImplementations/Generated/Rgb24.PixelOperations.Generated.cs
@@ -1,4 +1,4 @@
-// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
 // <auto-generated />
@@ -21,7 +21,7 @@ public partial struct Rgb24
         /// <summary>
         /// Provides optimized overrides for bulk operations.
         /// </summary>
-        internal class PixelOperations : PixelOperations<Rgb24>
+        internal partial class PixelOperations : PixelOperations<Rgb24>
         {
             /// <inheritdoc />
             public override void FromRgb24(Configuration configuration, ReadOnlySpan<Rgb24> source, Span<Rgb24> destinationPixels)

diff --git a/...ImageSharp/PixelFormats/PixelImplementations/Generated/Rgb24.PixelOperations.Generated.tt b/...ImageSharp/PixelFormats/PixelImplementations/Generated/Rgb24.PixelOperations.Generated.tt
@@ -1,4 +1,4 @@
-<#@include file="_Common.ttinclude" #>
+<#@include file="_Common.ttinclude" #>
 <#@ output extension=".cs" #>
 
 namespace SixLabors.ImageSharp.PixelFormats
@@ -11,7 +11,7 @@ namespace SixLabors.ImageSharp.PixelFormats
         /// <summary>
         /// Provides optimized overrides for bulk operations.
         /// </summary>
-        internal class PixelOperations : PixelOperations<Rgb24>
+        internal partial class PixelOperations : PixelOperations<Rgb24>
         {
             <# GenerateAllDefaultConversionMethods("Rgb24"); #>
         }