-
-
Notifications
You must be signed in to change notification settings - Fork 887
Add PackFromRgbPlanes AVX2 vectorised implementation for Rgba32 and Rgba24 pixels #1242
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
5771c2c
883344c
e907126
f7289ee
a50fc32
0510783
09f464f
4a90255
c35b3a8
660b110
6ed1928
dd071a2
f449283
3b216c7
cf78b85
3298603
f4cabf2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,4 @@ | ||
| // Copyright (c) Six Labors. | ||
| // Copyright (c) Six Labors. | ||
| // Licensed under the Apache License, Version 2.0. | ||
|
|
||
| #if SUPPORTS_RUNTIME_INTRINSICS | ||
|
|
@@ -30,7 +30,7 @@ internal static void NormalizedFloatToByteSaturateReduce( | |
|
|
||
| if (Avx2.IsSupported) | ||
| { | ||
| int remainder = ImageMaths.ModuloP2(source.Length, Vector<byte>.Count); | ||
| int remainder = ImageMaths.ModuloP2(source.Length, Vector256<byte>.Count); | ||
| int adjustedCount = source.Length - remainder; | ||
|
|
||
| if (adjustedCount > 0) | ||
|
|
@@ -91,6 +91,190 @@ internal static void NormalizedFloatToByteSaturate( | |
| } | ||
| } | ||
|
|
||
/// <summary>
/// Packs the longest prefix of the three planes whose length is a multiple of
/// <see cref="Vector256{T}.Count"/> into 4-byte pixels using the AVX2 kernel, then advances
/// all four spans past the processed data so the caller can finish the tail.
/// Leaves the spans untouched when AVX2 is not supported or the prefix is empty.
/// </summary>
/// <param name="channel0">Plane stored in byte 0 of each pixel; on return, the unprocessed tail.</param>
/// <param name="channel1">Plane stored in byte 1 of each pixel; on return, the unprocessed tail.</param>
/// <param name="channel2">Plane stored in byte 2 of each pixel; on return, the unprocessed tail.</param>
/// <param name="dest">
/// Packed output, 4 bytes per input element; on return, the region that still has to be written.
/// </param>
internal static void PackBytesToUInt32SaturateChannel4Reduce(
    ref ReadOnlySpan<byte> channel0,
    ref ReadOnlySpan<byte> channel1,
    ref ReadOnlySpan<byte> channel2,
    ref Span<byte> dest)
{
    // The kernel emits four destination bytes per source element, so the destination
    // has to be four times as long as a plane (not the same length).
    DebugGuard.IsTrue(channel1.Length == channel0.Length, nameof(channel1), "Input spans must be of same length!");
    DebugGuard.IsTrue(channel2.Length == channel0.Length, nameof(channel2), "Input spans must be of same length!");
    DebugGuard.MustBeGreaterThanOrEqualTo(dest.Length, channel0.Length * 4, nameof(dest));

    if (Avx2.IsSupported)
    {
        int remainder = ImageMaths.ModuloP2(channel0.Length, Vector256<byte>.Count);
        int adjustedCount = channel0.Length - remainder;

        if (adjustedCount > 0)
        {
            int packedCount = adjustedCount * 4;

            // Process the vector-aligned prefix first ...
            PackBytesToUInt32SaturateChannel4(
                channel0.Slice(0, adjustedCount),
                channel1.Slice(0, adjustedCount),
                channel2.Slice(0, adjustedCount),
                dest.Slice(0, packedCount));

            // ... and only then hand the unprocessed tail back to the caller.
            channel0 = channel0.Slice(adjustedCount);
            channel1 = channel1.Slice(adjustedCount);
            channel2 = channel2.Slice(adjustedCount);
            dest = dest.Slice(packedCount);
        }
    }
}
|
|
||
/// <summary>
/// AVX2 kernel that interleaves three byte planes into 4-byte pixels:
/// for every processed index <c>k</c>, <c>dest[4k] = channel0[k]</c>, <c>dest[4k + 1] = channel1[k]</c>,
/// <c>dest[4k + 2] = channel2[k]</c> and <c>dest[4k + 3] = 0xFF</c>.
/// Only whole blocks of <see cref="Vector256{T}.Count"/> elements that fit in every span are
/// processed; any trailing elements are left for the caller.
/// The caller is responsible for checking <see cref="Avx2.IsSupported"/>.
/// </summary>
/// <param name="channel0">Plane stored in byte 0 of each pixel.</param>
/// <param name="channel1">Plane stored in byte 1 of each pixel.</param>
/// <param name="channel2">Plane stored in byte 2 of each pixel.</param>
/// <param name="dest">Packed output, 4 bytes per input element.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void PackBytesToUInt32SaturateChannel4(
    ReadOnlySpan<byte> channel0,
    ReadOnlySpan<byte> channel1,
    ReadOnlySpan<byte> channel2,
    Span<byte> dest)
{
    // Every iteration reads one vector from each plane and writes four vectors to dest.
    // Clamp the iteration count to what all four spans can actually supply so that a short
    // plane or a destination that is not a whole multiple of four vectors is never overrun.
    int elements = Math.Min(
        Math.Min(channel0.Length, channel1.Length),
        Math.Min(channel2.Length, dest.Length / 4));
    int n = elements / Vector256<byte>.Count;

    ref Vector256<byte> source0Base =
        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel0));
    ref Vector256<byte> source1Base =
        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel1));
    ref Vector256<byte> source2Base =
        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel2));

    ref Vector256<byte> destBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));

    // Comparing a register with itself sets every bit: the constant 0xFF fourth channel.
    Vector256<byte> allOnes = Avx2.CompareEqual(Vector256<byte>.Zero, Vector256<byte>.Zero);

    for (int i = 0; i < n; i++)
    {
        Vector256<byte> s0 = Unsafe.Add(ref source0Base, i);
        Vector256<byte> s1 = Unsafe.Add(ref source1Base, i);
        Vector256<byte> s2 = Unsafe.Add(ref source2Base, i);

        // The unpack instructions work on each 128-bit lane separately. Swapping the two
        // middle quadwords beforehand makes the lane-wise unpacks produce elements in order.
        s0 = Avx2.Permute4x64(s0.AsUInt64(), 0b_11_01_10_00).AsByte();
        s1 = Avx2.Permute4x64(s1.AsUInt64(), 0b_11_01_10_00).AsByte();
        s2 = Avx2.Permute4x64(s2.AsUInt64(), 0b_11_01_10_00).AsByte();

        // Byte interleave: (channel0, channel1) pairs for elements 0..15 and 16..31.
        Vector256<ushort> s01Lo = Avx2.UnpackLow(s0, s1).AsUInt16();
        Vector256<ushort> s01Hi = Avx2.UnpackHigh(s0, s1).AsUInt16();

        s01Lo = Avx2.Permute4x64(s01Lo.AsUInt64(), 0b_11_01_10_00).AsUInt16();
        s01Hi = Avx2.Permute4x64(s01Hi.AsUInt64(), 0b_11_01_10_00).AsUInt16();

        // Byte interleave: (channel2, 0xFF) pairs for the same elements.
        Vector256<ushort> s23Lo = Avx2.UnpackLow(s2, allOnes).AsUInt16();
        Vector256<ushort> s23Hi = Avx2.UnpackHigh(s2, allOnes).AsUInt16();

        s23Lo = Avx2.Permute4x64(s23Lo.AsUInt64(), 0b_11_01_10_00).AsUInt16();
        s23Hi = Avx2.Permute4x64(s23Hi.AsUInt64(), 0b_11_01_10_00).AsUInt16();

        // 16-bit interleave of the two pair streams yields complete 4-byte pixels:
        // b0 = elements 0..7, b1 = 8..15, b2 = 16..23, b3 = 24..31.
        Vector256<byte> b0 = Avx2.UnpackLow(s01Lo, s23Lo).AsByte();
        Vector256<byte> b1 = Avx2.UnpackHigh(s01Lo, s23Lo).AsByte();
        Vector256<byte> b2 = Avx2.UnpackLow(s01Hi, s23Hi).AsByte();
        Vector256<byte> b3 = Avx2.UnpackHigh(s01Hi, s23Hi).AsByte();

        int j = i * 4;
        Unsafe.Add(ref destBase, j) = b0;
        Unsafe.Add(ref destBase, j + 1) = b1;
        Unsafe.Add(ref destBase, j + 2) = b2;
        Unsafe.Add(ref destBase, j + 3) = b3;
    }
}
|
|
||
/// <summary>
/// Packs the longest prefix of the three planes whose length is a multiple of
/// <see cref="Vector256{T}.Count"/> into 3-byte pixels using the AVX2 kernel, then advances
/// all four spans past the processed data so the caller can finish the tail.
/// Leaves the spans untouched when AVX2 is not supported or the prefix is empty.
/// </summary>
/// <param name="channel0">Plane stored in byte 0 of each pixel; on return, the unprocessed tail.</param>
/// <param name="channel1">Plane stored in byte 1 of each pixel; on return, the unprocessed tail.</param>
/// <param name="channel2">Plane stored in byte 2 of each pixel; on return, the unprocessed tail.</param>
/// <param name="dest">
/// Packed output, 3 bytes per input element; on return, the region that still has to be written.
/// </param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void PackBytesToUInt24Reduce(
    ref ReadOnlySpan<byte> channel0,
    ref ReadOnlySpan<byte> channel1,
    ref ReadOnlySpan<byte> channel2,
    ref Span<byte> dest)
{
    // The kernel emits three destination bytes per source element, so the destination
    // has to be three times as long as a plane (not the same length).
    DebugGuard.IsTrue(channel1.Length == channel0.Length, nameof(channel1), "Input spans must be of same length!");
    DebugGuard.IsTrue(channel2.Length == channel0.Length, nameof(channel2), "Input spans must be of same length!");
    DebugGuard.MustBeGreaterThanOrEqualTo(dest.Length, channel0.Length * 3, nameof(dest));

    if (Avx2.IsSupported)
    {
        int remainder = ImageMaths.ModuloP2(channel0.Length, Vector256<byte>.Count);
        int adjustedCount = channel0.Length - remainder;

        if (adjustedCount > 0)
        {
            int packedCount = adjustedCount * 3;

            // Process the vector-aligned prefix first ...
            PackBytesToUInt24(
                channel0.Slice(0, adjustedCount),
                channel1.Slice(0, adjustedCount),
                channel2.Slice(0, adjustedCount),
                dest.Slice(0, packedCount));

            // ... and only then hand the unprocessed tail back to the caller.
            channel0 = channel0.Slice(adjustedCount);
            channel1 = channel1.Slice(adjustedCount);
            channel2 = channel2.Slice(adjustedCount);
            dest = dest.Slice(packedCount);
        }
    }
}
|
|
||
/// <summary>
/// AVX2 kernel that interleaves three byte planes into 3-byte pixels:
/// for every processed index <c>k</c>, <c>dest[3k] = channel0[k]</c>, <c>dest[3k + 1] = channel1[k]</c>
/// and <c>dest[3k + 2] = channel2[k]</c>.
/// Only whole blocks of <see cref="Vector256{T}.Count"/> elements that fit in every span are
/// processed; any trailing elements are left for the caller.
/// The caller is responsible for checking <see cref="Avx2.IsSupported"/>.
/// </summary>
/// <param name="channel0">Plane stored in byte 0 of each pixel.</param>
/// <param name="channel1">Plane stored in byte 1 of each pixel.</param>
/// <param name="channel2">Plane stored in byte 2 of each pixel.</param>
/// <param name="dest">Packed output, 3 bytes per input element.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void PackBytesToUInt24(
    ReadOnlySpan<byte> channel0,
    ReadOnlySpan<byte> channel1,
    ReadOnlySpan<byte> channel2,
    Span<byte> dest)
{
    // Every iteration reads one vector from each plane and writes three vectors to dest.
    // Source and destination lengths therefore differ by a factor of three, so an
    // "equal length" check cannot be used here; clamp the iteration count to what all
    // four spans can actually supply instead.
    int elements = Math.Min(
        Math.Min(channel0.Length, channel1.Length),
        Math.Min(channel2.Length, dest.Length / 3));
    int n = elements / Vector256<byte>.Count;

    ref Vector256<byte> source0Base =
        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel0));
    ref Vector256<byte> source1Base =
        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel1));
    ref Vector256<byte> source2Base =
        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel2));

    ref Vector256<byte> destBase =
        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));

    // Byte-shuffle masks. The shuffle selects within each 128-bit lane; -1 (high bit set)
    // produces a zero byte, so the three shuffled planes can simply be OR-ed together.
    // Mask0 builds output bytes 0..31, Mask1 bytes 32..63 and Mask2 bytes 64..95 of a block.
    Vector256<byte> s0Mask0 = Vector256.Create(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1).AsByte();
    Vector256<byte> s0Mask1 = Vector256.Create(-1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5).AsByte();
    Vector256<byte> s0Mask2 = Vector256.Create(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1).AsByte();

    Vector256<byte> s1Mask0 = Vector256.Create(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10).AsByte();
    Vector256<byte> s1Mask1 = Vector256.Create(-1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1).AsByte();
    Vector256<byte> s1Mask2 = Vector256.Create(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1).AsByte();

    Vector256<byte> s2Mask0 = Vector256.Create(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1).AsByte();
    Vector256<byte> s2Mask1 = Vector256.Create(10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1).AsByte();
    Vector256<byte> s2Mask2 = Vector256.Create(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15).AsByte();

    for (int i = 0; i < n; i++)
    {
        Vector256<byte> s0 = Unsafe.Add(ref source0Base, i);
        Vector256<byte> s1 = Unsafe.Add(ref source1Base, i);
        Vector256<byte> s2 = Unsafe.Add(ref source2Base, i);

        // First output vector only needs elements 0..10: broadcast the low lane to both lanes.
        Vector256<byte> loS0 = Avx2.Permute2x128(s0, s0, 0);
        Vector256<byte> loS1 = Avx2.Permute2x128(s1, s1, 0);
        Vector256<byte> loS2 = Avx2.Permute2x128(s2, s2, 0);

        Vector256<byte> b0 = Avx2.Shuffle(loS0, s0Mask0);
        b0 = Avx2.Or(b0, Avx2.Shuffle(loS1, s1Mask0));
        b0 = Avx2.Or(b0, Avx2.Shuffle(loS2, s2Mask0));

        // Second output vector straddles both lanes (elements 10..21): use the inputs as loaded.
        Vector256<byte> b1 = Avx2.Shuffle(s0, s0Mask1);
        b1 = Avx2.Or(b1, Avx2.Shuffle(s1, s1Mask1));
        b1 = Avx2.Or(b1, Avx2.Shuffle(s2, s2Mask1));

        // Third output vector only needs elements 21..31: broadcast the high lane to both lanes.
        Vector256<byte> hiS0 = Avx2.Permute2x128(s0, s0, 0b_0001_0001);
        Vector256<byte> hiS1 = Avx2.Permute2x128(s1, s1, 0b_0001_0001);
        Vector256<byte> hiS2 = Avx2.Permute2x128(s2, s2, 0b_0001_0001);

        Vector256<byte> b2 = Avx2.Shuffle(hiS0, s0Mask2);
        b2 = Avx2.Or(b2, Avx2.Shuffle(hiS1, s1Mask2));
        b2 = Avx2.Or(b2, Avx2.Shuffle(hiS2, s2Mask2));

        int j = i * 3;
        Unsafe.Add(ref destBase, j + 0) = b0;
        Unsafe.Add(ref destBase, j + 1) = b1;
        Unsafe.Add(ref destBase, j + 2) = b2;
    }
}
|
|
||
| [MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
| private static Vector256<int> ConvertToInt32(Vector256<float> vf, Vector256<float> scale) | ||
| { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -112,6 +112,103 @@ internal static void NormalizedFloatToByteSaturate(ReadOnlySpan<float> source, S | |
| } | ||
| } | ||
|
|
||
/// <summary>
/// Interleaves three byte planes into 4-byte pixels, filling the fourth byte of every pixel
/// with 0xFF. When runtime intrinsics are compiled in, the vector-aligned prefix is handed
/// to the AVX2 implementation; whatever is left is packed by the scalar remainder routine.
/// </summary>
/// <param name="channel0">Plane stored in byte 0 of each pixel.</param>
/// <param name="channel1">Plane stored in byte 1 of each pixel.</param>
/// <param name="channel2">Plane stored in byte 2 of each pixel.</param>
/// <param name="dest">Packed output; needs room for 4 bytes per input element.</param>
internal static void PackBytesToUInt32SaturateChannel4(
    ReadOnlySpan<byte> channel0,
    ReadOnlySpan<byte> channel1,
    ReadOnlySpan<byte> channel2,
    Span<byte> dest)
{
    // Four output bytes are produced per input element, so the destination must be four
    // times as long as a plane; this matches the guard in the remainder routine.
    DebugGuard.IsTrue(channel1.Length == channel0.Length, nameof(channel1), "Input spans must be of same length!");
    DebugGuard.IsTrue(channel2.Length == channel0.Length, nameof(channel2), "Input spans must be of same length!");
    DebugGuard.MustBeGreaterThanOrEqualTo(dest.Length, channel0.Length * 4, nameof(dest));

#if SUPPORTS_RUNTIME_INTRINSICS
    Avx2Intrinsics.PackBytesToUInt32SaturateChannel4Reduce(ref channel0, ref channel1, ref channel2, ref dest);
#endif

    // There is no efficient way to express this operation with Vector<T> or Vector4
    // (confirmed during review), so targets without the AVX2 path use the scalar loop only.

    // Deal with the remainder:
    if (channel0.Length > 0)
    {
        PackBytesToUInt32SaturateChannel4Remainder(channel0, channel1, channel2, dest);
    }
}
|
|
||
/// <summary>
/// Scalar fallback that interleaves three byte planes into 4-byte pixels:
/// <c>dest[4i] = channel0[i]</c>, <c>dest[4i + 1] = channel1[i]</c>,
/// <c>dest[4i + 2] = channel2[i]</c> and <c>dest[4i + 3] = 0xFF</c>.
/// </summary>
/// <param name="channel0">Plane stored in byte 0 of each pixel; its length sets the element count.</param>
/// <param name="channel1">Plane stored in byte 1 of each pixel.</param>
/// <param name="channel2">Plane stored in byte 2 of each pixel.</param>
/// <param name="dest">Packed output; needs room for 4 bytes per input element.</param>
private static void PackBytesToUInt32SaturateChannel4Remainder(
    ReadOnlySpan<byte> channel0,
    ReadOnlySpan<byte> channel1,
    ReadOnlySpan<byte> channel2,
    Span<byte> dest)
{
    DebugGuard.MustBeGreaterThanOrEqualTo(dest.Length, channel0.Length * 4, nameof(dest));

    ref byte s0Base = ref MemoryMarshal.GetReference(channel0);
    ref byte s1Base = ref MemoryMarshal.GetReference(channel1);
    ref byte s2Base = ref MemoryMarshal.GetReference(channel2);
    ref byte dBase = ref MemoryMarshal.GetReference(dest);

    // Iterate over the source elements (not the destination bytes): each element expands
    // to four destination bytes, so bounding by dest.Length would run off both buffers.
    for (int i = 0, j = 0; i < channel0.Length; i += 1, j += 4)
    {
        Unsafe.Add(ref dBase, j) = Unsafe.Add(ref s0Base, i);
        Unsafe.Add(ref dBase, j + 1) = Unsafe.Add(ref s1Base, i);
        Unsafe.Add(ref dBase, j + 2) = Unsafe.Add(ref s2Base, i);

        // The fourth byte is the constant, fully set channel; it must not overwrite channel 2.
        Unsafe.Add(ref dBase, j + 3) = 0xFF;
    }
}
|
|
||
/// <summary>
/// Interleaves three byte planes into 3-byte pixels. When runtime intrinsics are compiled in,
/// the vector-aligned prefix is handed to the AVX2 implementation; whatever is left is packed
/// by the scalar remainder routine.
/// </summary>
/// <param name="channel0">Plane stored in byte 0 of each pixel.</param>
/// <param name="channel1">Plane stored in byte 1 of each pixel.</param>
/// <param name="channel2">Plane stored in byte 2 of each pixel.</param>
/// <param name="dest">Packed output; needs room for 3 bytes per input element.</param>
internal static void PackBytesToUInt24(
    ReadOnlySpan<byte> channel0,
    ReadOnlySpan<byte> channel1,
    ReadOnlySpan<byte> channel2,
    Span<byte> dest)
{
    // Three output bytes are produced per input element, so the destination must be three
    // times as long as a plane; this matches the guard in the remainder routine.
    DebugGuard.IsTrue(channel1.Length == channel0.Length, nameof(channel1), "Input spans must be of same length!");
    DebugGuard.IsTrue(channel2.Length == channel0.Length, nameof(channel2), "Input spans must be of same length!");
    DebugGuard.MustBeGreaterThanOrEqualTo(dest.Length, channel0.Length * 3, nameof(dest));

#if SUPPORTS_RUNTIME_INTRINSICS
    Avx2Intrinsics.PackBytesToUInt24Reduce(ref channel0, ref channel1, ref channel2, ref dest);
#endif

    // No efficient Vector<T> or Vector4 formulation is implemented for this operation,
    // so targets without the AVX2 path use the scalar loop only.

    // Deal with the remainder. This must call the scalar routine: calling this method
    // again would recurse without ever making progress.
    if (channel0.Length > 0)
    {
        PackBytesToUInt24Remainder(channel0, channel1, channel2, dest);
    }
}
|
|
||
/// <summary>
/// Scalar fallback that interleaves three byte planes into 3-byte pixels:
/// <c>dest[3i] = channel0[i]</c>, <c>dest[3i + 1] = channel1[i]</c> and
/// <c>dest[3i + 2] = channel2[i]</c>.
/// </summary>
/// <param name="channel0">Plane stored in byte 0 of each pixel; its length sets the element count.</param>
/// <param name="channel1">Plane stored in byte 1 of each pixel.</param>
/// <param name="channel2">Plane stored in byte 2 of each pixel.</param>
/// <param name="dest">Packed output; needs room for 3 bytes per input element.</param>
private static void PackBytesToUInt24Remainder(
    ReadOnlySpan<byte> channel0,
    ReadOnlySpan<byte> channel1,
    ReadOnlySpan<byte> channel2,
    Span<byte> dest)
{
    DebugGuard.MustBeGreaterThanOrEqualTo(dest.Length, channel0.Length * 3, nameof(dest));

    ref byte s0Base = ref MemoryMarshal.GetReference(channel0);
    ref byte s1Base = ref MemoryMarshal.GetReference(channel1);
    ref byte s2Base = ref MemoryMarshal.GetReference(channel2);
    ref byte dBase = ref MemoryMarshal.GetReference(dest);

    // Iterate over the source elements (not the destination bytes): each element expands
    // to three destination bytes, so bounding by dest.Length would run off both buffers.
    for (int i = 0, j = 0; i < channel0.Length; i += 1, j += 3)
    {
        Unsafe.Add(ref dBase, j) = Unsafe.Add(ref s0Base, i);
        Unsafe.Add(ref dBase, j + 1) = Unsafe.Add(ref s1Base, i);
        Unsafe.Add(ref dBase, j + 2) = Unsafe.Add(ref s2Base, i);
    }
}
|
|
||
| [MethodImpl(InliningOptions.ColdPath)] | ||
| private static void ConvertByteToNormalizedFloatRemainder(ReadOnlySpan<byte> source, Span<float> dest) | ||
| { | ||
|
|
@@ -176,6 +273,16 @@ private static void VerifySpanInput(ReadOnlySpan<byte> source, Span<float> dest, | |
| $"length should be divisible by {shouldBeDivisibleBy}!"); | ||
| } | ||
|
|
||
/// <summary>
/// Debug-only sanity check for byte-to-byte span operations: asserts that both spans have
/// the same length and that <c>ImageMaths.ModuloP2(dest.Length, shouldBeDivisibleBy)</c> is zero.
/// Calls are compiled out unless the DEBUG symbol is defined.
/// </summary>
/// <param name="source">The input span.</param>
/// <param name="dest">The output span.</param>
/// <param name="shouldBeDivisibleBy">The divisor the span length is checked against.</param>
[Conditional("DEBUG")]
private static void VerifySpanInput(ReadOnlySpan<byte> source, Span<byte> dest, int shouldBeDivisibleBy)
{
    bool sameLength = source.Length == dest.Length;
    DebugGuard.IsTrue(sameLength, nameof(source), "Input spans must be of same length!");

    int leftover = ImageMaths.ModuloP2(dest.Length, shouldBeDivisibleBy);
    DebugGuard.IsTrue(leftover == 0, nameof(source), $"length should be divisible by {shouldBeDivisibleBy}!");
}
|
|
||
| [Conditional("DEBUG")] | ||
| private static void VerifySpanInput(ReadOnlySpan<float> source, Span<byte> dest, int shouldBeDivisibleBy) | ||
| { | ||
|
|
||

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't know what name is best for this, but there's no saturation happening here because the input is already `byte`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
By "saturate channel 4" I just meant "set all bits to true". Not sure why I chose that wording.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, I get it. I suck at names, so feel free to ignore my feedback on anything naming related 😆