Skip to content
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
188 changes: 186 additions & 2 deletions src/ImageSharp/Common/Helpers/SimdUtils.Avx2Intrinsics.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) Six Labors.
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.

#if SUPPORTS_RUNTIME_INTRINSICS
Expand Down Expand Up @@ -30,7 +30,7 @@ internal static void NormalizedFloatToByteSaturateReduce(

if (Avx2.IsSupported)
{
int remainder = ImageMaths.ModuloP2(source.Length, Vector<byte>.Count);
int remainder = ImageMaths.ModuloP2(source.Length, Vector256<byte>.Count);
int adjustedCount = source.Length - remainder;

if (adjustedCount > 0)
Expand Down Expand Up @@ -91,6 +91,190 @@ internal static void NormalizedFloatToByteSaturate(
}
}

internal static void PackBytesToUInt32SaturateChannel4Reduce(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know what name is best for this, but there's no saturation happening here because the input is already byte.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

by "saturate channel 4" i just meant set all bits to true. Not sure why i chose that wording

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I get it. I suck at names, so feel free to ignore my feedback on anything naming related 😆

ref ReadOnlySpan<byte> channel0,
ref ReadOnlySpan<byte> channel1,
ref ReadOnlySpan<byte> channel2,
ref Span<byte> dest)
{
DebugGuard.IsTrue(channel0.Length == dest.Length, nameof(channel0), "Input spans must be of same length!");
DebugGuard.IsTrue(channel1.Length == dest.Length, nameof(channel1), "Input spans must be of same length!");
DebugGuard.IsTrue(channel2.Length == dest.Length, nameof(channel2), "Input spans must be of same length!");

if (Avx2.IsSupported)
{
int remainder = ImageMaths.ModuloP2(channel1.Length, Vector256<byte>.Count);
int adjustedCount = channel1.Length - remainder;

if (adjustedCount > 0)
{
channel0 = channel0.Slice(adjustedCount);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the same Slice mistake @JimBobSquarePants pointed out from the other changes. You're slicing to the remainder before calling the method that's supposed to do the main part.

channel1 = channel1.Slice(adjustedCount);
channel2 = channel2.Slice(adjustedCount);
dest = dest.Slice(adjustedCount);

PackBytesToUInt32SaturateChannel4(
channel0,
channel1,
channel2,
dest);
}
}
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void PackBytesToUInt32SaturateChannel4(
ReadOnlySpan<byte> channel0,
ReadOnlySpan<byte> channel1,
ReadOnlySpan<byte> channel2,
Span<byte> dest)
{
int n = dest.Length / Vector256<byte>.Count;

ref Vector256<byte> source0Base =
ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel0));
ref Vector256<byte> source1Base =
ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel1));
ref Vector256<byte> source2Base =
ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel2));

ref Vector256<byte> destBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));

Vector256<byte> allOnes = Avx2.CompareEqual(Vector256<byte>.Zero, Vector256<byte>.Zero);

for (int i = 0, j = 0; j < n; i += 1, j += 4)
{
Vector256<byte> s0 = Unsafe.Add(ref source0Base, i);
Vector256<byte> s1 = Unsafe.Add(ref source1Base, i);
Vector256<byte> s2 = Unsafe.Add(ref source2Base, i);

s0 = Avx2.Permute4x64(s0.AsUInt64(), 0b_11_01_10_00).AsByte();
s1 = Avx2.Permute4x64(s1.AsUInt64(), 0b_11_01_10_00).AsByte();
s2 = Avx2.Permute4x64(s2.AsUInt64(), 0b_11_01_10_00).AsByte();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Permutes are expensive, so you'll do better to just let the upper/lower lanes stay as is and fix them at the end rather than trying to keep everything in order all the way through. It may help to write the SSE2 version first and have the AVX2 code match that right up until the end when you deal with the 128-bit lanes being interleaved.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 That's actually news to me 😄 — I was always under the impression permutes were only a couple of cycles.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They're 3 cycles latency and 1 cycle throughput, but they're also only able to be scheduled on a single port, so they can easily be a bottleneck. I should have said 'relatively expensive', compared to most of the other instructions used here.

A good test to judge the impact would be to write up a benchmark for the SSE2 version of the code to see how the AVX2 version compares to that. Your goal would, of course, be 2x performance. You might be surprised what kind of impact all those permutes have in that context :)


Vector256<ushort> s01Lo = Avx2.UnpackLow(s0, s1).AsUInt16();
Vector256<ushort> s01Hi = Avx2.UnpackHigh(s0, s1).AsUInt16();

s01Lo = Avx2.Permute4x64(s01Lo.AsUInt64(), 0b_11_01_10_00).AsUInt16();
s01Hi = Avx2.Permute4x64(s01Hi.AsUInt64(), 0b_11_01_10_00).AsUInt16();

Vector256<ushort> s23Lo = Avx2.UnpackLow(s2, allOnes).AsUInt16();
Vector256<ushort> s23Hi = Avx2.UnpackHigh(s2, allOnes).AsUInt16();

s23Lo = Avx2.Permute4x64(s23Lo.AsUInt64(), 0b_11_01_10_00).AsUInt16();
s23Hi = Avx2.Permute4x64(s23Hi.AsUInt64(), 0b_11_01_10_00).AsUInt16();

Vector256<byte> b0 = Avx2.UnpackLow(s01Lo, s23Lo).AsByte();
Vector256<byte> b1 = Avx2.UnpackHigh(s01Lo, s23Lo).AsByte();
Vector256<byte> b2 = Avx2.UnpackLow(s01Hi, s23Hi).AsByte();
Vector256<byte> b3 = Avx2.UnpackHigh(s01Hi, s23Hi).AsByte();

Unsafe.Add(ref destBase, j) = b0;
Unsafe.Add(ref destBase, j + 1) = b1;
Unsafe.Add(ref destBase, j + 2) = b2;
Unsafe.Add(ref destBase, j + 3) = b3;
}
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void PackBytesToUInt24Reduce(
ref ReadOnlySpan<byte> channel0,
ref ReadOnlySpan<byte> channel1,
ref ReadOnlySpan<byte> channel2,
ref Span<byte> dest)
{
DebugGuard.IsTrue(channel0.Length == dest.Length, nameof(channel0), "Input spans must be of same length!");
DebugGuard.IsTrue(channel1.Length == dest.Length, nameof(channel1), "Input spans must be of same length!");
DebugGuard.IsTrue(channel2.Length == dest.Length, nameof(channel2), "Input spans must be of same length!");

if (Avx2.IsSupported)
{
int remainder = ImageMaths.ModuloP2(channel0.Length, Vector256<byte>.Count);
int adjustedCount = channel0.Length - remainder;

if (adjustedCount > 0)
{
channel0 = channel0.Slice(adjustedCount);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Incorrect Slice use here as well

channel1 = channel1.Slice(adjustedCount);
channel2 = channel2.Slice(adjustedCount);
dest = dest.Slice(adjustedCount);

PackBytesToUInt24(
channel0,
channel1,
channel2,
dest);
}
}
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void PackBytesToUInt24(
ReadOnlySpan<byte> channel0,
ReadOnlySpan<byte> channel1,
ReadOnlySpan<byte> channel2,
Span<byte> dest)
{
VerifySpanInput(channel0, dest, Vector256<byte>.Count);
VerifySpanInput(channel1, dest, Vector256<byte>.Count);
VerifySpanInput(channel2, dest, Vector256<byte>.Count);

int n = dest.Length / Vector256<byte>.Count;

ref Vector256<byte> source0Base =
ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel0));
ref Vector256<byte> source1Base =
ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel1));
ref Vector256<byte> source2Base =
ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(channel2));

ref Vector256<byte> destBase =
ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));

Vector256<byte> s0Mask0 = Vector256.Create(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1).AsByte();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They'll direct you to use Vector256.Create like this over in the runtime repo because they've fixed the JIT to recognize it in 5.0. It's exceedingly expensive in 3.x, though, and since 3.1 has 2.5 years of LTS left, I would personally keep using the ReadOnlySpan<byte> trick to load these from static data.

@JimBobSquarePants do you have a preference there?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, let's use the ROS trick here. We want to try to max out on all supported frameworks.

Vector256<byte> s0Mask1 = Vector256.Create(-1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5).AsByte();
Vector256<byte> s0Mask2 = Vector256.Create(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1).AsByte();

Vector256<byte> s1Mask0 = Vector256.Create(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10).AsByte();
Vector256<byte> s1Mask1 = Vector256.Create(-1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1).AsByte();
Vector256<byte> s1Mask2 = Vector256.Create(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1).AsByte();

Vector256<byte> s2Mask0 = Vector256.Create(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1).AsByte();
Vector256<byte> s2Mask1 = Vector256.Create(10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1).AsByte();
Vector256<byte> s2Mask2 = Vector256.Create(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15).AsByte();

for (int i = 0, j = 0; j < n; i += 1, j += 3)
{
Vector256<byte> s0 = Unsafe.Add(ref source0Base, i);
Vector256<byte> s1 = Unsafe.Add(ref source1Base, i);
Vector256<byte> s2 = Unsafe.Add(ref source2Base, i);

Vector256<byte> loS0 = Avx2.Permute2x128(s0, s0, 0);
Vector256<byte> loS1 = Avx2.Permute2x128(s1, s1, 0);
Vector256<byte> loS2 = Avx2.Permute2x128(s2, s2, 0);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, all these permutes and shuffles will be quite expensive. What I do for these 3 channel cases is treat them the same as 4 channel right up until the end (in this case with a dummy zero vector), and then do a single shuffle to pack the 12 good bytes out of each 16 together. That does mean overrunning the end of your buffer by 8 bytes for the AVX2 implementation, though, so you'd have to adjust your remainder/cleanup length by 8 to compensate. It's worth doing some benchmarks to see the difference if you're not clear on it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That single shuffle... How would that work? I can't figure it out without multiple permutes.

Copy link
Contributor

@saucecontrol saucecontrol Oct 25, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, assuming this one starts off the same as the 4-channel version, you'll end up with vectors that look like this:

BGRx:BGRx:BGRx:BGRx||BGRx:BGRx:BGRx:BGRx

With a shuffle, you can pack the BGR/RGB triplets together, putting the filler/alpha values at the end of each lane. Then with a permute, you can cram the 12-byte valid sections of each lane together, leaving all the dummy values in the last 8 bytes. Like such (forgive my poor ascii art skills):

BGRx:BGRx:BGRx:BGRx||BGRx:BGRx:BGRx:BGRx
vpshufb (0,1,2,4,5,6,8,9,a,c,d,e,3,7,b,f)
                   ||
                   \/
BGRB:GRBG:RBGR:xxxx||BGRB:GRBG:RBGR:xxxx
vpermd (0,1,2,4,5,6,3,7)
                   ||
                   \/
BGRB:GRBG:RBGR:BGRB||GRBG:RBGR:xxxx:xxxx

At that point, you can either write the full 32-byte vector out (but then only advance your output ref/ptr by 24 bytes), or you can write the lower lane and then movq the first half of the upper lane, depending on whether you're at the end of your out buffer or not.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, because this one requires 3 permutes to start and then 4 to end, it may be one where SSE2 is faster than AVX2. Would be worth benchmarking.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yay! I couldn't figure out the first part. Thanks!

On another note, BroadcastVector128ToVector256 should absolutely have an overload accepting a Vector128<T>.

image

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice!

The vbroadcasti128 instruction only accepts a memory operand, so to use it with Vector128<T>, it would have to spill to memory. You can, however, use Vector256<T>.Create() with the same Vector128<T> for both args, and it will emit the correct permute for you.


Vector256<byte> b0 = Avx2.Shuffle(loS0, s0Mask0);
b0 = Avx2.Or(b0, Avx2.Shuffle(loS1, s1Mask0));
b0 = Avx2.Or(b0, Avx2.Shuffle(loS2, s2Mask0));

Vector256<byte> b1 = Avx2.Shuffle(s0, s0Mask1);
b1 = Avx2.Or(b1, Avx2.Shuffle(s1, s1Mask1));
b1 = Avx2.Or(b1, Avx2.Shuffle(s2, s2Mask1));

Vector256<byte> hiS0 = Avx2.Permute2x128(s0, s0, 0b_0001_0001);
Vector256<byte> hiS1 = Avx2.Permute2x128(s1, s1, 0b_0001_0001);
Vector256<byte> hiS2 = Avx2.Permute2x128(s2, s2, 0b_0001_0001);

Vector256<byte> b2 = Avx2.Shuffle(hiS0, s0Mask2);
b2 = Avx2.Or(b2, Avx2.Shuffle(hiS1, s1Mask2));
b2 = Avx2.Or(b2, Avx2.Shuffle(hiS2, s2Mask2));

Unsafe.Add(ref destBase, j + 0) = b0;
Unsafe.Add(ref destBase, j + 1) = b1;
Unsafe.Add(ref destBase, j + 2) = b2;
}
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<int> ConvertToInt32(Vector256<float> vf, Vector256<float> scale)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ internal static void ByteToNormalizedFloatReduce(

if (adjustedCount > 0)
{
ByteToNormalizedFloat(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));
ByteToNormalizedFloat(
source.Slice(0, adjustedCount),
dest.Slice(0, adjustedCount));

source = source.Slice(adjustedCount);
dest = dest.Slice(adjustedCount);
Expand Down
107 changes: 107 additions & 0 deletions src/ImageSharp/Common/Helpers/SimdUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,103 @@ internal static void NormalizedFloatToByteSaturate(ReadOnlySpan<float> source, S
}
}

internal static void PackBytesToUInt32SaturateChannel4(
ReadOnlySpan<byte> channel0,
ReadOnlySpan<byte> channel1,
ReadOnlySpan<byte> channel2,
Span<byte> dest)
{
DebugGuard.IsTrue(channel0.Length == dest.Length, nameof(channel0), "Input spans must be of same length!");
DebugGuard.IsTrue(channel1.Length == dest.Length, nameof(channel1), "Input spans must be of same length!");
DebugGuard.IsTrue(channel2.Length == dest.Length, nameof(channel2), "Input spans must be of same length!");

#if SUPPORTS_RUNTIME_INTRINSICS
Avx2Intrinsics.PackBytesToUInt32SaturateChannel4Reduce(ref channel0, ref channel1, ref channel2, ref dest);

// I can't immediately see a way to do this operation efficiently with Vector<T> or Vector4<T>. TODO
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is none :)

#elif SUPPORTS_EXTENDED_INTRINSICS
// ExtendedIntrinsics.PackBytesToUInt32SaturateChannel4Reduce(ref channel0, ref channel1, ref channel2, ref dest);
#else
// BasicIntrinsics256.PackBytesToUInt32SaturateChannel4Reduce(ref channel0, ref channel1, ref channel2, ref dest);
#endif

// Deal with the remainder:
if (channel0.Length > 0)
{
PackBytesToUInt32SaturateChannel4Remainder(channel0, channel1, channel2, dest);
}
}

/// <summary>
/// Scalar path that interleaves three byte planes into 4-byte groups, writing
/// <c>0xFF</c> as the fourth byte of every group.
/// </summary>
/// <param name="channel0">Plane copied to byte 0 of each output group; its length drives the loop.</param>
/// <param name="channel1">Plane copied to byte 1 of each output group.</param>
/// <param name="channel2">Plane copied to byte 2 of each output group.</param>
/// <param name="dest">Output buffer; expected to hold at least <c>channel0.Length * 4</c> bytes (checked via DebugGuard only).</param>
private static void PackBytesToUInt32SaturateChannel4Remainder(
    ReadOnlySpan<byte> channel0,
    ReadOnlySpan<byte> channel1,
    ReadOnlySpan<byte> channel2,
    Span<byte> dest)
{
    DebugGuard.MustBeGreaterThanOrEqualTo(dest.Length, channel0.Length * 4, nameof(dest));

    ref byte s0Base = ref MemoryMarshal.GetReference(channel0);
    ref byte s1Base = ref MemoryMarshal.GetReference(channel1);
    ref byte s2Base = ref MemoryMarshal.GetReference(channel2);
    ref byte dBase = ref MemoryMarshal.GetReference(dest);

    // NOTE(review): channel1 and channel2 are read at the same indices as channel0 and
    // are not length-checked here - callers must pass planes of equal length.
    //
    // Iterate once per SOURCE element. The accesses below are unchecked, so bounding the
    // loop by dest.Length (4 bytes are written per iteration) would run past both the
    // source planes and the destination.
    for (int i = 0, j = 0; i < channel0.Length; i += 1, j += 4)
    {
        Unsafe.Add(ref dBase, j) = Unsafe.Add(ref s0Base, i);
        Unsafe.Add(ref dBase, j + 1) = Unsafe.Add(ref s1Base, i);
        Unsafe.Add(ref dBase, j + 2) = Unsafe.Add(ref s2Base, i);

        // Fourth byte of the group is always fully set; it must go to j + 3 so the
        // channel2 byte written just above is not overwritten.
        Unsafe.Add(ref dBase, j + 3) = 0xFF;
    }
}

internal static void PackBytesToUInt24(
ReadOnlySpan<byte> channel0,
ReadOnlySpan<byte> channel1,
ReadOnlySpan<byte> channel2,
Span<byte> dest)
{
DebugGuard.IsTrue(channel0.Length == dest.Length, nameof(channel0), "Input spans must be of same length!");
DebugGuard.IsTrue(channel1.Length == dest.Length, nameof(channel1), "Input spans must be of same length!");
DebugGuard.IsTrue(channel2.Length == dest.Length, nameof(channel2), "Input spans must be of same length!");

#if SUPPORTS_RUNTIME_INTRINSICS
Avx2Intrinsics.PackBytesToUInt24Reduce(ref channel0, ref channel1, ref channel2, ref dest);

// I can't immediately see a way to do this operation efficiently with Vector<T> or Vector4<T>. TODO
#elif SUPPORTS_EXTENDED_INTRINSICS
// ExtendedIntrinsics.PackBytesToUInt24Reduce(ref channel0, ref channel1, ref channel2, ref dest);
#else
// BasicIntrinsics256.PackBytesToUInt24Reduce(ref channel0, ref channel1, ref channel2, ref dest);
#endif

// Deal with the remainder:
if (channel0.Length > 0)
{
PackBytesToUInt24(channel0, channel1, channel2, dest);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is calling itself recursively. Should be calling the remainder method.

}
}

/// <summary>
/// Scalar path that interleaves three byte planes into tightly packed 3-byte groups.
/// </summary>
/// <param name="channel0">Plane copied to byte 0 of each output group; its length drives the loop.</param>
/// <param name="channel1">Plane copied to byte 1 of each output group.</param>
/// <param name="channel2">Plane copied to byte 2 of each output group.</param>
/// <param name="dest">Output buffer; expected to hold at least <c>channel0.Length * 3</c> bytes (checked via DebugGuard only).</param>
private static void PackBytesToUInt24Remainder(
    ReadOnlySpan<byte> channel0,
    ReadOnlySpan<byte> channel1,
    ReadOnlySpan<byte> channel2,
    Span<byte> dest)
{
    DebugGuard.MustBeGreaterThanOrEqualTo(dest.Length, channel0.Length * 3, nameof(dest));

    ref byte s0Base = ref MemoryMarshal.GetReference(channel0);
    ref byte s1Base = ref MemoryMarshal.GetReference(channel1);
    ref byte s2Base = ref MemoryMarshal.GetReference(channel2);
    ref byte dBase = ref MemoryMarshal.GetReference(dest);

    // NOTE(review): channel1 and channel2 are read at the same indices as channel0 and
    // are not length-checked here - callers must pass planes of equal length.
    //
    // Iterate once per SOURCE element. The accesses below are unchecked, so bounding the
    // loop by dest.Length (3 bytes are written per iteration) would run past both the
    // source planes and the destination.
    for (int i = 0, j = 0; i < channel0.Length; i += 1, j += 3)
    {
        Unsafe.Add(ref dBase, j) = Unsafe.Add(ref s0Base, i);
        Unsafe.Add(ref dBase, j + 1) = Unsafe.Add(ref s1Base, i);
        Unsafe.Add(ref dBase, j + 2) = Unsafe.Add(ref s2Base, i);
    }
}

[MethodImpl(InliningOptions.ColdPath)]
private static void ConvertByteToNormalizedFloatRemainder(ReadOnlySpan<byte> source, Span<float> dest)
{
Expand Down Expand Up @@ -176,6 +273,16 @@ private static void VerifySpanInput(ReadOnlySpan<byte> source, Span<float> dest,
$"length should be divisible by {shouldBeDivisibleBy}!");
}

/// <summary>
/// Debug-only sanity check for a byte-to-byte span pair: both spans must have the same
/// length, and <c>ImageMaths.ModuloP2(dest.Length, shouldBeDivisibleBy)</c> must be zero.
/// </summary>
/// <param name="source">The input span.</param>
/// <param name="dest">The output span.</param>
/// <param name="shouldBeDivisibleBy">The block size the destination length is checked against.</param>
[Conditional("DEBUG")]
private static void VerifySpanInput(ReadOnlySpan<byte> source, Span<byte> dest, int shouldBeDivisibleBy)
{
    bool lengthsMatch = source.Length == dest.Length;
    DebugGuard.IsTrue(lengthsMatch, nameof(source), "Input spans must be of same length!");

    int leftover = ImageMaths.ModuloP2(dest.Length, shouldBeDivisibleBy);
    string divisibilityMessage = $"length should be divisible by {shouldBeDivisibleBy}!";
    DebugGuard.IsTrue(leftover == 0, nameof(source), divisibilityMessage);
}

[Conditional("DEBUG")]
private static void VerifySpanInput(ReadOnlySpan<float> source, Span<byte> dest, int shouldBeDivisibleBy)
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) Six Labors.
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.

// <auto-generated />
Expand All @@ -21,7 +21,7 @@ public partial struct Rgb24
/// <summary>
/// Provides optimized overrides for bulk operations.
/// </summary>
internal class PixelOperations : PixelOperations<Rgb24>
internal partial class PixelOperations : PixelOperations<Rgb24>
{
/// <inheritdoc />
public override void FromRgb24(Configuration configuration, ReadOnlySpan<Rgb24> source, Span<Rgb24> destinationPixels)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<#@include file="_Common.ttinclude" #>
<#@include file="_Common.ttinclude" #>
<#@ output extension=".cs" #>

namespace SixLabors.ImageSharp.PixelFormats
Expand All @@ -11,7 +11,7 @@ namespace SixLabors.ImageSharp.PixelFormats
/// <summary>
/// Provides optimized overrides for bulk operations.
/// </summary>
internal class PixelOperations : PixelOperations<Rgb24>
internal partial class PixelOperations : PixelOperations<Rgb24>
{
<# GenerateAllDefaultConversionMethods("Rgb24"); #>
}
Expand Down
Loading