Skip to content
53 changes: 44 additions & 9 deletions src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
// ReSharper disable InconsistentNaming
namespace SixLabors.ImageSharp.Formats.Webp.Lossy
{
internal static unsafe class LossyUtils
internal static class LossyUtils
{
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Sse16X16(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 16);
Expand All @@ -22,7 +22,48 @@ internal static unsafe class LossyUtils
public static int Vp8Sse16X8(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 8);

[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Sse4X4(Span<byte> a, Span<byte> b) => GetSse(a, b, 4, 4);
public static int Vp8Sse4X4(Span<byte> a, Span<byte> b)
{
#if SUPPORTS_RUNTIME_INTRINSICS
if (Sse2.IsSupported)
{
// Load values.
Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(a));
Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(a.Slice(WebpConstants.Bps, 8)));
Vector128<byte> a2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(a.Slice(WebpConstants.Bps * 2, 8)));
Vector128<byte> a3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(a.Slice(WebpConstants.Bps * 3, 8)));
Copy link
Member

@antonfirsov antonfirsov Nov 9, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Slicing has some unnecessary extra costs, if input is safe we can do everything with pointer Unsafe arithmetics:

Suggested change
Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(a));
Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(a.Slice(WebpConstants.Bps, 8)));
Vector128<byte> a2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(a.Slice(WebpConstants.Bps * 2, 8)));
Vector128<byte> a3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(a.Slice(WebpConstants.Bps * 3, 8)));
ref byte aRef = ref MemoryMarshal.GetReference(a);
Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref aRef);
Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps));
Vector128<byte> a2 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 2));
Vector128<byte> a3 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 3));

Same for b.

Vector128<byte> b0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(b));
Vector128<byte> b1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(b.Slice(WebpConstants.Bps, 8)));
Vector128<byte> b2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(b.Slice(WebpConstants.Bps * 2, 8)));
Vector128<byte> b3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(b.Slice(WebpConstants.Bps * 3, 8)));

// Combine pair of lines.
Vector128<int> a01 = Sse2.UnpackLow(a0.AsInt32(), a1.AsInt32());
Vector128<int> a23 = Sse2.UnpackLow(a2.AsInt32(), a3.AsInt32());
Vector128<int> b01 = Sse2.UnpackLow(b0.AsInt32(), b1.AsInt32());
Vector128<int> b23 = Sse2.UnpackLow(b2.AsInt32(), b3.AsInt32());

// Convert to 16b.
Vector128<byte> a01s = Sse2.UnpackLow(a01.AsByte(), Vector128<byte>.Zero);
Vector128<byte> a23s = Sse2.UnpackLow(a23.AsByte(), Vector128<byte>.Zero);
Vector128<byte> b01s = Sse2.UnpackLow(b01.AsByte(), Vector128<byte>.Zero);
Vector128<byte> b23s = Sse2.UnpackLow(b23.AsByte(), Vector128<byte>.Zero);

// subtract, square and accumulate.
Vector128<byte> d0 = Sse2.SubtractSaturate(a01s, b01s);
Vector128<byte> d1 = Sse2.SubtractSaturate(a23s, b23s);
Vector128<int> e0 = Sse2.MultiplyAddAdjacent(d0.AsInt16(), d0.AsInt16());
Vector128<int> e1 = Sse2.MultiplyAddAdjacent(d1.AsInt16(), d1.AsInt16());
Vector128<int> sum = Sse2.Add(e0, e1);

return Numerics.ReduceSum(sum);
}
else
#endif
{
return GetSse(a, b, 4, 4);
}
}

[MethodImpl(InliningOptions.ShortMethod)]
public static int GetSse(Span<byte> a, Span<byte> b, int w, int h)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method name confuses me shouldn't it be Vp84X4?

Copy link
Member

@antonfirsov antonfirsov Nov 10, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Vp84X4 is confusing because of 84. Parent method can be Vp8_4x4 while this one can be Vp8_4X4Scalar. Or maybe Vp8_4x4Scalar could be inlined if it's not used anywhere else?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that the name is not ideal, but its called VP8SSE4x4 in the original libwebp source and I wanted to keep the method names to be as close to the original as possible to make it easier to follow/commpare the original code.
A better name might have been CalculateDistortion4x4, but again I am still in favor of keeping the names similar to the original.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method name confuses me shouldn't it be Vp84X4?

GetSse is not called Vp84X4 because of the variable parameters w and h.

I tried to rename the methods: 7e20c5d
Is that better (or worse)?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Much better yes 👍

Expand Down Expand Up @@ -613,9 +654,6 @@ public static int TTransform(Span<byte> input, Span<ushort> w, Span<int> scratch
/// </summary>
public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ushort> w, Span<int> scratch)
{
Span<int> sum = scratch.Slice(0, 4);
sum.Clear();

// Load and combine inputs.
Vector128<byte> ina0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA));
Vector128<byte> ina1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps, 16)));
Expand Down Expand Up @@ -720,9 +758,7 @@ public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ush
// difference of weighted sums.
Vector128<int> result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32());

ref int outputRef = ref MemoryMarshal.GetReference(sum);
Unsafe.As<int, Vector128<int>>(ref outputRef) = result.AsInt32();
return sum[3] + sum[2] + sum[1] + sum[0];
return Numerics.ReduceSum(result);
}
#endif

Expand All @@ -735,7 +771,6 @@ public static void TransformTwo(Span<short> src, Span<byte> dst, Span<int> scrat
public static void TransformOne(Span<short> src, Span<byte> dst, Span<int> scratch)
{
Span<int> tmp = scratch.Slice(0, 16);
tmp.Clear();
int tmpOffset = 0;
for (int srcOffset = 0; srcOffset < 4; srcOffset++)
{
Expand Down
17 changes: 5 additions & 12 deletions src/ImageSharp/Formats/Webp/Lossy/Vp8ModeScore.cs
Original file line number Diff line number Diff line change
Expand Up @@ -97,18 +97,11 @@ public Vp8ModeScore()

public void Clear()
{
this.YDcLevels.AsSpan().Clear();
this.YAcLevels.AsSpan().Clear();
this.UvLevels.AsSpan().Clear();
this.ModesI4.AsSpan().Clear();

for (int i = 0; i < 2; i++)
{
for (int j = 0; j < 3; j++)
{
this.Derr[i, j] = 0;
}
}
Array.Clear(this.YDcLevels, 0, this.YDcLevels.Length);
Array.Clear(this.YAcLevels, 0, this.YAcLevels.Length);
Array.Clear(this.UvLevels, 0, this.UvLevels.Length);
Array.Clear(this.ModesI4, 0, this.ModesI4.Length);
Array.Clear(this.Derr, 0, this.Derr.Length);
Comment on lines +100 to +104
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unrelated to the main thing in the PR, but Vp8ModeScore is also a good candidate to be made a value type with fixed buffers.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes seems reasonable, Vp8ModeScore makes up the most of the allocations during encoding, but I might try that in a different PR.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Definitely worth a look after.

}

public void InitScore()
Expand Down
38 changes: 38 additions & 0 deletions tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,35 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP
[Trait("Format", "Webp")]
public class LossyUtilsTests
{
private static void RunVp8Sse4X4Test()
{
byte[] a =
{
27, 27, 28, 29, 29, 28, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129,
129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 29, 29, 28,
28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 26,
26, 26, 26, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128,
128, 128, 128, 128, 128, 128, 128, 28, 27, 27, 26, 26, 27, 27, 28, 27, 28, 28, 29, 29, 28, 28, 27,
129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128
};

byte[] b =
{
26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 204, 26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 26, 26, 26,
26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204
};

int expected = 27;

int actual = LossyUtils.Vp8Sse4X4(a, b);

Assert.Equal(expected, actual);
}

private static void RunHadamardTransformTest()
{
byte[] a =
Expand Down Expand Up @@ -37,10 +66,19 @@ private static void RunHadamardTransformTest()
Assert.Equal(expected, actual);
}

[Fact]
public void Vp8Sse4X4_Works() => RunVp8Sse4X4Test();

[Fact]
public void HadamardTransform_Works() => RunHadamardTransformTest();

#if SUPPORTS_RUNTIME_INTRINSICS
[Fact]
public void Vp8Sse4X4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll);

[Fact]
public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableHWIntrinsic);

[Fact]
public void HadamardTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll);

Expand Down