Skip to content
66 changes: 53 additions & 13 deletions src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,63 @@ internal static class LossyUtils
private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte();
#endif

// Note: method name in libwebp reference implementation is called VP8SSE16x16.
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Sse16X16(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 16);
public static int Vp8_Sse16X16(Span<byte> a, Span<byte> b) => Vp8_SseNxN(a, b, 16, 16);

// Note: method name in libwebp reference implementation is called VP8SSE16x8.
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Sse16X8(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 8);
public static int Vp8_Sse16X8(Span<byte> a, Span<byte> b) => Vp8_SseNxN(a, b, 16, 8);

// Note: method name in libwebp reference implementation is called VP8SSE4x4.
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Sse4X4(Span<byte> a, Span<byte> b) => GetSse(a, b, 4, 4);
public static int Vp8_Sse4X4(Span<byte> a, Span<byte> b)
{
#if SUPPORTS_RUNTIME_INTRINSICS
if (Sse2.IsSupported)
{
// Load values.
ref byte aRef = ref MemoryMarshal.GetReference(a);
Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref aRef);
Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps));
Vector128<byte> a2 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 2));
Vector128<byte> a3 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 3));
ref byte bRef = ref MemoryMarshal.GetReference(b);
Vector128<byte> b0 = Unsafe.As<byte, Vector128<byte>>(ref bRef);
Vector128<byte> b1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps));
Vector128<byte> b2 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 2));
Vector128<byte> b3 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 3));

// Combine pair of lines.
Vector128<int> a01 = Sse2.UnpackLow(a0.AsInt32(), a1.AsInt32());
Vector128<int> a23 = Sse2.UnpackLow(a2.AsInt32(), a3.AsInt32());
Vector128<int> b01 = Sse2.UnpackLow(b0.AsInt32(), b1.AsInt32());
Vector128<int> b23 = Sse2.UnpackLow(b2.AsInt32(), b3.AsInt32());

// Convert to 16b.
Vector128<byte> a01s = Sse2.UnpackLow(a01.AsByte(), Vector128<byte>.Zero);
Vector128<byte> a23s = Sse2.UnpackLow(a23.AsByte(), Vector128<byte>.Zero);
Vector128<byte> b01s = Sse2.UnpackLow(b01.AsByte(), Vector128<byte>.Zero);
Vector128<byte> b23s = Sse2.UnpackLow(b23.AsByte(), Vector128<byte>.Zero);

// subtract, square and accumulate.
Vector128<byte> d0 = Sse2.SubtractSaturate(a01s, b01s);
Vector128<byte> d1 = Sse2.SubtractSaturate(a23s, b23s);
Vector128<int> e0 = Sse2.MultiplyAddAdjacent(d0.AsInt16(), d0.AsInt16());
Vector128<int> e1 = Sse2.MultiplyAddAdjacent(d1.AsInt16(), d1.AsInt16());
Vector128<int> sum = Sse2.Add(e0, e1);

return Numerics.ReduceSum(sum);
}
else
#endif
{
return Vp8_SseNxN(a, b, 4, 4);
}
}

[MethodImpl(InliningOptions.ShortMethod)]
public static int GetSse(Span<byte> a, Span<byte> b, int w, int h)
public static int Vp8_SseNxN(Span<byte> a, Span<byte> b, int w, int h)
{
int count = 0;
int aOffset = 0;
Expand Down Expand Up @@ -88,7 +134,7 @@ public static int Vp8Disto4X4(Span<byte> a, Span<byte> b, Span<ushort> w, Span<i
#if SUPPORTS_RUNTIME_INTRINSICS
if (Sse41.IsSupported)
{
int diffSum = TTransformSse41(a, b, w, scratch);
int diffSum = TTransformSse41(a, b, w);
return Math.Abs(diffSum) >> 5;
}
else
Expand Down Expand Up @@ -615,11 +661,8 @@ public static int TTransform(Span<byte> input, Span<ushort> w, Span<int> scratch
/// Returns the weighted sum of the absolute value of transformed coefficients.
/// w[] contains a row-major 4 by 4 symmetric matrix.
/// </summary>
public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ushort> w, Span<int> scratch)
public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ushort> w)
{
Span<int> sum = scratch.Slice(0, 4);
sum.Clear();

// Load and combine inputs.
Vector128<byte> ina0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA));
Vector128<byte> ina1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps, 16)));
Expand Down Expand Up @@ -724,9 +767,7 @@ public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ush
// difference of weighted sums.
Vector128<int> result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32());

ref int outputRef = ref MemoryMarshal.GetReference(sum);
Unsafe.As<int, Vector128<int>>(ref outputRef) = result.AsInt32();
return sum[3] + sum[2] + sum[1] + sum[0];
return Numerics.ReduceSum(result);
}
#endif

Expand All @@ -739,7 +780,6 @@ public static void TransformTwo(Span<short> src, Span<byte> dst, Span<int> scrat
public static void TransformOne(Span<short> src, Span<byte> dst, Span<int> scratch)
{
Span<int> tmp = scratch.Slice(0, 16);
tmp.Clear();
int tmpOffset = 0;
for (int srcOffset = 0; srcOffset < 4; srcOffset++)
{
Expand Down
16 changes: 6 additions & 10 deletions src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ public static void PickBestIntra16(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Se
rdCur.Nz = (uint)ReconstructIntra16(it, dqm, rdCur, tmpDst, mode);

// Measure RD-score.
rdCur.D = LossyUtils.Vp8Sse16X16(src, tmpDst);
rdCur.D = LossyUtils.Vp8_Sse16X16(src, tmpDst);
rdCur.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto16X16(src, tmpDst, WeightY, scratch)) : 0;
rdCur.H = WebpConstants.Vp8FixedCostsI16[mode];
rdCur.R = it.GetCostLuma16(rdCur, proba, res);
Expand Down Expand Up @@ -160,7 +160,7 @@ public static bool PickBestIntra4(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Seg
rdTmp.Nz = (uint)ReconstructIntra4(it, dqm, tmpLevels, src, tmpDst, mode);

// Compute RD-score.
rdTmp.D = LossyUtils.Vp8Sse4X4(src, tmpDst);
rdTmp.D = LossyUtils.Vp8_Sse4X4(src, tmpDst);
rdTmp.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto4X4(src, tmpDst, WeightY, scratch)) : 0;
rdTmp.H = modeCosts[mode];

Expand Down Expand Up @@ -251,7 +251,7 @@ public static void PickBestUv(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Segment
rdUv.Nz = (uint)ReconstructUv(it, dqm, rdUv, tmpDst, mode);

// Compute RD-score
rdUv.D = LossyUtils.Vp8Sse16X8(src, tmpDst);
rdUv.D = LossyUtils.Vp8_Sse16X8(src, tmpDst);
rdUv.SD = 0; // not calling TDisto here: it tends to flatten areas.
rdUv.H = WebpConstants.Vp8FixedCostsUv[mode];
rdUv.R = it.GetCostUv(rdUv, proba, res);
Expand Down Expand Up @@ -340,8 +340,6 @@ public static int ReconstructIntra4(Vp8EncIterator it, Vp8SegmentInfo dqm, Span<
Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I4ModeOffsets[mode]);
Span<short> tmp = it.Scratch2.AsSpan(0, 16);
Span<int> scratch = it.Scratch3.AsSpan(0, 16);
tmp.Clear();
scratch.Clear();
Vp8Encoding.FTransform(src, reference, tmp, scratch);
int nz = QuantizeBlock(tmp, levels, ref dqm.Y1);
Vp8Encoding.ITransform(reference, tmp, yuvOut, false, scratch);
Expand All @@ -357,8 +355,6 @@ public static int ReconstructUv(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8ModeSc
int n;
Span<short> tmp = it.Scratch2.AsSpan(0, 8 * 16);
Span<int> scratch = it.Scratch3.AsSpan(0, 16);
tmp.Clear();
scratch.Clear();

for (n = 0; n < 8; n += 2)
{
Expand Down Expand Up @@ -411,7 +407,7 @@ public static void RefineUsingDistortion(Vp8EncIterator it, Vp8SegmentInfo[] seg
for (mode = 0; mode < WebpConstants.NumPredModes; ++mode)
{
Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I16ModeOffsets[mode]);
long score = (LossyUtils.Vp8Sse16X16(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsI16[mode] * lambdaDi16);
long score = (LossyUtils.Vp8_Sse16X16(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsI16[mode] * lambdaDi16);

if (mode > 0 && WebpConstants.Vp8FixedCostsI16[mode] > bitLimit)
{
Expand Down Expand Up @@ -458,7 +454,7 @@ public static void RefineUsingDistortion(Vp8EncIterator it, Vp8SegmentInfo[] seg
for (mode = 0; mode < WebpConstants.NumBModes; ++mode)
{
Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I4ModeOffsets[mode]);
long score = (LossyUtils.Vp8Sse4X4(src, reference) * WebpConstants.RdDistoMult) + (modeCosts[mode] * lambdaDi4);
long score = (LossyUtils.Vp8_Sse4X4(src, reference) * WebpConstants.RdDistoMult) + (modeCosts[mode] * lambdaDi4);
if (score < bestI4Score)
{
bestI4Mode = mode;
Expand Down Expand Up @@ -507,7 +503,7 @@ public static void RefineUsingDistortion(Vp8EncIterator it, Vp8SegmentInfo[] seg
for (mode = 0; mode < WebpConstants.NumPredModes; ++mode)
{
Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8UvModeOffsets[mode]);
long score = (LossyUtils.Vp8Sse16X8(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsUv[mode] * lambdaDuv);
long score = (LossyUtils.Vp8_Sse16X8(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsUv[mode] * lambdaDuv);
if (score < bestUvScore)
{
bestMode = mode;
Expand Down
3 changes: 0 additions & 3 deletions src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ public static void ITransformOne(Span<byte> reference, Span<short> input, Span<b
{
int i;
Span<int> tmp = scratch.Slice(0, 16);
tmp.Clear();
for (i = 0; i < 4; i++)
{
// vertical pass.
Expand Down Expand Up @@ -124,7 +123,6 @@ public static void FTransform(Span<byte> src, Span<byte> reference, Span<short>
{
int i;
Span<int> tmp = scratch.Slice(0, 16);
tmp.Clear();

int srcIdx = 0;
int refIdx = 0;
Expand Down Expand Up @@ -163,7 +161,6 @@ public static void FTransform(Span<byte> src, Span<byte> reference, Span<short>
public static void FTransformWht(Span<short> input, Span<short> output, Span<int> scratch)
{
Span<int> tmp = scratch.Slice(0, 16);
tmp.Clear();

int i;
int inputIdx = 0;
Expand Down
1 change: 0 additions & 1 deletion src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ public void CollectHistogram(Span<byte> reference, Span<byte> pred, int startBlo
this.distribution.AsSpan().Clear();
for (j = startBlock; j < endBlock; j++)
{
this.output.AsSpan().Clear();
this.Vp8FTransform(reference.Slice(WebpLookupTables.Vp8DspScan[j]), pred.Slice(WebpLookupTables.Vp8DspScan[j]), this.output);

// Convert coefficients to bin.
Expand Down
17 changes: 5 additions & 12 deletions src/ImageSharp/Formats/Webp/Lossy/Vp8ModeScore.cs
Original file line number Diff line number Diff line change
Expand Up @@ -97,18 +97,11 @@ public Vp8ModeScore()

public void Clear()
{
this.YDcLevels.AsSpan().Clear();
this.YAcLevels.AsSpan().Clear();
this.UvLevels.AsSpan().Clear();
this.ModesI4.AsSpan().Clear();

for (int i = 0; i < 2; i++)
{
for (int j = 0; j < 3; j++)
{
this.Derr[i, j] = 0;
}
}
Array.Clear(this.YDcLevels, 0, this.YDcLevels.Length);
Array.Clear(this.YAcLevels, 0, this.YAcLevels.Length);
Array.Clear(this.UvLevels, 0, this.UvLevels.Length);
Array.Clear(this.ModesI4, 0, this.ModesI4.Length);
Array.Clear(this.Derr, 0, this.Derr.Length);
Comment on lines +100 to +104
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unrelated to the main thing in the PR, but Vp8ModeScore is also a good candidate to be made a value type with fixed buffers.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes seems reasonable, Vp8ModeScore makes up the most of the allocations during encoding, but I might try that in a different PR.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Definitely worth a look after.

}

public void InitScore()
Expand Down
38 changes: 38 additions & 0 deletions tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,35 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP
[Trait("Format", "Webp")]
public class LossyUtilsTests
{
private static void RunVp8Sse4X4Test()
{
byte[] a =
{
27, 27, 28, 29, 29, 28, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129,
129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 29, 29, 28,
28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 26,
26, 26, 26, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128,
128, 128, 128, 128, 128, 128, 128, 28, 27, 27, 26, 26, 27, 27, 28, 27, 28, 28, 29, 29, 28, 28, 27,
129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128
};

byte[] b =
{
26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 204, 26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 26, 26, 26,
26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204
};

int expected = 27;

int actual = LossyUtils.Vp8_Sse4X4(a, b);

Assert.Equal(expected, actual);
}

private static void RunMean16x4Test()
{
// arrange
Expand Down Expand Up @@ -61,13 +90,22 @@ private static void RunHadamardTransformTest()
Assert.Equal(expected, actual);
}

[Fact]
public void Vp8Sse4X4_Works() => RunVp8Sse4X4Test();

[Fact]
public void Mean16x4_Works() => RunMean16x4Test();

[Fact]
public void HadamardTransform_Works() => RunHadamardTransformTest();

#if SUPPORTS_RUNTIME_INTRINSICS
[Fact]
public void Vp8Sse4X4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll);

[Fact]
public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableHWIntrinsic);

[Fact]
public void Mean16x4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll);

Expand Down