Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 38 additions & 22 deletions src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -704,28 +704,7 @@ public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ush
// a20 a21 a22 a23 b20 b21 b22 b23
// a30 a31 a32 a33 b30 b31 b32 b33
// Transpose the two 4x4.
Vector128<short> transpose00 = Sse2.UnpackLow(b0, b1);
Vector128<short> transpose01 = Sse2.UnpackLow(b2, b3);
Vector128<short> transpose02 = Sse2.UnpackHigh(b0, b1);
Vector128<short> transpose03 = Sse2.UnpackHigh(b2, b3);

// a00 a10 a01 a11 a02 a12 a03 a13
// a20 a30 a21 a31 a22 a32 a23 a33
// b00 b10 b01 b11 b02 b12 b03 b13
// b20 b30 b21 b31 b22 b32 b23 b33
Vector128<int> transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32());
Vector128<int> transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32());
Vector128<int> transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32());
Vector128<int> transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32());

// a00 a10 a20 a30 a01 a11 a21 a31
// b00 b10 b20 b30 b01 b11 b21 b31
// a02 a12 a22 a32 a03 a13 a23 a33
// b02 b12 a22 b32 b03 b13 b23 b33
Vector128<long> output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64());
Vector128<long> output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64());
Vector128<long> output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64());
Vector128<long> output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64());
Vp8Transpose_2_4x4_16b(b0, b1, b2, b3, out Vector128<long> output0, out Vector128<long> output1, out Vector128<long> output2, out Vector128<long> output3);

// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
Expand Down Expand Up @@ -769,6 +748,43 @@ public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ush

return Numerics.ReduceSum(result);
}

// Transpose two 4x4 16b matrices horizontally stored in registers.
public static void Vp8Transpose_2_4x4_16b(Vector128<short> b0, Vector128<short> b1, Vector128<short> b2, Vector128<short> b3, out Vector128<long> output0, out Vector128<long> output1, out Vector128<long> output2, out Vector128<long> output3)
{
// Transpose the two 4x4.
// a00 a01 a02 a03 b00 b01 b02 b03
// a10 a11 a12 a13 b10 b11 b12 b13
// a20 a21 a22 a23 b20 b21 b22 b23
// a30 a31 a32 a33 b30 b31 b32 b33
Vector128<short> transpose00 = Sse2.UnpackLow(b0, b1);
Vector128<short> transpose01 = Sse2.UnpackLow(b2, b3);
Vector128<short> transpose02 = Sse2.UnpackHigh(b0, b1);
Vector128<short> transpose03 = Sse2.UnpackHigh(b2, b3);

// a00 a10 a01 a11 a02 a12 a03 a13
// a20 a30 a21 a31 a22 a32 a23 a33
// b00 b10 b01 b11 b02 b12 b03 b13
// b20 b30 b21 b31 b22 b32 b23 b33
Vector128<int> transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32());
Vector128<int> transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32());
Vector128<int> transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32());
Vector128<int> transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32());

// a00 a10 a20 a30 a01 a11 a21 a31
// b00 b10 b20 b30 b01 b11 b21 b31
// a02 a12 a22 a32 a03 a13 a23 a33
// b02 b12 a22 b32 b03 b13 b23 b33
output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64());
output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64());
output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64());
output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64());

// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
// a02 a12 a22 a32 b02 b12 b22 b32
// a03 a13 a23 a33 b03 b13 b23 b33
}
#endif

public static void TransformTwo(Span<short> src, Span<byte> dst, Span<int> scratch)
Expand Down
207 changes: 203 additions & 4 deletions src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,20 @@

using System;
using System.Buffers.Binary;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif

namespace SixLabors.ImageSharp.Formats.Webp.Lossy
{
/// <summary>
/// Methods for encoding a VP8 frame.
/// </summary>
internal static class Vp8Encoding
internal static unsafe class Vp8Encoding
{
private const int KC1 = 20091 + (1 << 16);

Expand Down Expand Up @@ -60,6 +66,14 @@ internal static class Vp8Encoding

public static readonly int[] Vp8I4ModeOffsets = { I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4 };

#if SUPPORTS_RUNTIME_INTRINSICS
public static readonly Vector128<short> K1 = Vector128.Create((short)20091).AsInt16();

public static readonly Vector128<short> K2 = Vector128.Create((short)-30068).AsInt16();

public static readonly Vector128<short> Four = Vector128.Create((short)4);
#endif

static Vp8Encoding()
{
for (int i = -255; i <= 255 + 255; i++)
Expand All @@ -68,12 +82,197 @@ static Vp8Encoding()
}
}

// Transforms (Paragraph 14.4)
// Does one or two inverse transforms.
public static void ITransform(Span<byte> reference, Span<short> input, Span<byte> dst, bool doTwo, Span<int> scratch)
{
ITransformOne(reference, input, dst, scratch);
if (doTwo)
#if SUPPORTS_RUNTIME_INTRINSICS
if (Sse2.IsSupported)
{
// This implementation makes use of 16-bit fixed point versions of two
// multiply constants:
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
// K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
//
// To be able to use signed 16-bit integers, we use the following trick to
// have constants within range:
// - Associated constants are obtained by subtracting the 16-bit fixed point
// version of one:
// k = K - (1 << 16) => K = k + (1 << 16)
// K1 = 85267 => k1 = 20091
// K2 = 35468 => k2 = -30068
// - The multiplication of a variable by a constant become the sum of the
// variable and the multiplication of that variable by the associated
// constant:
// (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x

// Load and concatenate the transform coefficients (we'll do two inverse
// transforms in parallel). In the case of only one inverse transform, the
// second half of the vectors will just contain random value we'll never
// use nor store.
ref short inputRef = ref MemoryMarshal.GetReference(input);
var in0 = Vector128.Create(Unsafe.As<short, long>(ref inputRef), 0);
var in1 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 4)), 0);
var in2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 8)), 0);
var in3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 12)), 0);

// a00 a10 a20 a30 x x x x
// a01 a11 a21 a31 x x x x
// a02 a12 a22 a32 x x x x
// a03 a13 a23 a33 x x x x
if (doTwo)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's better to avoid giving unnecessary extra work to the branch predictor, and refactor ITransform into two separate methods.

One hidden cost is that assembly code having branches on a runtime constant will be longer, and needs more cache lines, which are expensive to load.

Alternative "fun" solution, sort of template metaprogramming in C#, that will eliminate branches at JIT time:

ITransform<TDoTwo>(...) {
    ...
    if (typeof(TDoTwo) == typeof(DoTwo_True)) {
        ...
    }
    else {
       ...
    }
}

Can't decide if it's worth it or better to just duplicate the code, and maybe create reusable helper methods for shared bits.

{
var inb0 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 16)), 0);
var inb1 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 20)), 0);
var inb2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 24)), 0);
var inb3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 28)), 0);

in0 = Sse2.UnpackLow(in0, inb0);
in1 = Sse2.UnpackLow(in1, inb1);
in2 = Sse2.UnpackLow(in2, inb2);
in3 = Sse2.UnpackLow(in3, inb3);

// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
// a02 a12 a22 a32 b02 b12 b22 b32
// a03 a13 a23 a33 b03 b13 b23 b33
}

// Vertical pass and subsequent transpose.
// First pass, c and d calculations are longer because of the "trick" multiplications.
Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16());
Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16());

// c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2);
Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1);
Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16());
Vector128<short> c4 = Sse2.Subtract(c1, c2);
Vector128<short> c = Sse2.Add(c3, c4);

// d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1);
Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2);
Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16());
Vector128<short> d4 = Sse2.Add(d1, d2);
Vector128<short> d = Sse2.Add(d3, d4);

// Second pass.
Vector128<short> tmp0 = Sse2.Add(a, d);
Vector128<short> tmp1 = Sse2.Add(b, c);
Vector128<short> tmp2 = Sse2.Subtract(b, c);
Vector128<short> tmp3 = Sse2.Subtract(a, d);

// Transpose the two 4x4.
LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);

// Horizontal pass and subsequent transpose.
// First pass, c and d calculations are longer because of the "trick" multiplications.
Vector128<short> dc = Sse2.Add(t0.AsInt16(), Four);
a = Sse2.Add(dc, t2.AsInt16());
b = Sse2.Subtract(dc, t2.AsInt16());

// c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2);
c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1);
c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16());
c4 = Sse2.Subtract(c1, c2);
c = Sse2.Add(c3, c4);

// d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1);
d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2);
d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16());
d4 = Sse2.Add(d1, d2);
d = Sse2.Add(d3, d4);

// Second pass.
tmp0 = Sse2.Add(a, d);
tmp1 = Sse2.Add(b, c);
tmp2 = Sse2.Subtract(b, c);
tmp3 = Sse2.Subtract(a, d);
Vector128<short> shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3);
Vector128<short> shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3);
Vector128<short> shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3);
Vector128<short> shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3);

// Transpose the two 4x4.
LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);

// Add inverse transform to 'ref' and store.
// Load the reference(s).
Vector128<byte> ref0 = Vector128<byte>.Zero;
Vector128<byte> ref1 = Vector128<byte>.Zero;
Vector128<byte> ref2 = Vector128<byte>.Zero;
Vector128<byte> ref3 = Vector128<byte>.Zero;
ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
if (doTwo)
{
// Load eight bytes/pixels per line.
ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0).AsByte();
ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte();
ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte();
ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte();
}
else
{
// Load four bytes/pixels per line.
ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref referenceRef)).AsByte();
ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte();
ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte();
ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte();
}

// Convert to 16b.
ref0 = Sse2.UnpackLow(ref0, Vector128<byte>.Zero);
ref1 = Sse2.UnpackLow(ref1, Vector128<byte>.Zero);
ref2 = Sse2.UnpackLow(ref2, Vector128<byte>.Zero);
ref3 = Sse2.UnpackLow(ref3, Vector128<byte>.Zero);

// Add the inverse transform(s).
Vector128<short> ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16());
Vector128<short> ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16());
Vector128<short> ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16());
Vector128<short> ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16());

// Unsigned saturate to 8b.
ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);

// Unsigned saturate to 8b.
ref byte outputRef = ref MemoryMarshal.GetReference(dst);
if (doTwo)
{
// Store eight bytes/pixels per line.
Unsafe.As<byte, Vector64<byte>>(ref outputRef) = ref0.GetLower();
Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower();
Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower();
Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3.GetLower();
}
else
{
// Store four bytes/pixels per line.
int output0 = Sse2.ConvertToInt32(ref0.AsInt32());
int output1 = Sse2.ConvertToInt32(ref1.AsInt32());
int output2 = Sse2.ConvertToInt32(ref2.AsInt32());
int output3 = Sse2.ConvertToInt32(ref3.AsInt32());

Unsafe.As<byte, int>(ref outputRef) = output0;
Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1;
Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2;
Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3;
}
}
else
#endif
{
ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch);
ITransformOne(reference, input, dst, scratch);
if (doTwo)
{
ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch);
}
}
}

Expand Down
Loading