-
-
Notifications
You must be signed in to change notification settings - Fork 888
Add sse2 version of inverse transform #1819
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 11 commits
6e8def1
26d3063
835ecea
5968de8
5c0b598
b6a20af
6039d2a
abcbc4c
18ecb06
6e548b5
a201e8a
b7059ae
544319e
5074ee6
16bb94f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,14 +3,20 @@ | |
|
|
||
| using System; | ||
| using System.Buffers.Binary; | ||
| using System.Linq; | ||
| using System.Runtime.CompilerServices; | ||
| using System.Runtime.InteropServices; | ||
| #if SUPPORTS_RUNTIME_INTRINSICS | ||
| using System.Runtime.Intrinsics; | ||
| using System.Runtime.Intrinsics.X86; | ||
| #endif | ||
|
|
||
| namespace SixLabors.ImageSharp.Formats.Webp.Lossy | ||
| { | ||
| /// <summary> | ||
| /// Methods for encoding a VP8 frame. | ||
| /// </summary> | ||
| internal static class Vp8Encoding | ||
| internal static unsafe class Vp8Encoding | ||
| { | ||
| private const int KC1 = 20091 + (1 << 16); | ||
|
|
||
|
|
@@ -60,6 +66,14 @@ internal static class Vp8Encoding | |
|
|
||
| public static readonly int[] Vp8I4ModeOffsets = { I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4 }; | ||
|
|
||
| #if SUPPORTS_RUNTIME_INTRINSICS | ||
| public static readonly Vector128<short> K1 = Vector128.Create((short)20091).AsInt16(); | ||
|
|
||
| public static readonly Vector128<short> K2 = Vector128.Create((short)-30068).AsInt16(); | ||
|
|
||
| public static readonly Vector128<short> Four = Vector128.Create((short)4); | ||
| #endif | ||
|
|
||
| static Vp8Encoding() | ||
| { | ||
| for (int i = -255; i <= 255 + 255; i++) | ||
|
|
@@ -68,12 +82,197 @@ static Vp8Encoding() | |
| } | ||
| } | ||
|
|
||
| // Transforms (Paragraph 14.4) | ||
| // Does one or two inverse transforms. | ||
| public static void ITransform(Span<byte> reference, Span<short> input, Span<byte> dst, bool doTwo, Span<int> scratch) | ||
| { | ||
| ITransformOne(reference, input, dst, scratch); | ||
| if (doTwo) | ||
| #if SUPPORTS_RUNTIME_INTRINSICS | ||
| if (Sse2.IsSupported) | ||
| { | ||
| // This implementation makes use of 16-bit fixed point versions of two | ||
| // multiply constants: | ||
| // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 | ||
| // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 | ||
| // | ||
| // To be able to use signed 16-bit integers, we use the following trick to | ||
| // have constants within range: | ||
| // - Associated constants are obtained by subtracting the 16-bit fixed point | ||
| // version of one: | ||
| // k = K - (1 << 16) => K = k + (1 << 16) | ||
| // K1 = 85267 => k1 = 20091 | ||
| // K2 = 35468 => k2 = -30068 | ||
| // - The multiplication of a variable by a constant become the sum of the | ||
| // variable and the multiplication of that variable by the associated | ||
| // constant: | ||
| // (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x | ||
|
|
||
| // Load and concatenate the transform coefficients (we'll do two inverse | ||
| // transforms in parallel). In the case of only one inverse transform, the | ||
| // second half of the vectors will just contain random value we'll never | ||
| // use nor store. | ||
| ref short inputRef = ref MemoryMarshal.GetReference(input); | ||
| var in0 = Vector128.Create(Unsafe.As<short, long>(ref inputRef), 0); | ||
| var in1 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 4)), 0); | ||
| var in2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 8)), 0); | ||
| var in3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 12)), 0); | ||
|
|
||
| // a00 a10 a20 a30 x x x x | ||
| // a01 a11 a21 a31 x x x x | ||
| // a02 a12 a22 a32 x x x x | ||
| // a03 a13 a23 a33 x x x x | ||
| if (doTwo) | ||
|
||
| { | ||
| var inb0 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 16)), 0); | ||
| var inb1 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 20)), 0); | ||
| var inb2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 24)), 0); | ||
| var inb3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 28)), 0); | ||
|
|
||
| in0 = Sse2.UnpackLow(in0, inb0); | ||
| in1 = Sse2.UnpackLow(in1, inb1); | ||
| in2 = Sse2.UnpackLow(in2, inb2); | ||
| in3 = Sse2.UnpackLow(in3, inb3); | ||
|
|
||
| // a00 a10 a20 a30 b00 b10 b20 b30 | ||
| // a01 a11 a21 a31 b01 b11 b21 b31 | ||
| // a02 a12 a22 a32 b02 b12 b22 b32 | ||
| // a03 a13 a23 a33 b03 b13 b23 b33 | ||
| } | ||
|
|
||
| // Vertical pass and subsequent transpose. | ||
| // First pass, c and d calculations are longer because of the "trick" multiplications. | ||
| Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); | ||
| Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); | ||
|
|
||
| // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 | ||
| Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2); | ||
| Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1); | ||
| Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); | ||
| Vector128<short> c4 = Sse2.Subtract(c1, c2); | ||
| Vector128<short> c = Sse2.Add(c3, c4); | ||
|
|
||
| // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 | ||
| Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1); | ||
| Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2); | ||
| Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); | ||
| Vector128<short> d4 = Sse2.Add(d1, d2); | ||
| Vector128<short> d = Sse2.Add(d3, d4); | ||
|
|
||
| // Second pass. | ||
| Vector128<short> tmp0 = Sse2.Add(a, d); | ||
| Vector128<short> tmp1 = Sse2.Add(b, c); | ||
| Vector128<short> tmp2 = Sse2.Subtract(b, c); | ||
| Vector128<short> tmp3 = Sse2.Subtract(a, d); | ||
|
|
||
| // Transpose the two 4x4. | ||
| LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3); | ||
|
|
||
| // Horizontal pass and subsequent transpose. | ||
| // First pass, c and d calculations are longer because of the "trick" multiplications. | ||
| Vector128<short> dc = Sse2.Add(t0.AsInt16(), Four); | ||
| a = Sse2.Add(dc, t2.AsInt16()); | ||
| b = Sse2.Subtract(dc, t2.AsInt16()); | ||
|
|
||
| // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 | ||
| c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); | ||
| c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); | ||
| c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); | ||
| c4 = Sse2.Subtract(c1, c2); | ||
| c = Sse2.Add(c3, c4); | ||
|
|
||
| // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 | ||
| d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); | ||
| d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); | ||
| d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); | ||
| d4 = Sse2.Add(d1, d2); | ||
| d = Sse2.Add(d3, d4); | ||
|
|
||
| // Second pass. | ||
| tmp0 = Sse2.Add(a, d); | ||
| tmp1 = Sse2.Add(b, c); | ||
| tmp2 = Sse2.Subtract(b, c); | ||
| tmp3 = Sse2.Subtract(a, d); | ||
| Vector128<short> shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); | ||
| Vector128<short> shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); | ||
| Vector128<short> shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); | ||
| Vector128<short> shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); | ||
|
|
||
| // Transpose the two 4x4. | ||
| LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); | ||
|
|
||
| // Add inverse transform to 'ref' and store. | ||
| // Load the reference(s). | ||
| Vector128<byte> ref0 = Vector128<byte>.Zero; | ||
| Vector128<byte> ref1 = Vector128<byte>.Zero; | ||
| Vector128<byte> ref2 = Vector128<byte>.Zero; | ||
| Vector128<byte> ref3 = Vector128<byte>.Zero; | ||
| ref byte referenceRef = ref MemoryMarshal.GetReference(reference); | ||
| if (doTwo) | ||
| { | ||
| // Load eight bytes/pixels per line. | ||
| ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0).AsByte(); | ||
| ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte(); | ||
| ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte(); | ||
| ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte(); | ||
| } | ||
| else | ||
| { | ||
| // Load four bytes/pixels per line. | ||
| ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref referenceRef)).AsByte(); | ||
| ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte(); | ||
| ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte(); | ||
| ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte(); | ||
| } | ||
|
|
||
| // Convert to 16b. | ||
| ref0 = Sse2.UnpackLow(ref0, Vector128<byte>.Zero); | ||
| ref1 = Sse2.UnpackLow(ref1, Vector128<byte>.Zero); | ||
| ref2 = Sse2.UnpackLow(ref2, Vector128<byte>.Zero); | ||
| ref3 = Sse2.UnpackLow(ref3, Vector128<byte>.Zero); | ||
|
|
||
| // Add the inverse transform(s). | ||
| Vector128<short> ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16()); | ||
| Vector128<short> ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16()); | ||
| Vector128<short> ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16()); | ||
| Vector128<short> ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16()); | ||
|
|
||
| // Unsigned saturate to 8b. | ||
| ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded); | ||
| ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded); | ||
| ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded); | ||
| ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded); | ||
|
|
||
| // Unsigned saturate to 8b. | ||
| ref byte outputRef = ref MemoryMarshal.GetReference(dst); | ||
| if (doTwo) | ||
| { | ||
| // Store eight bytes/pixels per line. | ||
| Unsafe.As<byte, Vector64<byte>>(ref outputRef) = ref0.GetLower(); | ||
| Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower(); | ||
| Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower(); | ||
| Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3.GetLower(); | ||
| } | ||
| else | ||
| { | ||
| // Store four bytes/pixels per line. | ||
| int output0 = Sse2.ConvertToInt32(ref0.AsInt32()); | ||
| int output1 = Sse2.ConvertToInt32(ref1.AsInt32()); | ||
| int output2 = Sse2.ConvertToInt32(ref2.AsInt32()); | ||
| int output3 = Sse2.ConvertToInt32(ref3.AsInt32()); | ||
|
|
||
| Unsafe.As<byte, int>(ref outputRef) = output0; | ||
| Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; | ||
| Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2; | ||
| Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3; | ||
| } | ||
| } | ||
| else | ||
| #endif | ||
| { | ||
| ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch); | ||
| ITransformOne(reference, input, dst, scratch); | ||
| if (doTwo) | ||
| { | ||
| ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch); | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.