Skip to content

Commit 544319e

Browse files
committed
ITransform now always does two transforms
1 parent b7059ae commit 544319e

File tree

3 files changed

+192
-95
lines changed

3 files changed

+192
-95
lines changed

src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,7 @@ public static int ReconstructIntra16(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8M
329329
LossyUtils.TransformWht(dcTmp, tmp, scratch);
330330
for (n = 0; n < 16; n += 2)
331331
{
332-
Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), true, scratch);
332+
Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), scratch);
333333
}
334334

335335
return nz;
@@ -342,7 +342,7 @@ public static int ReconstructIntra4(Vp8EncIterator it, Vp8SegmentInfo dqm, Span<
342342
Span<int> scratch = it.Scratch3.AsSpan(0, 16);
343343
Vp8Encoding.FTransform(src, reference, tmp, scratch);
344344
int nz = QuantizeBlock(tmp, levels, ref dqm.Y1);
345-
Vp8Encoding.ITransform(reference, tmp, yuvOut, false, scratch);
345+
Vp8Encoding.ITransformOne(reference, tmp, yuvOut, scratch);
346346

347347
return nz;
348348
}
@@ -375,7 +375,7 @@ public static int ReconstructUv(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8ModeSc
375375

376376
for (n = 0; n < 8; n += 2)
377377
{
378-
Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), true, scratch);
378+
Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), scratch);
379379
}
380380

381381
return nz << 16;

src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs

Lines changed: 187 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
using System;
55
using System.Buffers.Binary;
6-
using System.Linq;
76
using System.Runtime.CompilerServices;
87
using System.Runtime.InteropServices;
98
#if SUPPORTS_RUNTIME_INTRINSICS
@@ -16,7 +15,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
1615
/// <summary>
1716
/// Methods for encoding a VP8 frame.
1817
/// </summary>
19-
internal static unsafe class Vp8Encoding
18+
internal static class Vp8Encoding
2019
{
2120
private const int KC1 = 20091 + (1 << 16);
2221

@@ -83,8 +82,8 @@ static Vp8Encoding()
8382
}
8483

8584
// Transforms (Paragraph 14.4)
86-
// Does one or two inverse transforms.
87-
public static void ITransform(Span<byte> reference, Span<short> input, Span<byte> dst, bool doTwo, Span<int> scratch)
85+
// Does two inverse transforms.
86+
public static void ITransform(Span<byte> reference, Span<short> input, Span<byte> dst, Span<int> scratch)
8887
{
8988
#if SUPPORTS_RUNTIME_INTRINSICS
9089
if (Sse2.IsSupported)
@@ -120,23 +119,20 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
120119
// a01 a11 a21 a31 x x x x
121120
// a02 a12 a22 a32 x x x x
122121
// a03 a13 a23 a33 x x x x
123-
if (doTwo)
124-
{
125-
var inb0 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 16)), 0);
126-
var inb1 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 20)), 0);
127-
var inb2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 24)), 0);
128-
var inb3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 28)), 0);
129-
130-
in0 = Sse2.UnpackLow(in0, inb0);
131-
in1 = Sse2.UnpackLow(in1, inb1);
132-
in2 = Sse2.UnpackLow(in2, inb2);
133-
in3 = Sse2.UnpackLow(in3, inb3);
134-
135-
// a00 a10 a20 a30 b00 b10 b20 b30
136-
// a01 a11 a21 a31 b01 b11 b21 b31
137-
// a02 a12 a22 a32 b02 b12 b22 b32
138-
// a03 a13 a23 a33 b03 b13 b23 b33
139-
}
122+
var inb0 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 16)), 0);
123+
var inb1 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 20)), 0);
124+
var inb2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 24)), 0);
125+
var inb3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 28)), 0);
126+
127+
in0 = Sse2.UnpackLow(in0, inb0);
128+
in1 = Sse2.UnpackLow(in1, inb1);
129+
in2 = Sse2.UnpackLow(in2, inb2);
130+
in3 = Sse2.UnpackLow(in3, inb3);
131+
132+
// a00 a10 a20 a30 b00 b10 b20 b30
133+
// a01 a11 a21 a31 b01 b11 b21 b31
134+
// a02 a12 a22 a32 b02 b12 b22 b32
135+
// a03 a13 a23 a33 b03 b13 b23 b33
140136

141137
// Vertical pass and subsequent transpose.
142138
// First pass, c and d calculations are longer because of the "trick" multiplications.
@@ -206,22 +202,12 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
206202
Vector128<byte> ref2 = Vector128<byte>.Zero;
207203
Vector128<byte> ref3 = Vector128<byte>.Zero;
208204
ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
209-
if (doTwo)
210-
{
211-
// Load eight bytes/pixels per line.
212-
ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0).AsByte();
213-
ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte();
214-
ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte();
215-
ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte();
216-
}
217-
else
218-
{
219-
// Load four bytes/pixels per line.
220-
ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref referenceRef)).AsByte();
221-
ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte();
222-
ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte();
223-
ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte();
224-
}
205+
206+
// Load eight bytes/pixels per line.
207+
ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0).AsByte();
208+
ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte();
209+
ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte();
210+
ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte();
225211

226212
// Convert to 16b.
227213
ref0 = Sse2.UnpackLow(ref0, Vector128<byte>.Zero);
@@ -243,72 +229,183 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
243229

244230
// Get a reference to the destination buffer.
245231
ref byte outputRef = ref MemoryMarshal.GetReference(dst);
246-
if (doTwo)
247-
{
248-
// Store eight bytes/pixels per line.
249-
Unsafe.As<byte, Vector64<byte>>(ref outputRef) = ref0.GetLower();
250-
Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower();
251-
Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower();
252-
Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3.GetLower();
253-
}
254-
else
255-
{
256-
// Store four bytes/pixels per line.
257-
int output0 = Sse2.ConvertToInt32(ref0.AsInt32());
258-
int output1 = Sse2.ConvertToInt32(ref1.AsInt32());
259-
int output2 = Sse2.ConvertToInt32(ref2.AsInt32());
260-
int output3 = Sse2.ConvertToInt32(ref3.AsInt32());
261-
262-
Unsafe.As<byte, int>(ref outputRef) = output0;
263-
Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1;
264-
Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2;
265-
Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3;
266-
}
232+
233+
// Store eight bytes/pixels per line.
234+
Unsafe.As<byte, Vector64<byte>>(ref outputRef) = ref0.GetLower();
235+
Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower();
236+
Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower();
237+
Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3.GetLower();
267238
}
268239
else
269240
#endif
270241
{
271242
ITransformOne(reference, input, dst, scratch);
272-
if (doTwo)
273-
{
274-
ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch);
275-
}
243+
ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch);
276244
}
277245
}
278246

279247
public static void ITransformOne(Span<byte> reference, Span<short> input, Span<byte> dst, Span<int> scratch)
280248
{
281-
int i;
282-
Span<int> tmp = scratch.Slice(0, 16);
283-
for (i = 0; i < 4; i++)
249+
#if SUPPORTS_RUNTIME_INTRINSICS
250+
if (Sse2.IsSupported)
284251
{
285-
// vertical pass.
286-
int a = input[0] + input[8];
287-
int b = input[0] - input[8];
288-
int c = Mul(input[4], KC2) - Mul(input[12], KC1);
289-
int d = Mul(input[4], KC1) + Mul(input[12], KC2);
290-
tmp[0] = a + d;
291-
tmp[1] = b + c;
292-
tmp[2] = b - c;
293-
tmp[3] = a - d;
294-
tmp = tmp.Slice(4);
295-
input = input.Slice(1);
296-
}
252+
// Load the transform coefficients. Unlike ITransform, only one inverse
253+
// transform is done here: the upper half of each vector is set to zero by
254+
// Vector128.Create(value, 0), and only four bytes/pixels per line are
255+
// stored at the end.
256+
ref short inputRef = ref MemoryMarshal.GetReference(input);
257+
var in0 = Vector128.Create(Unsafe.As<short, long>(ref inputRef), 0);
258+
var in1 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 4)), 0);
259+
var in2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 8)), 0);
260+
var in3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 12)), 0);
297261

298-
tmp = scratch;
299-
for (i = 0; i < 4; i++)
262+
// a00 a10 a20 a30 x x x x
263+
// a01 a11 a21 a31 x x x x
264+
// a02 a12 a22 a32 x x x x
265+
// a03 a13 a23 a33 x x x x
266+
267+
// Vertical pass and subsequent transpose.
268+
// First pass, c and d calculations are longer because of the "trick" multiplications.
269+
Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16());
270+
Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16());
271+
272+
// c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
273+
Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2);
274+
Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1);
275+
Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16());
276+
Vector128<short> c4 = Sse2.Subtract(c1, c2);
277+
Vector128<short> c = Sse2.Add(c3, c4);
278+
279+
// d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
280+
Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1);
281+
Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2);
282+
Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16());
283+
Vector128<short> d4 = Sse2.Add(d1, d2);
284+
Vector128<short> d = Sse2.Add(d3, d4);
285+
286+
// Second pass.
287+
Vector128<short> tmp0 = Sse2.Add(a, d);
288+
Vector128<short> tmp1 = Sse2.Add(b, c);
289+
Vector128<short> tmp2 = Sse2.Subtract(b, c);
290+
Vector128<short> tmp3 = Sse2.Subtract(a, d);
291+
292+
// Transpose the two 4x4.
293+
LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
294+
295+
// Horizontal pass and subsequent transpose.
296+
// First pass, c and d calculations are longer because of the "trick" multiplications.
297+
Vector128<short> dc = Sse2.Add(t0.AsInt16(), Four);
298+
a = Sse2.Add(dc, t2.AsInt16());
299+
b = Sse2.Subtract(dc, t2.AsInt16());
300+
301+
// c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
302+
c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2);
303+
c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1);
304+
c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16());
305+
c4 = Sse2.Subtract(c1, c2);
306+
c = Sse2.Add(c3, c4);
307+
308+
// d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
309+
d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1);
310+
d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2);
311+
d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16());
312+
d4 = Sse2.Add(d1, d2);
313+
d = Sse2.Add(d3, d4);
314+
315+
// Second pass.
316+
tmp0 = Sse2.Add(a, d);
317+
tmp1 = Sse2.Add(b, c);
318+
tmp2 = Sse2.Subtract(b, c);
319+
tmp3 = Sse2.Subtract(a, d);
320+
Vector128<short> shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3);
321+
Vector128<short> shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3);
322+
Vector128<short> shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3);
323+
Vector128<short> shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3);
324+
325+
// Transpose the two 4x4.
326+
LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
327+
328+
// Add inverse transform to 'ref' and store.
329+
// Load the reference(s).
330+
Vector128<byte> ref0 = Vector128<byte>.Zero;
331+
Vector128<byte> ref1 = Vector128<byte>.Zero;
332+
Vector128<byte> ref2 = Vector128<byte>.Zero;
333+
Vector128<byte> ref3 = Vector128<byte>.Zero;
334+
ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
335+
336+
// Load four bytes/pixels per line.
337+
ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref referenceRef)).AsByte();
338+
ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte();
339+
ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte();
340+
ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte();
341+
342+
// Convert to 16b.
343+
ref0 = Sse2.UnpackLow(ref0, Vector128<byte>.Zero);
344+
ref1 = Sse2.UnpackLow(ref1, Vector128<byte>.Zero);
345+
ref2 = Sse2.UnpackLow(ref2, Vector128<byte>.Zero);
346+
ref3 = Sse2.UnpackLow(ref3, Vector128<byte>.Zero);
347+
348+
// Add the inverse transform(s).
349+
Vector128<short> ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16());
350+
Vector128<short> ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16());
351+
Vector128<short> ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16());
352+
Vector128<short> ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16());
353+
354+
// Unsigned saturate to 8b.
355+
ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
356+
ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
357+
ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
358+
ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);
359+
360+
// Get a reference to the destination buffer.
361+
ref byte outputRef = ref MemoryMarshal.GetReference(dst);
362+
363+
// Store four bytes/pixels per line.
364+
int output0 = Sse2.ConvertToInt32(ref0.AsInt32());
365+
int output1 = Sse2.ConvertToInt32(ref1.AsInt32());
366+
int output2 = Sse2.ConvertToInt32(ref2.AsInt32());
367+
int output3 = Sse2.ConvertToInt32(ref3.AsInt32());
368+
369+
Unsafe.As<byte, int>(ref outputRef) = output0;
370+
Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1;
371+
Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2;
372+
Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3;
373+
}
374+
else
375+
#endif
300376
{
301-
// horizontal pass.
302-
int dc = tmp[0] + 4;
303-
int a = dc + tmp[8];
304-
int b = dc - tmp[8];
305-
int c = Mul(tmp[4], KC2) - Mul(tmp[12], KC1);
306-
int d = Mul(tmp[4], KC1) + Mul(tmp[12], KC2);
307-
Store(dst, reference, 0, i, a + d);
308-
Store(dst, reference, 1, i, b + c);
309-
Store(dst, reference, 2, i, b - c);
310-
Store(dst, reference, 3, i, a - d);
311-
tmp = tmp.Slice(1);
377+
int i;
378+
Span<int> tmp = scratch.Slice(0, 16);
379+
for (i = 0; i < 4; i++)
380+
{
381+
// vertical pass.
382+
int a = input[0] + input[8];
383+
int b = input[0] - input[8];
384+
int c = Mul(input[4], KC2) - Mul(input[12], KC1);
385+
int d = Mul(input[4], KC1) + Mul(input[12], KC2);
386+
tmp[0] = a + d;
387+
tmp[1] = b + c;
388+
tmp[2] = b - c;
389+
tmp[3] = a - d;
390+
tmp = tmp.Slice(4);
391+
input = input.Slice(1);
392+
}
393+
394+
tmp = scratch;
395+
for (i = 0; i < 4; i++)
396+
{
397+
// horizontal pass.
398+
int dc = tmp[0] + 4;
399+
int a = dc + tmp[8];
400+
int b = dc - tmp[8];
401+
int c = Mul(tmp[4], KC2) - Mul(tmp[12], KC1);
402+
int d = Mul(tmp[4], KC1) + Mul(tmp[12], KC2);
403+
Store(dst, reference, 0, i, a + d);
404+
Store(dst, reference, 1, i, b + c);
405+
Store(dst, reference, 2, i, b - c);
406+
Store(dst, reference, 3, i, a - d);
407+
tmp = tmp.Slice(1);
408+
}
312409
}
313410
}
314411

0 commit comments

Comments
 (0)