ITransform now always does two transforms

brianpopow · brianpopow · commit 544319e9ea86 · 2021-11-12T13:18:41.000+01:00
diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
@@ -329,7 +329,7 @@ public static int ReconstructIntra16(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8M
             LossyUtils.TransformWht(dcTmp, tmp, scratch);
             for (n = 0; n < 16; n += 2)
             {
-                Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), true, scratch);
+                Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), scratch);
             }
 
             return nz;
@@ -342,7 +342,7 @@ public static int ReconstructIntra4(Vp8EncIterator it, Vp8SegmentInfo dqm, Span<
             Span<int> scratch = it.Scratch3.AsSpan(0, 16);
             Vp8Encoding.FTransform(src, reference, tmp, scratch);
             int nz = QuantizeBlock(tmp, levels, ref dqm.Y1);
-            Vp8Encoding.ITransform(reference, tmp, yuvOut, false, scratch);
+            Vp8Encoding.ITransformOne(reference, tmp, yuvOut, scratch);
 
             return nz;
         }
@@ -375,7 +375,7 @@ public static int ReconstructUv(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8ModeSc
 
             for (n = 0; n < 8; n += 2)
             {
-                Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), true, scratch);
+                Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), scratch);
             }
 
             return nz << 16;
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -3,7 +3,6 @@
 
 using System;
 using System.Buffers.Binary;
-using System.Linq;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 #if SUPPORTS_RUNTIME_INTRINSICS
@@ -16,7 +15,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
     /// <summary>
     /// Methods for encoding a VP8 frame.
     /// </summary>
-    internal static unsafe class Vp8Encoding
+    internal static class Vp8Encoding
     {
         private const int KC1 = 20091 + (1 << 16);
 
@@ -83,8 +82,8 @@ static Vp8Encoding()
         }
 
         // Transforms (Paragraph 14.4)
-        // Does one or two inverse transforms.
-        public static void ITransform(Span<byte> reference, Span<short> input, Span<byte> dst, bool doTwo, Span<int> scratch)
+        // Does two inverse transforms.
+        public static void ITransform(Span<byte> reference, Span<short> input, Span<byte> dst, Span<int> scratch)
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Sse2.IsSupported)
@@ -120,23 +119,20 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
                 // a01 a11 a21 a31   x x x x
                 // a02 a12 a22 a32   x x x x
                 // a03 a13 a23 a33   x x x x
-                if (doTwo)
-                {
-                    var inb0 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 16)), 0);
-                    var inb1 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 20)), 0);
-                    var inb2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 24)), 0);
-                    var inb3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 28)), 0);
-
-                    in0 = Sse2.UnpackLow(in0, inb0);
-                    in1 = Sse2.UnpackLow(in1, inb1);
-                    in2 = Sse2.UnpackLow(in2, inb2);
-                    in3 = Sse2.UnpackLow(in3, inb3);
-
-                    // a00 a10 a20 a30   b00 b10 b20 b30
-                    // a01 a11 a21 a31   b01 b11 b21 b31
-                    // a02 a12 a22 a32   b02 b12 b22 b32
-                    // a03 a13 a23 a33   b03 b13 b23 b33
-                }
+                var inb0 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 16)), 0);
+                var inb1 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 20)), 0);
+                var inb2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 24)), 0);
+                var inb3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 28)), 0);
+
+                in0 = Sse2.UnpackLow(in0, inb0);
+                in1 = Sse2.UnpackLow(in1, inb1);
+                in2 = Sse2.UnpackLow(in2, inb2);
+                in3 = Sse2.UnpackLow(in3, inb3);
+
+                // a00 a10 a20 a30   b00 b10 b20 b30
+                // a01 a11 a21 a31   b01 b11 b21 b31
+                // a02 a12 a22 a32   b02 b12 b22 b32
+                // a03 a13 a23 a33   b03 b13 b23 b33
 
                 // Vertical pass and subsequent transpose.
                 // First pass, c and d calculations are longer because of the "trick" multiplications.
@@ -206,22 +202,12 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
                 Vector128<byte> ref2 = Vector128<byte>.Zero;
                 Vector128<byte> ref3 = Vector128<byte>.Zero;
                 ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
-                if (doTwo)
-                {
-                    // Load eight bytes/pixels per line.
-                    ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0).AsByte();
-                    ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte();
-                    ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte();
-                    ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte();
-                }
-                else
-                {
-                    // Load four bytes/pixels per line.
-                    ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref referenceRef)).AsByte();
-                    ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte();
-                    ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte();
-                    ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte();
-                }
+
+                // Load eight bytes/pixels per line.
+                ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0).AsByte();
+                ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte();
+                ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte();
+                ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte();
 
                 // Convert to 16b.
                 ref0 = Sse2.UnpackLow(ref0, Vector128<byte>.Zero);
@@ -243,72 +229,183 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
 
                 // Unsigned saturate to 8b.
                 ref byte outputRef = ref MemoryMarshal.GetReference(dst);
-                if (doTwo)
-                {
-                    // Store eight bytes/pixels per line.
-                    Unsafe.As<byte, Vector64<byte>>(ref outputRef) = ref0.GetLower();
-                    Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower();
-                    Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower();
-                    Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3.GetLower();
-                }
-                else
-                {
-                    // Store four bytes/pixels per line.
-                    int output0 = Sse2.ConvertToInt32(ref0.AsInt32());
-                    int output1 = Sse2.ConvertToInt32(ref1.AsInt32());
-                    int output2 = Sse2.ConvertToInt32(ref2.AsInt32());
-                    int output3 = Sse2.ConvertToInt32(ref3.AsInt32());
-
-                    Unsafe.As<byte, int>(ref outputRef) = output0;
-                    Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1;
-                    Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2;
-                    Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3;
-                }
+
+                // Store eight bytes/pixels per line.
+                Unsafe.As<byte, Vector64<byte>>(ref outputRef) = ref0.GetLower();
+                Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower();
+                Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower();
+                Unsafe.As<byte, Vector64<byte>>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3.GetLower();
             }
             else
 #endif
             {
                 ITransformOne(reference, input, dst, scratch);
-                if (doTwo)
-                {
-                    ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch);
-                }
+                ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch);
             }
         }
 
         public static void ITransformOne(Span<byte> reference, Span<short> input, Span<byte> dst, Span<int> scratch)
         {
-            int i;
-            Span<int> tmp = scratch.Slice(0, 16);
-            for (i = 0; i < 4; i++)
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse2.IsSupported)
             {
-                // vertical pass.
-                int a = input[0] + input[8];
-                int b = input[0] - input[8];
-                int c = Mul(input[4], KC2) - Mul(input[12], KC1);
-                int d = Mul(input[4], KC1) + Mul(input[12], KC2);
-                tmp[0] = a + d;
-                tmp[1] = b + c;
-                tmp[2] = b - c;
-                tmp[3] = a - d;
-                tmp = tmp.Slice(4);
-                input = input.Slice(1);
-            }
+                // Load the transform coefficients of the single 4x4 block. The code has the
+                // same shape as the two-block ITransform, but only one block is loaded here:
+                // the upper half of each vector is zero-filled, and whatever is computed in
+                // those lanes is never stored.
+                ref short inputRef = ref MemoryMarshal.GetReference(input);
+                var in0 = Vector128.Create(Unsafe.As<short, long>(ref inputRef), 0);
+                var in1 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 4)), 0);
+                var in2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 8)), 0);
+                var in3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 12)), 0);
 
-            tmp = scratch;
-            for (i = 0; i < 4; i++)
+                // a00 a10 a20 a30   x x x x
+                // a01 a11 a21 a31   x x x x
+                // a02 a12 a22 a32   x x x x
+                // a03 a13 a23 a33   x x x x
+
+                // Vertical pass and subsequent transpose.
+                // First pass, c and d calculations are longer because of the "trick" multiplications.
+                Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16());
+                Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16());
+
+                // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
+                Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2);
+                Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1);
+                Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16());
+                Vector128<short> c4 = Sse2.Subtract(c1, c2);
+                Vector128<short> c = Sse2.Add(c3, c4);
+
+                // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
+                Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1);
+                Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2);
+                Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16());
+                Vector128<short> d4 = Sse2.Add(d1, d2);
+                Vector128<short> d = Sse2.Add(d3, d4);
+
+                // Second pass.
+                Vector128<short> tmp0 = Sse2.Add(a, d);
+                Vector128<short> tmp1 = Sse2.Add(b, c);
+                Vector128<short> tmp2 = Sse2.Subtract(b, c);
+                Vector128<short> tmp3 = Sse2.Subtract(a, d);
+
+                // Transpose the two 4x4.
+                LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
+
+                // Horizontal pass and subsequent transpose.
+                // First pass, c and d calculations are longer because of the "trick" multiplications.
+                Vector128<short> dc = Sse2.Add(t0.AsInt16(), Four);
+                a = Sse2.Add(dc, t2.AsInt16());
+                b = Sse2.Subtract(dc, t2.AsInt16());
+
+                // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
+                c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2);
+                c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1);
+                c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16());
+                c4 = Sse2.Subtract(c1, c2);
+                c = Sse2.Add(c3, c4);
+
+                // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
+                d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1);
+                d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2);
+                d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16());
+                d4 = Sse2.Add(d1, d2);
+                d = Sse2.Add(d3, d4);
+
+                // Second pass.
+                tmp0 = Sse2.Add(a, d);
+                tmp1 = Sse2.Add(b, c);
+                tmp2 = Sse2.Subtract(b, c);
+                tmp3 = Sse2.Subtract(a, d);
+                Vector128<short> shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3);
+                Vector128<short> shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3);
+                Vector128<short> shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3);
+                Vector128<short> shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3);
+
+                // Transpose the two 4x4.
+                LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
+
+                // Add inverse transform to 'ref' and store.
+                // Load the reference(s).
+                Vector128<byte> ref0 = Vector128<byte>.Zero;
+                Vector128<byte> ref1 = Vector128<byte>.Zero;
+                Vector128<byte> ref2 = Vector128<byte>.Zero;
+                Vector128<byte> ref3 = Vector128<byte>.Zero;
+                ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
+
+                // Load four bytes/pixels per line.
+                ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref referenceRef)).AsByte();
+                ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte();
+                ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte();
+                ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte();
+
+                // Convert to 16b.
+                ref0 = Sse2.UnpackLow(ref0, Vector128<byte>.Zero);
+                ref1 = Sse2.UnpackLow(ref1, Vector128<byte>.Zero);
+                ref2 = Sse2.UnpackLow(ref2, Vector128<byte>.Zero);
+                ref3 = Sse2.UnpackLow(ref3, Vector128<byte>.Zero);
+
+                // Add the inverse transform(s).
+                Vector128<short> ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16());
+                Vector128<short> ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16());
+                Vector128<short> ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16());
+                Vector128<short> ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16());
+
+                // Unsigned saturate to 8b.
+                ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
+                ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
+                ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
+                ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);
+
+                // Get a reference to the destination for the stores below.
+                ref byte outputRef = ref MemoryMarshal.GetReference(dst);
+
+                // Store four bytes/pixels per line.
+                int output0 = Sse2.ConvertToInt32(ref0.AsInt32());
+                int output1 = Sse2.ConvertToInt32(ref1.AsInt32());
+                int output2 = Sse2.ConvertToInt32(ref2.AsInt32());
+                int output3 = Sse2.ConvertToInt32(ref3.AsInt32());
+
+                Unsafe.As<byte, int>(ref outputRef) = output0;
+                Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1;
+                Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2;
+                Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3;
+            }
+            else
+#endif
             {
-                // horizontal pass.
-                int dc = tmp[0] + 4;
-                int a = dc + tmp[8];
-                int b = dc - tmp[8];
-                int c = Mul(tmp[4], KC2) - Mul(tmp[12], KC1);
-                int d = Mul(tmp[4], KC1) + Mul(tmp[12], KC2);
-                Store(dst, reference, 0, i, a + d);
-                Store(dst, reference, 1, i, b + c);
-                Store(dst, reference, 2, i, b - c);
-                Store(dst, reference, 3, i, a - d);
-                tmp = tmp.Slice(1);
+                int i;
+                Span<int> tmp = scratch.Slice(0, 16);
+                for (i = 0; i < 4; i++)
+                {
+                    // vertical pass.
+                    int a = input[0] + input[8];
+                    int b = input[0] - input[8];
+                    int c = Mul(input[4], KC2) - Mul(input[12], KC1);
+                    int d = Mul(input[4], KC1) + Mul(input[12], KC2);
+                    tmp[0] = a + d;
+                    tmp[1] = b + c;
+                    tmp[2] = b - c;
+                    tmp[3] = a - d;
+                    tmp = tmp.Slice(4);
+                    input = input.Slice(1);
+                }
+
+                tmp = scratch;
+                for (i = 0; i < 4; i++)
+                {
+                    // horizontal pass.
+                    int dc = tmp[0] + 4;
+                    int a = dc + tmp[8];
+                    int b = dc - tmp[8];
+                    int c = Mul(tmp[4], KC2) - Mul(tmp[12], KC1);
+                    int d = Mul(tmp[4], KC1) + Mul(tmp[12], KC2);
+                    Store(dst, reference, 0, i, a + d);
+                    Store(dst, reference, 1, i, b + c);
+                    Store(dst, reference, 2, i, b - c);
+                    Store(dst, reference, 3, i, a - d);
+                    tmp = tmp.Slice(1);
+                }
             }
         }
 
diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs

Original file line number	Diff line number	Diff line change
`@@ -329,7 +329,7 @@ public static int ReconstructIntra16(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8M`
`329`	`329`	`LossyUtils.TransformWht(dcTmp, tmp, scratch);`
`330`	`330`	`for (n = 0; n < 16; n += 2)`
`331`	`331`	`{`
`332`		`- Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), true, scratch);`
	`332`	`+ Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), scratch);`
`333`	`333`	`}`
`334`	`334`
`335`	`335`	`return nz;`
`@@ -342,7 +342,7 @@ public static int ReconstructIntra4(Vp8EncIterator it, Vp8SegmentInfo dqm, Span<`
`342`	`342`	`Span<int> scratch = it.Scratch3.AsSpan(0, 16);`
`343`	`343`	`Vp8Encoding.FTransform(src, reference, tmp, scratch);`
`344`	`344`	`int nz = QuantizeBlock(tmp, levels, ref dqm.Y1);`
`345`		`- Vp8Encoding.ITransform(reference, tmp, yuvOut, false, scratch);`
	`345`	`+ Vp8Encoding.ITransformOne(reference, tmp, yuvOut, scratch);`
`346`	`346`
`347`	`347`	`return nz;`
`348`	`348`	`}`
`@@ -375,7 +375,7 @@ public static int ReconstructUv(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8ModeSc`
`375`	`375`
`376`	`376`	`for (n = 0; n < 8; n += 2)`
`377`	`377`	`{`
`378`		`- Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), true, scratch);`
	`378`	`+ Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), scratch);`
`379`	`379`	`}`
`380`	`380`
`381`	`381`	`return nz << 16;`