Skip to content

Commit 3ae4b02

Browse files
Use less permutes and more multiply/add
1 parent eb315fe commit 3ae4b02

File tree

2 files changed

+22
-43
lines changed

2 files changed

+22
-43
lines changed

src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ public static class HwIntrinsics
1616
{
1717
public static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };
1818

19+
public static ReadOnlySpan<byte> PermuteMaskEvenOdd8x32 => new byte[] { 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 };
20+
1921
/// <summary>
2022
/// Performs a multiplication and an addition of the <see cref="Vector256{T}"/>.
2123
/// </summary>

src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs

Lines changed: 20 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -66,14 +66,13 @@ internal static void ConvertCore(in ComponentValues values, Span<Vector4> result
6666
var chromaOffset = Vector256.Create(-halfValue);
6767
var scale = Vector256.Create(1 / maxValue);
6868
var rCrMult = Vector256.Create(1.402F);
69-
var gCbMult = Vector256.Create(0.344136F);
70-
var gCrMult = Vector256.Create(0.714136F);
69+
var gCbMult = Vector256.Create(-0.344136F);
70+
var gCrMult = Vector256.Create(-0.714136F);
7171
var bCbMult = Vector256.Create(1.772F);
7272

7373
// Used for packing.
74-
Vector4 vo = Vector4.One;
75-
Vector128<float> valpha = Unsafe.As<Vector4, Vector128<float>>(ref vo);
76-
ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskDeinterleave8x32);
74+
var va = Vector256.Create(1F);
75+
ref byte control = ref MemoryMarshal.GetReference(HwIntrinsics.PermuteMaskEvenOdd8x32);
7776
Vector256<int> vcontrol = Unsafe.As<byte, Vector256<int>>(ref control);
7877

7978
// Walking 8 elements at one step:
@@ -87,58 +86,36 @@ internal static void ConvertCore(in ComponentValues values, Span<Vector4> result
8786
Vector256<float> cb = Avx.Add(Unsafe.Add(ref cbBase, i), chromaOffset);
8887
Vector256<float> cr = Avx.Add(Unsafe.Add(ref crBase, i), chromaOffset);
8988

89+
y = Avx2.PermuteVar8x32(y, vcontrol);
90+
cb = Avx2.PermuteVar8x32(cb, vcontrol);
91+
cr = Avx2.PermuteVar8x32(cr, vcontrol);
92+
9093
// r = y + (1.402F * cr);
9194
// g = y - (0.344136F * cb) - (0.714136F * cr);
9295
// b = y + (1.772F * cb);
9396
// Adding & multiplying 8 elements at one time:
9497
Vector256<float> r = HwIntrinsics.MultiplyAdd(y, cr, rCrMult);
95-
Vector256<float> g = Avx.Subtract(Avx.Subtract(y, Avx.Multiply(cb, gCbMult)), Avx.Multiply(cr, gCrMult));
98+
Vector256<float> g = HwIntrinsics.MultiplyAdd(HwIntrinsics.MultiplyAdd(y, cb, gCbMult), cr, gCrMult);
9699
Vector256<float> b = HwIntrinsics.MultiplyAdd(y, cb, bCbMult);
97100

101+
// TODO: We should be saving to RGBA not Vector4
98102
r = Avx.Multiply(Avx.RoundToNearestInteger(r), scale);
99103
g = Avx.Multiply(Avx.RoundToNearestInteger(g), scale);
100104
b = Avx.Multiply(Avx.RoundToNearestInteger(b), scale);
101105

102-
// Collect (r0,r1...r7) (g0,g1...g7) (b0,b1...b7) vector values in the
103-
// expected (r0,g0,b0,1), (r1,g1,b1,1) ... order:
104-
//
105-
// Left side.
106-
Vector256<float> r0 = Avx.InsertVector128(
107-
r,
108-
Unsafe.As<Vector256<float>, Vector128<float>>(ref g),
109-
1);
110-
111-
Vector256<float> r1 = Avx.InsertVector128(
112-
b,
113-
valpha,
114-
1);
115-
116-
// Right side
117-
Vector256<float> r2 = Avx.InsertVector128(
118-
Unsafe.Add(ref Unsafe.As<Vector256<float>, Vector128<float>>(ref r), 1).ToVector256(),
119-
Unsafe.Add(ref Unsafe.As<Vector256<float>, Vector128<float>>(ref g), 1),
120-
1);
121-
122-
Vector256<float> r3 = Avx.InsertVector128(
123-
Unsafe.Add(ref Unsafe.As<Vector256<float>, Vector128<float>>(ref b), 1).ToVector256(),
124-
valpha,
125-
1);
126-
127-
// Split into separate rows
128-
Vector256<float> t0 = Avx.UnpackLow(r0, r1);
129-
Vector256<float> t2 = Avx.UnpackHigh(r0, r1);
130-
131-
// Deinterleave and set
106+
Vector256<float> vte = Avx.UnpackLow(r, b);
107+
Vector256<float> vto = Avx.UnpackLow(g, va);
108+
132109
ref Vector256<float> destination = ref Unsafe.Add(ref resultBase, i * 4);
133-
destination = Avx2.PermuteVar8x32(t0, vcontrol);
134-
Unsafe.Add(ref destination, 1) = Avx2.PermuteVar8x32(t2, vcontrol);
135110

136-
// Repeat for right side.
137-
Vector256<float> t4 = Avx.UnpackLow(r2, r3);
138-
Vector256<float> t6 = Avx.UnpackHigh(r2, r3);
111+
destination = Avx.UnpackLow(vte, vto);
112+
Unsafe.Add(ref destination, 1) = Avx.UnpackHigh(vte, vto);
113+
114+
vte = Avx.UnpackHigh(r, b);
115+
vto = Avx.UnpackHigh(g, va);
139116

140-
Unsafe.Add(ref destination, 2) = Avx2.PermuteVar8x32(t4, vcontrol);
141-
Unsafe.Add(ref destination, 3) = Avx2.PermuteVar8x32(t6, vcontrol);
117+
Unsafe.Add(ref destination, 2) = Avx.UnpackLow(vte, vto);
118+
Unsafe.Add(ref destination, 3) = Avx.UnpackHigh(vte, vto);
142119
}
143120
#else
144121
ref Vector<float> yBase =

0 commit comments

Comments
 (0)