@@ -66,14 +66,13 @@ internal static void ConvertCore(in ComponentValues values, Span<Vector4> result
6666 var chromaOffset = Vector256 . Create ( - halfValue ) ;
6767 var scale = Vector256 . Create ( 1 / maxValue ) ;
6868 var rCrMult = Vector256 . Create ( 1.402F ) ;
69- var gCbMult = Vector256 . Create ( 0.344136F ) ;
70- var gCrMult = Vector256 . Create ( 0.714136F ) ;
69+ var gCbMult = Vector256 . Create ( - 0.344136F ) ;
70+ var gCrMult = Vector256 . Create ( - 0.714136F ) ;
7171 var bCbMult = Vector256 . Create ( 1.772F ) ;
7272
7373 // Used for packing.
74- Vector4 vo = Vector4 . One ;
75- Vector128 < float > valpha = Unsafe . As < Vector4 , Vector128 < float > > ( ref vo ) ;
76- ref byte control = ref MemoryMarshal . GetReference ( HwIntrinsics . PermuteMaskDeinterleave8x32 ) ;
74+ var va = Vector256 . Create ( 1F ) ;
75+ ref byte control = ref MemoryMarshal . GetReference ( HwIntrinsics . PermuteMaskEvenOdd8x32 ) ;
7776 Vector256 < int > vcontrol = Unsafe . As < byte , Vector256 < int > > ( ref control ) ;
7877
7978 // Walking 8 elements at one step:
@@ -87,58 +86,36 @@ internal static void ConvertCore(in ComponentValues values, Span<Vector4> result
8786 Vector256 < float > cb = Avx . Add ( Unsafe . Add ( ref cbBase , i ) , chromaOffset ) ;
8887 Vector256 < float > cr = Avx . Add ( Unsafe . Add ( ref crBase , i ) , chromaOffset ) ;
8988
89+ y = Avx2 . PermuteVar8x32 ( y , vcontrol ) ;
90+ cb = Avx2 . PermuteVar8x32 ( cb , vcontrol ) ;
91+ cr = Avx2 . PermuteVar8x32 ( cr , vcontrol ) ;
92+
9093 // r = y + (1.402F * cr);
9194 // g = y - (0.344136F * cb) - (0.714136F * cr);
9295 // b = y + (1.772F * cb);
9396 // Adding & multiplying 8 elements at one time:
9497 Vector256 < float > r = HwIntrinsics . MultiplyAdd ( y , cr , rCrMult ) ;
95- Vector256 < float > g = Avx . Subtract ( Avx . Subtract ( y , Avx . Multiply ( cb , gCbMult ) ) , Avx . Multiply ( cr , gCrMult ) ) ;
98+ Vector256 < float > g = HwIntrinsics . MultiplyAdd ( HwIntrinsics . MultiplyAdd ( y , cb , gCbMult ) , cr , gCrMult ) ;
9699 Vector256 < float > b = HwIntrinsics . MultiplyAdd ( y , cb , bCbMult ) ;
97100
101+ // TODO: We should be saving to RGBA not Vector4
98102 r = Avx . Multiply ( Avx . RoundToNearestInteger ( r ) , scale ) ;
99103 g = Avx . Multiply ( Avx . RoundToNearestInteger ( g ) , scale ) ;
100104 b = Avx . Multiply ( Avx . RoundToNearestInteger ( b ) , scale ) ;
101105
102- // Collect (r0,r1...r7) (g0,g1...g7) (b0,b1...b7) vector values in the
103- // expected (r0,g0,b0,1), (r1,g1,b1,1) ... order:
104- //
105- // Left side.
106- Vector256 < float > r0 = Avx . InsertVector128 (
107- r ,
108- Unsafe . As < Vector256 < float > , Vector128 < float > > ( ref g ) ,
109- 1 ) ;
110-
111- Vector256 < float > r1 = Avx . InsertVector128 (
112- b ,
113- valpha ,
114- 1 ) ;
115-
116- // Right side
117- Vector256 < float > r2 = Avx . InsertVector128 (
118- Unsafe . Add ( ref Unsafe . As < Vector256 < float > , Vector128 < float > > ( ref r ) , 1 ) . ToVector256 ( ) ,
119- Unsafe . Add ( ref Unsafe . As < Vector256 < float > , Vector128 < float > > ( ref g ) , 1 ) ,
120- 1 ) ;
121-
122- Vector256 < float > r3 = Avx . InsertVector128 (
123- Unsafe . Add ( ref Unsafe . As < Vector256 < float > , Vector128 < float > > ( ref b ) , 1 ) . ToVector256 ( ) ,
124- valpha ,
125- 1 ) ;
126-
127- // Split into separate rows
128- Vector256 < float > t0 = Avx . UnpackLow ( r0 , r1 ) ;
129- Vector256 < float > t2 = Avx . UnpackHigh ( r0 , r1 ) ;
130-
131- // Deinterleave and set
106+ Vector256 < float > vte = Avx . UnpackLow ( r , b ) ;
107+ Vector256 < float > vto = Avx . UnpackLow ( g , va ) ;
108+
132109 ref Vector256 < float > destination = ref Unsafe . Add ( ref resultBase , i * 4 ) ;
133- destination = Avx2 . PermuteVar8x32 ( t0 , vcontrol ) ;
134- Unsafe . Add ( ref destination , 1 ) = Avx2. PermuteVar8x32 ( t2 , vcontrol ) ;
135110
136- // Repeat for right side.
137- Vector256 < float > t4 = Avx . UnpackLow ( r2 , r3 ) ;
138- Vector256 < float > t6 = Avx . UnpackHigh ( r2 , r3 ) ;
111+ destination = Avx . UnpackLow ( vte , vto ) ;
112+ Unsafe . Add ( ref destination , 1 ) = Avx. UnpackHigh ( vte , vto ) ;
113+
114+ vte = Avx . UnpackHigh ( r , b ) ;
115+ vto = Avx . UnpackHigh ( g , va ) ;
139116
140- Unsafe . Add ( ref destination , 2 ) = Avx2 . PermuteVar8x32 ( t4 , vcontrol ) ;
141- Unsafe . Add ( ref destination , 3 ) = Avx2 . PermuteVar8x32 ( t6 , vcontrol ) ;
117+ Unsafe . Add ( ref destination , 2 ) = Avx . UnpackLow ( vte , vto ) ;
118+ Unsafe . Add ( ref destination , 3 ) = Avx . UnpackHigh ( vte , vto ) ;
142119 }
143120#else
144121 ref Vector < float > yBase =
0 commit comments