@@ -59,8 +59,8 @@ internal static void ConvertCore(in ComponentValues values, Span<Vector4> result
5959 ref Vector256 < float > crBase =
6060 ref Unsafe . As < float , Vector256 < float > > ( ref MemoryMarshal . GetReference ( values . Component2 ) ) ;
6161
62- ref Vector4Octet resultBase =
63- ref Unsafe . As < Vector4 , Vector4Octet > ( ref MemoryMarshal . GetReference ( result ) ) ;
62+ ref Vector256 < float > resultBase =
63+ ref Unsafe . As < Vector4 , Vector256 < float > > ( ref MemoryMarshal . GetReference ( result ) ) ;
6464
6565 // Used for the color conversion
6666 var chromaOffset = Vector256 . Create ( - halfValue ) ;
@@ -76,14 +76,6 @@ internal static void ConvertCore(in ComponentValues values, Span<Vector4> result
7676 ref byte control = ref MemoryMarshal . GetReference ( HwIntrinsics . PermuteMaskDeinterleave8x32 ) ;
7777 Vector256 < int > vcontrol = Unsafe . As < byte , Vector256 < int > > ( ref control ) ;
7878
79- Vector4Pair rr = default ;
80- Vector4Pair gg = default ;
81- Vector4Pair bb = default ;
82-
83- ref Vector256 < float > rrRefAsVector = ref Unsafe . As < Vector4Pair , Vector256 < float > > ( ref rr ) ;
84- ref Vector256 < float > ggRefAsVector = ref Unsafe . As < Vector4Pair , Vector256 < float > > ( ref gg ) ;
85- ref Vector256 < float > bbRefAsVector = ref Unsafe . As < Vector4Pair , Vector256 < float > > ( ref bb ) ;
86-
8779 // Walking 8 elements at one step:
8880 int n = result . Length / 8 ;
8981 for ( int i = 0 ; i < n ; i ++ )
@@ -107,13 +99,46 @@ internal static void ConvertCore(in ComponentValues values, Span<Vector4> result
10799 g = Avx . Multiply ( Avx . RoundToNearestInteger ( g ) , scale ) ;
108100 b = Avx . Multiply ( Avx . RoundToNearestInteger ( b ) , scale ) ;
109101
110- rrRefAsVector = r ;
111- ggRefAsVector = g ;
112- bbRefAsVector = b ;
113-
114- // Collect (r0,r1...r8) (g0,g1...g8) (b0,b1...b8) vector values in the expected (r0,g0,g1,1), (r1,g1,g2,1) ... order:
115- ref Vector4Octet destination = ref Unsafe . Add ( ref resultBase , i ) ;
116- destination . PackAvx2 ( ref rr , ref gg , ref bb , in valpha , in vcontrol ) ;
102+ // Collect (r0,r1...r7) (g0,g1...g7) (b0,b1...b7) vector values in the
103+ // expected (r0,g0,b0,1), (r1,g1,b1,1) ... order:
104+ //
105+ // Left side.
106+ Vector256 < float > r0 = Avx . InsertVector128 (
107+ r ,
108+ Unsafe . As < Vector256 < float > , Vector128 < float > > ( ref g ) ,
109+ 1 ) ;
110+
111+ Vector256 < float > r1 = Avx . InsertVector128 (
112+ b ,
113+ valpha ,
114+ 1 ) ;
115+
116+ // Right side
117+ Vector256 < float > r2 = Avx . InsertVector128 (
118+ Unsafe . Add ( ref Unsafe . As < Vector256 < float > , Vector128 < float > > ( ref r ) , 1 ) . ToVector256 ( ) ,
119+ Unsafe . Add ( ref Unsafe . As < Vector256 < float > , Vector128 < float > > ( ref g ) , 1 ) ,
120+ 1 ) ;
121+
122+ Vector256 < float > r3 = Avx . InsertVector128 (
123+ Unsafe . Add ( ref Unsafe . As < Vector256 < float > , Vector128 < float > > ( ref b ) , 1 ) . ToVector256 ( ) ,
124+ valpha ,
125+ 1 ) ;
126+
127+ // Split into separate rows
128+ Vector256 < float > t0 = Avx . UnpackLow ( r0 , r1 ) ;
129+ Vector256 < float > t2 = Avx . UnpackHigh ( r0 , r1 ) ;
130+
131+ // Deinterleave and set
132+ ref Vector256 < float > destination = ref Unsafe . Add ( ref resultBase , i * 4 ) ;
133+ destination = Avx2 . PermuteVar8x32 ( t0 , vcontrol ) ;
134+ Unsafe . Add ( ref destination , 1 ) = Avx2. PermuteVar8x32 ( t2 , vcontrol ) ;
135+
136+ // Repeat for right side.
137+ Vector256 < float > t4 = Avx . UnpackLow ( r2 , r3 ) ;
138+ Vector256 < float > t6 = Avx . UnpackHigh ( r2 , r3 ) ;
139+
140+ Unsafe . Add ( ref destination , 2 ) = Avx2. PermuteVar8x32 ( t4 , vcontrol ) ;
141+ Unsafe . Add ( ref destination , 3 ) = Avx2. PermuteVar8x32 ( t6 , vcontrol ) ;
117142 }
118143#else
119144 ref Vector < float > yBase =
0 commit comments