@@ -136,61 +136,14 @@ public static void ITransform(Span<byte> reference, Span<short> input, Span<byte
136136
137137 // Vertical pass and subsequent transpose.
138138 // First pass, c and d calculations are longer because of the "trick" multiplications.
139- Vector128 < short > a = Sse2 . Add ( in0 . AsInt16 ( ) , in2 . AsInt16 ( ) ) ;
140- Vector128 < short > b = Sse2 . Subtract ( in0 . AsInt16 ( ) , in2 . AsInt16 ( ) ) ;
141-
142- // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
143- Vector128 < short > c1 = Sse2 . MultiplyHigh ( in1 . AsInt16 ( ) , K2 ) ;
144- Vector128 < short > c2 = Sse2 . MultiplyHigh ( in3 . AsInt16 ( ) , K1 ) ;
145- Vector128 < short > c3 = Sse2 . Subtract ( in1 . AsInt16 ( ) , in3 . AsInt16 ( ) ) ;
146- Vector128 < short > c4 = Sse2 . Subtract ( c1 , c2 ) ;
147- Vector128 < short > c = Sse2 . Add ( c3 , c4 ) ;
148-
149- // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
150- Vector128 < short > d1 = Sse2 . MultiplyHigh ( in1 . AsInt16 ( ) , K1 ) ;
151- Vector128 < short > d2 = Sse2 . MultiplyHigh ( in3 . AsInt16 ( ) , K2 ) ;
152- Vector128 < short > d3 = Sse2 . Add ( in1 . AsInt16 ( ) , in3 . AsInt16 ( ) ) ;
153- Vector128 < short > d4 = Sse2 . Add ( d1 , d2 ) ;
154- Vector128 < short > d = Sse2 . Add ( d3 , d4 ) ;
155-
156- // Second pass.
157- Vector128 < short > tmp0 = Sse2 . Add ( a , d ) ;
158- Vector128 < short > tmp1 = Sse2 . Add ( b , c ) ;
159- Vector128 < short > tmp2 = Sse2 . Subtract ( b , c ) ;
160- Vector128 < short > tmp3 = Sse2 . Subtract ( a , d ) ;
139+ InverseTransformVerticalPass ( in0 , in2 , in1 , in3 , out Vector128 < short > tmp0 , out Vector128 < short > tmp1 , out Vector128 < short > tmp2 , out Vector128 < short > tmp3 ) ;
161140
162141 // Transpose the two 4x4.
163142 LossyUtils . Vp8Transpose_2_4x4_16b ( tmp0 , tmp1 , tmp2 , tmp3 , out Vector128 < long > t0 , out Vector128 < long > t1 , out Vector128 < long > t2 , out Vector128 < long > t3 ) ;
164143
165144 // Horizontal pass and subsequent transpose.
166145 // First pass, c and d calculations are longer because of the "trick" multiplications.
167- Vector128 < short > dc = Sse2 . Add ( t0 . AsInt16 ( ) , Four ) ;
168- a = Sse2 . Add ( dc , t2 . AsInt16 ( ) ) ;
169- b = Sse2 . Subtract ( dc , t2 . AsInt16 ( ) ) ;
170-
171- // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
172- c1 = Sse2 . MultiplyHigh ( t1 . AsInt16 ( ) , K2 ) ;
173- c2 = Sse2 . MultiplyHigh ( t3 . AsInt16 ( ) , K1 ) ;
174- c3 = Sse2 . Subtract ( t1 . AsInt16 ( ) , t3 . AsInt16 ( ) ) ;
175- c4 = Sse2 . Subtract ( c1 , c2 ) ;
176- c = Sse2 . Add ( c3 , c4 ) ;
177-
178- // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
179- d1 = Sse2 . MultiplyHigh ( t1 . AsInt16 ( ) , K1 ) ;
180- d2 = Sse2 . MultiplyHigh ( t3 . AsInt16 ( ) , K2 ) ;
181- d3 = Sse2 . Add ( t1 . AsInt16 ( ) , t3 . AsInt16 ( ) ) ;
182- d4 = Sse2 . Add ( d1 , d2 ) ;
183- d = Sse2 . Add ( d3 , d4 ) ;
184-
185- // Second pass.
186- tmp0 = Sse2 . Add ( a , d ) ;
187- tmp1 = Sse2 . Add ( b , c ) ;
188- tmp2 = Sse2 . Subtract ( b , c ) ;
189- tmp3 = Sse2 . Subtract ( a , d ) ;
190- Vector128 < short > shifted0 = Sse2 . ShiftRightArithmetic ( tmp0 , 3 ) ;
191- Vector128 < short > shifted1 = Sse2 . ShiftRightArithmetic ( tmp1 , 3 ) ;
192- Vector128 < short > shifted2 = Sse2 . ShiftRightArithmetic ( tmp2 , 3 ) ;
193- Vector128 < short > shifted3 = Sse2 . ShiftRightArithmetic ( tmp3 , 3 ) ;
146+ InverseTransformHorizontalPass ( t0 , t2 , t1 , t3 , out Vector128 < short > shifted0 , out Vector128 < short > shifted1 , out Vector128 < short > shifted2 , out Vector128 < short > shifted3 ) ;
194147
195148 // Transpose the two 4x4.
196149 LossyUtils . Vp8Transpose_2_4x4_16b ( shifted0 , shifted1 , shifted2 , shifted3 , out t0 , out t1 , out t2 , out t3 ) ;
@@ -266,61 +219,14 @@ public static void ITransformOne(Span<byte> reference, Span<short> input, Span<b
266219
267220 // Vertical pass and subsequent transpose.
268221 // First pass, c and d calculations are longer because of the "trick" multiplications.
269- Vector128 < short > a = Sse2 . Add ( in0 . AsInt16 ( ) , in2 . AsInt16 ( ) ) ;
270- Vector128 < short > b = Sse2 . Subtract ( in0 . AsInt16 ( ) , in2 . AsInt16 ( ) ) ;
271-
272- // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
273- Vector128 < short > c1 = Sse2 . MultiplyHigh ( in1 . AsInt16 ( ) , K2 ) ;
274- Vector128 < short > c2 = Sse2 . MultiplyHigh ( in3 . AsInt16 ( ) , K1 ) ;
275- Vector128 < short > c3 = Sse2 . Subtract ( in1 . AsInt16 ( ) , in3 . AsInt16 ( ) ) ;
276- Vector128 < short > c4 = Sse2 . Subtract ( c1 , c2 ) ;
277- Vector128 < short > c = Sse2 . Add ( c3 , c4 ) ;
278-
279- // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
280- Vector128 < short > d1 = Sse2 . MultiplyHigh ( in1 . AsInt16 ( ) , K1 ) ;
281- Vector128 < short > d2 = Sse2 . MultiplyHigh ( in3 . AsInt16 ( ) , K2 ) ;
282- Vector128 < short > d3 = Sse2 . Add ( in1 . AsInt16 ( ) , in3 . AsInt16 ( ) ) ;
283- Vector128 < short > d4 = Sse2 . Add ( d1 , d2 ) ;
284- Vector128 < short > d = Sse2 . Add ( d3 , d4 ) ;
285-
286- // Second pass.
287- Vector128 < short > tmp0 = Sse2 . Add ( a , d ) ;
288- Vector128 < short > tmp1 = Sse2 . Add ( b , c ) ;
289- Vector128 < short > tmp2 = Sse2 . Subtract ( b , c ) ;
290- Vector128 < short > tmp3 = Sse2 . Subtract ( a , d ) ;
222+ InverseTransformVerticalPass ( in0 , in2 , in1 , in3 , out Vector128 < short > tmp0 , out Vector128 < short > tmp1 , out Vector128 < short > tmp2 , out Vector128 < short > tmp3 ) ;
291223
292224 // Transpose the two 4x4.
293225 LossyUtils . Vp8Transpose_2_4x4_16b ( tmp0 , tmp1 , tmp2 , tmp3 , out Vector128 < long > t0 , out Vector128 < long > t1 , out Vector128 < long > t2 , out Vector128 < long > t3 ) ;
294226
295227 // Horizontal pass and subsequent transpose.
296228 // First pass, c and d calculations are longer because of the "trick" multiplications.
297- Vector128 < short > dc = Sse2 . Add ( t0 . AsInt16 ( ) , Four ) ;
298- a = Sse2 . Add ( dc , t2 . AsInt16 ( ) ) ;
299- b = Sse2 . Subtract ( dc , t2 . AsInt16 ( ) ) ;
300-
301- // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
302- c1 = Sse2 . MultiplyHigh ( t1 . AsInt16 ( ) , K2 ) ;
303- c2 = Sse2 . MultiplyHigh ( t3 . AsInt16 ( ) , K1 ) ;
304- c3 = Sse2 . Subtract ( t1 . AsInt16 ( ) , t3 . AsInt16 ( ) ) ;
305- c4 = Sse2 . Subtract ( c1 , c2 ) ;
306- c = Sse2 . Add ( c3 , c4 ) ;
307-
308- // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
309- d1 = Sse2 . MultiplyHigh ( t1 . AsInt16 ( ) , K1 ) ;
310- d2 = Sse2 . MultiplyHigh ( t3 . AsInt16 ( ) , K2 ) ;
311- d3 = Sse2 . Add ( t1 . AsInt16 ( ) , t3 . AsInt16 ( ) ) ;
312- d4 = Sse2 . Add ( d1 , d2 ) ;
313- d = Sse2 . Add ( d3 , d4 ) ;
314-
315- // Second pass.
316- tmp0 = Sse2 . Add ( a , d ) ;
317- tmp1 = Sse2 . Add ( b , c ) ;
318- tmp2 = Sse2 . Subtract ( b , c ) ;
319- tmp3 = Sse2 . Subtract ( a , d ) ;
320- Vector128 < short > shifted0 = Sse2 . ShiftRightArithmetic ( tmp0 , 3 ) ;
321- Vector128 < short > shifted1 = Sse2 . ShiftRightArithmetic ( tmp1 , 3 ) ;
322- Vector128 < short > shifted2 = Sse2 . ShiftRightArithmetic ( tmp2 , 3 ) ;
323- Vector128 < short > shifted3 = Sse2 . ShiftRightArithmetic ( tmp3 , 3 ) ;
229+ InverseTransformHorizontalPass ( t0 , t2 , t1 , t3 , out Vector128 < short > shifted0 , out Vector128 < short > shifted1 , out Vector128 < short > shifted2 , out Vector128 < short > shifted3 ) ;
324230
325231 // Transpose the two 4x4.
326232 LossyUtils . Vp8Transpose_2_4x4_16b ( shifted0 , shifted1 , shifted2 , shifted3 , out t0 , out t1 , out t2 , out t3 ) ;
@@ -409,6 +315,65 @@ public static void ITransformOne(Span<byte> reference, Span<short> input, Span<b
409315 }
410316 }
411317
318+ #if SUPPORTS_RUNTIME_INTRINSICS
// Vertical pass of the inverse transform, shared by the callers that previously inlined this code.
// Each Vector128<long> input is reinterpreted as 16-bit lanes via AsInt16() before any arithmetic.
// Note the parameter order: in0, in2, in1, in3 (the even pair first, then the odd pair).
// Outputs, in terms of the intermediates computed below:
//   tmp0 = a + d, tmp1 = b + c, tmp2 = b - c, tmp3 = a - d
// K1 and K2 are constants declared outside this block.
319+ private static void InverseTransformVerticalPass ( Vector128 < long > in0 , Vector128 < long > in2 , Vector128 < long > in1 , Vector128 < long > in3 , out Vector128 < short > tmp0 , out Vector128 < short > tmp1 , out Vector128 < short > tmp2 , out Vector128 < short > tmp3 )
320+ {
// Even part: sum and difference of in0 and in2.
321+ Vector128 < short > a = Sse2 . Add ( in0 . AsInt16 ( ) , in2 . AsInt16 ( ) ) ;
322+ Vector128 < short > b = Sse2 . Subtract ( in0 . AsInt16 ( ) , in2 . AsInt16 ( ) ) ;
323+
324+ // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
// c4 holds the difference of the two MultiplyHigh products; c3 adds the "+ in1 - in3" term of the formula above.
325+ Vector128 < short > c1 = Sse2 . MultiplyHigh ( in1 . AsInt16 ( ) , K2 ) ;
326+ Vector128 < short > c2 = Sse2 . MultiplyHigh ( in3 . AsInt16 ( ) , K1 ) ;
327+ Vector128 < short > c3 = Sse2 . Subtract ( in1 . AsInt16 ( ) , in3 . AsInt16 ( ) ) ;
328+ Vector128 < short > c4 = Sse2 . Subtract ( c1 , c2 ) ;
329+ Vector128 < short > c = Sse2 . Add ( c3 , c4 ) ;
330+
331+ // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
// d4 holds the sum of the two MultiplyHigh products; d3 adds the "+ in1 + in3" term of the formula above.
332+ Vector128 < short > d1 = Sse2 . MultiplyHigh ( in1 . AsInt16 ( ) , K1 ) ;
333+ Vector128 < short > d2 = Sse2 . MultiplyHigh ( in3 . AsInt16 ( ) , K2 ) ;
334+ Vector128 < short > d3 = Sse2 . Add ( in1 . AsInt16 ( ) , in3 . AsInt16 ( ) ) ;
335+ Vector128 < short > d4 = Sse2 . Add ( d1 , d2 ) ;
336+ Vector128 < short > d = Sse2 . Add ( d3 , d4 ) ;
337+
338+ // Second pass.
// Combine the even (a, b) and odd (c, d) parts into the four output vectors.
339+ tmp0 = Sse2 . Add ( a , d ) ;
340+ tmp1 = Sse2 . Add ( b , c ) ;
341+ tmp2 = Sse2 . Subtract ( b , c ) ;
342+ tmp3 = Sse2 . Subtract ( a , d ) ;
343+ }
344+
// Horizontal pass of the inverse transform, shared by the callers that previously inlined this code.
// Each Vector128<long> input is reinterpreted as 16-bit lanes via AsInt16() before any arithmetic.
// Note the parameter order: t0, t2, t1, t3 (the even pair first, then the odd pair).
// Differs from the vertical pass in two ways: the constant Four is added to t0 before the
// sum/difference step, and every result is arithmetically shifted right by 3 before being returned.
// Outputs: shifted0 = (a + d) >> 3, shifted1 = (b + c) >> 3, shifted2 = (b - c) >> 3, shifted3 = (a - d) >> 3.
// K1, K2 and Four are constants declared outside this block.
345+ private static void InverseTransformHorizontalPass ( Vector128 < long > t0 , Vector128 < long > t2 , Vector128 < long > t1 , Vector128 < long > t3 , out Vector128 < short > shifted0 , out Vector128 < short > shifted1 , out Vector128 < short > shifted2 , out Vector128 < short > shifted3 )
346+ {
// NOTE(review): Four is presumably the rounding bias for the final >> 3 - confirm against its declaration.
347+ Vector128 < short > dc = Sse2 . Add ( t0 . AsInt16 ( ) , Four ) ;
348+ Vector128 < short > a = Sse2 . Add ( dc , t2 . AsInt16 ( ) ) ;
349+ Vector128 < short > b = Sse2 . Subtract ( dc , t2 . AsInt16 ( ) ) ;
350+
351+ // c = MUL(t1, K2) - MUL(t3, K1) = MUL(t1, k2) - MUL(t3, k1) + t1 - t3
// c4 holds the difference of the two MultiplyHigh products; c3 adds the "+ t1 - t3" term of the formula above.
352+ Vector128 < short > c1 = Sse2 . MultiplyHigh ( t1 . AsInt16 ( ) , K2 ) ;
353+ Vector128 < short > c2 = Sse2 . MultiplyHigh ( t3 . AsInt16 ( ) , K1 ) ;
354+ Vector128 < short > c3 = Sse2 . Subtract ( t1 . AsInt16 ( ) , t3 . AsInt16 ( ) ) ;
355+ Vector128 < short > c4 = Sse2 . Subtract ( c1 , c2 ) ;
356+ Vector128 < short > c = Sse2 . Add ( c3 , c4 ) ;
357+
358+ // d = MUL(t1, K1) + MUL(t3, K2) = MUL(t1, k1) + MUL(t3, k2) + t1 + t3
// d4 holds the sum of the two MultiplyHigh products; d3 adds the "+ t1 + t3" term of the formula above.
359+ Vector128 < short > d1 = Sse2 . MultiplyHigh ( t1 . AsInt16 ( ) , K1 ) ;
360+ Vector128 < short > d2 = Sse2 . MultiplyHigh ( t3 . AsInt16 ( ) , K2 ) ;
361+ Vector128 < short > d3 = Sse2 . Add ( t1 . AsInt16 ( ) , t3 . AsInt16 ( ) ) ;
362+ Vector128 < short > d4 = Sse2 . Add ( d1 , d2 ) ;
363+ Vector128 < short > d = Sse2 . Add ( d3 , d4 ) ;
364+
365+ // Second pass.
// Combine the even (a, b) and odd (c, d) parts, then shift each 16-bit lane right by 3 (sign-preserving).
366+ Vector128 < short > tmp0 = Sse2 . Add ( a , d ) ;
367+ Vector128 < short > tmp1 = Sse2 . Add ( b , c ) ;
368+ Vector128 < short > tmp2 = Sse2 . Subtract ( b , c ) ;
369+ Vector128 < short > tmp3 = Sse2 . Subtract ( a , d ) ;
370+ shifted0 = Sse2 . ShiftRightArithmetic ( tmp0 , 3 ) ;
371+ shifted1 = Sse2 . ShiftRightArithmetic ( tmp1 , 3 ) ;
372+ shifted2 = Sse2 . ShiftRightArithmetic ( tmp2 , 3 ) ;
373+ shifted3 = Sse2 . ShiftRightArithmetic ( tmp3 , 3 ) ;
374+ }
375+ #endif
376+
412377 public static void FTransform2 ( Span < byte > src , Span < byte > reference , Span < short > output , Span < short > output2 , Span < int > scratch )
413378 {
414379 FTransform ( src , reference , output , scratch ) ;
0 commit comments