Speedup: YV24->RGB32/24: add AVX2 code path.
pinterf committed May 18, 2023
1 parent 735f8f7 commit 3c26fd5
Showing 4 changed files with 235 additions and 4 deletions.
20 changes: 17 additions & 3 deletions avs_core/convert/convert_planar.cpp
@@ -719,10 +719,24 @@ PVideoFrame __stdcall ConvertYUV444ToRGB::GetFrame(int n, IScriptEnvironment* en


#ifdef INTEL_INTRINSICS
// todo: SSE for not only 8 bit RGB
// todo: SIMD for not only 8 bit RGB
// packed RGB24 and RGB32
if ((env->GetCPUFlags() & CPUF_SSE2) && (pixel_step==3 || pixel_step==4)) {
//we load using movq so no need to check for alignment

if ((env->GetCPUFlags() & CPUF_AVX2) && (pixel_step == 3 || pixel_step == 4)) {
if (pixel_step == 4) {
// RGB32
if (src_pitch_a) // move alpha channel from YUVA
convert_yv24_to_rgb_avx2<4, true>(dstp, srcY, srcU, srcV, srcA, dst_pitch, src_pitch_y, src_pitch_uv, src_pitch_a, vi.width, vi.height, matrix);
else
convert_yv24_to_rgb_avx2<4, false>(dstp, srcY, srcU, srcV, srcA, dst_pitch, src_pitch_y, src_pitch_uv, src_pitch_a, vi.width, vi.height, matrix);
}
else {
// RGB24
convert_yv24_to_rgb_avx2<3, false>(dstp, srcY, srcU, srcV, srcA, dst_pitch, src_pitch_y, src_pitch_uv, src_pitch_a, vi.width, vi.height, matrix);
}
return dst;
}
else if ((env->GetCPUFlags() & CPUF_SSE2) && (pixel_step==3 || pixel_step==4)) {
if (pixel_step == 4) {
if(src_pitch_a) // move alpha channel from YUVA
convert_yv24_to_rgb_sse2<4, true>(dstp, srcY, srcU, srcV, srcA, dst_pitch, src_pitch_y, src_pitch_uv, src_pitch_a, vi.width, vi.height, matrix);
211 changes: 211 additions & 0 deletions avs_core/convert/intel/convert_planar_avx2.cpp
@@ -44,6 +44,213 @@

#include "convert_planar_avx2.h"

#ifndef _mm256_set_m128i
#define _mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
#endif

#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable:4309)
#endif

// packed rgb helper
static AVS_FORCEINLINE __m256i convert_yuv_to_rgb_avx2_core(const __m256i& px0189, const __m256i& px23AB, const __m256i& px45CD, const __m256i& px67EF, const __m256i& zero, const __m256i& matrix, const __m256i& round_mask) {
//int b = (((int)m[0] * Y + (int)m[1] * U + (int)m[ 2] * V + 4096)>>13);

//px01 - xx xx 00 V1 00 U1 00 Y1 xx xx 00 V0 00 U0 00 Y0

auto low_lo = _mm256_madd_epi16(px0189, matrix); //xx*0 + v1*m2 | u1*m1 + y1*m0 | xx*0 + v0*m2 | u0*m1 + y0*m0
auto low_hi = _mm256_madd_epi16(px23AB, matrix); //xx*0 + v3*m2 | u3*m1 + y3*m0 | xx*0 + v2*m2 | u2*m1 + y2*m0
auto high_lo = _mm256_madd_epi16(px45CD, matrix);
auto high_hi = _mm256_madd_epi16(px67EF, matrix);

auto low_v = _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(low_lo), _mm256_castsi256_ps(low_hi), _MM_SHUFFLE(3, 1, 3, 1))); // v3*m2 | v2*m2 | v1*m2 | v0*m2

DTL2020 commented on May 18, 2023:

Casting to float and back to int is not good and adds a penalty. Better to rewrite the _MM_SHUFFLE()-based shufps shuffle into the SSE2 (and later) integer shuffle instruction (pshufd) for the 128i/256i registers and stop using the casts.

pinterf (author) commented on May 18, 2023:

Does it really have a penalty (shuffle_ps vs. shuffle_epi32)? The cast is only a syntactical element of the SIMD intrinsics.

DTL2020 replied via email on May 18, 2023 (comment body not shown).

pinterf (author) commented on May 18, 2023:

Yep, now I remember faintly; I have read about this already, I just forgot it.

DTL2020 commented on May 18, 2023:

Oh, I see: _MM_SHUFFLE is only a macro that generates the immediate. The real instruction to complain about is _mm256_shuffle_ps surrounded by the casts. Better to use an integer shuffle and skip all the casting.

pinterf (author) commented on May 19, 2023:

In https://www.agner.org/optimize/microarchitecture.pdf, page 144, "Data bypass delays": since Haswell (e.g. i7-4770) no penalty was observed for using SHUFPS on integer data, so I think I'll leave this part as is; anyway, this kind of shuffle with two input register parameters is quite convenient here. I think there is no such instruction in the integer domain.

auto high_v = _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(high_lo), _mm256_castsi256_ps(high_hi), _MM_SHUFFLE(3, 1, 3, 1)));

auto low_yu = _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(low_lo), _mm256_castsi256_ps(low_hi), _MM_SHUFFLE(2, 0, 2, 0))); // u3*m1 + y3*m0 | u2*m1 + y2*m0 | u1*m1 + y1*m0 | u0*m1 + y0*m0
auto high_yu = _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(high_lo), _mm256_castsi256_ps(high_hi), _MM_SHUFFLE(2, 0, 2, 0)));

auto t_lo = _mm256_add_epi32(low_v, low_yu); // v3*m2 + u3*m1 + y3*m0...
auto t_hi = _mm256_add_epi32(high_v, high_yu);

t_lo = _mm256_add_epi32(t_lo, round_mask); // v3*m2 + u3*m1 + y3*m0 + 4096...
t_hi = _mm256_add_epi32(t_hi, round_mask);

t_lo = _mm256_srai_epi32(t_lo, 13); // (v3*m2 + u3*m1 + y3*m0 + 4096) >> 13...
t_hi = _mm256_srai_epi32(t_hi, 13);

auto result = _mm256_packs_epi32(t_lo, t_hi);
result = _mm256_packus_epi16(result, zero); //00 00 00 00 00 00 00 00 b7 b6 b5 b4 b3 b2 b1 b0
return result;
}

template<int rgb_pixel_step, bool hasAlpha>
#if defined(GCC) || defined(CLANG)
__attribute__((__target__("avx2")))
#endif
void convert_yv24_to_rgb_avx2(BYTE* dstp, const BYTE* srcY, const BYTE* srcU, const BYTE* srcV, const BYTE* srcA, size_t dst_pitch, size_t src_pitch_y, size_t src_pitch_uv, size_t src_pitch_a, size_t width, size_t height, const ConversionMatrix& matrix) {
dstp += dst_pitch * (height - 1); // We start at last line

size_t mod16_width = rgb_pixel_step == 3 ? width / 16 * 16 : width;
// for the rgb32 target we can process pixels beyond width: the target rows are 64-byte aligned and 16 source pixels map to 16*4 = 64 RGB bytes
// if the alignment were only 32 bytes, the last cycle would have to process only 8 source pixels
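// (illustration: with width = 1927 and an rgb24 target, mod16_width is 1920 and
// pixels 1920..1926 go through the scalar tail loop at the end of the row;
// for rgb32, mod16_width == width and the last SIMD iteration may spill into
// the 64-byte aligned padding of the destination row)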

auto matrix_b = _mm256_set_epi16(0, matrix.v_b, matrix.u_b, matrix.y_b, 0, matrix.v_b, matrix.u_b, matrix.y_b, 0, matrix.v_b, matrix.u_b, matrix.y_b, 0, matrix.v_b, matrix.u_b, matrix.y_b);
auto matrix_g = _mm256_set_epi16(0, matrix.v_g, matrix.u_g, matrix.y_g, 0, matrix.v_g, matrix.u_g, matrix.y_g, 0, matrix.v_g, matrix.u_g, matrix.y_g, 0, matrix.v_g, matrix.u_g, matrix.y_g);
auto matrix_r = _mm256_set_epi16(0, matrix.v_r, matrix.u_r, matrix.y_r, 0, matrix.v_r, matrix.u_r, matrix.y_r, 0, matrix.v_r, matrix.u_r, matrix.y_r, 0, matrix.v_r, matrix.u_r, matrix.y_r);

auto zero128 = _mm_setzero_si128();
auto zero = _mm256_setzero_si256();
auto round_mask = _mm256_set1_epi32(4096);
auto offset = _mm256_set_epi16(0, -128, -128, matrix.offset_y, 0, -128, -128, matrix.offset_y, 0, -128, -128, matrix.offset_y, 0, -128, -128, matrix.offset_y);

// 16 YUV(A) pixels --> 4x16 RGB quads = 64 bytes. Avisynth's alignment is 64 fortunately.

for (size_t y = 0; y < height; ++y) {
for (size_t x = 0; x < mod16_width; x += 16) {

DTL2020 commented on May 18, 2023:

For better performance it is good to increase the 'workunit size' of each SIMD loop pass to about half the register file or more. When you chain several identical instructions without data dependencies, the CPU core can use superscalar execution (more than one dispatch port in the same clock) and deliver results at the throughput rate instead of waiting the full latency for each result. Total clocks for the same amount of data become lower, and the task is divided into fewer SIMD passes, so it finishes faster.
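
// A hypothetical sketch of that suggestion (not implemented in this commit):
// unroll the x loop to consume 32 source pixels per pass, giving two
// independent dependency chains whose latencies can overlap; mod32_width is
// an assumed 32-pixel-aligned bound.
#if 0
// for (size_t x = 0; x < mod32_width; x += 32) {
__m128i src_y0 = _mm_load_si128(reinterpret_cast<const __m128i*>(srcY + x));
__m128i src_y1 = _mm_load_si128(reinterpret_cast<const __m128i*>(srcY + x + 16));
// ... duplicate the rest of the loop body for the +16 half; the two halves
// share no data, so their madds and shuffles can issue in parallel ...
// }
#endif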

__m128i src_y = _mm_load_si128(reinterpret_cast<const __m128i*>(srcY + x)); //Y15 .. Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
__m128i src_u = _mm_load_si128(reinterpret_cast<const __m128i*>(srcU + x)); //U15 .. U7 U6 U5 U4 U3 U2 U1 U0
__m128i src_v = _mm_load_si128(reinterpret_cast<const __m128i*>(srcV + x)); //V15 .. V7 V6 V5 V4 V3 V2 V1 V0
__m128i src_a;
if constexpr(hasAlpha)
src_a = _mm_load_si128(reinterpret_cast<const __m128i*>(srcA + x)); //A15 .. A7 A6 A5 A4 A3 A2 A1 A0

__m128i t1_lo = _mm_unpacklo_epi8(src_y, src_u); //U7 Y7 U6 Y6 U5 Y5 U4 Y4 U3 Y3 U2 Y2 U1 Y1 U0 Y0
__m128i t1_hi = _mm_unpackhi_epi8(src_y, src_u); //U15 Y15 U14 Y14 U13 Y13 U12 Y12 U11 Y11 U10 Y10 U9 Y9 U8 Y8
__m128i t2_lo = _mm_unpacklo_epi8(src_v, zero128); //00 V7 00 V6 00 V5 00 V4 00 V3 00 V2 00 V1 00 V0
__m128i t2_hi = _mm_unpackhi_epi8(src_v, zero128); //00 V15 00 V14 00 V13 00 V12 00 V11 00 V10 00 V9 00 V8

__m256i t1 = _mm256_set_m128i(t1_hi, t1_lo);
__m256i t2 = _mm256_set_m128i(t2_hi, t2_lo);
t1 = _mm256_permute4x64_epi64(t1, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6));
t2 = _mm256_permute4x64_epi64(t2, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6));

__m256i low = _mm256_unpacklo_epi16(t1, t2); //xx V11 U11 Y11 xx V10 U10 Y10 xx V9 U9 Y9 xx V8 U8 Y8 xx V3 U3 Y3 xx V2 U2 Y2 xx V1 U1 Y1 xx V0 U0 Y0
__m256i high = _mm256_unpackhi_epi16(t1, t2); //xx V15 U15 Y15 xx V14 U14 Y14 xx V13 U13 Y13 xx V12 U12 Y12 xx V7 U7 Y7 xx V6 U6 Y6 xx V5 U5 Y5 xx V4 U4 Y4

__m256i px0189 = _mm256_unpacklo_epi8(low, zero); //xx xx 00 V1 00 U1 00 Y1 xx xx 00 V0 00 U0 00 Y0
__m256i px23AB = _mm256_unpackhi_epi8(low, zero); //xx xx 00 V3 00 U3 00 Y3 xx xx 00 V2 00 U2 00 Y2
__m256i px45CD = _mm256_unpacklo_epi8(high, zero); //xx xx 00 V5 00 U5 00 Y5 xx xx 00 V4 00 U4 00 Y4
__m256i px67EF = _mm256_unpackhi_epi8(high, zero); //xx xx 00 V7 00 U7 00 Y7 xx xx 00 V6 00 U6 00 Y6

px0189 = _mm256_add_epi16(px0189, offset);
px23AB = _mm256_add_epi16(px23AB, offset);
px45CD = _mm256_add_epi16(px45CD, offset);
px67EF = _mm256_add_epi16(px67EF, offset);

__m256i result_b = convert_yuv_to_rgb_avx2_core(px0189, px23AB, px45CD, px67EF, zero, matrix_b, round_mask); //00 00 00 00 00 00 00 00 b7 b6 b5 b4 b3 b2 b1 b0
__m256i result_g = convert_yuv_to_rgb_avx2_core(px0189, px23AB, px45CD, px67EF, zero, matrix_g, round_mask); //00 00 00 00 00 00 00 00 g7 g6 g5 g4 g3 g2 g1 g0
__m256i result_r = convert_yuv_to_rgb_avx2_core(px0189, px23AB, px45CD, px67EF, zero, matrix_r, round_mask); //00 00 00 00 00 00 00 00 r7 r6 r5 r4 r3 r2 r1 r0

__m256i result_bg = _mm256_unpacklo_epi8(result_b, result_g); //g7 b7 g6 b6 g5 b5 g4 b4 g3 b3 g2 b2 g1 b1 g0 b0
__m256i alpha;
if (hasAlpha) {
__m128i a_lo = _mm_unpacklo_epi8(src_a, zero128); //00 A7 00 A6 00 A5 00 A4 00 A3 00 A2 00 A1 00 A0
__m128i a_hi = _mm_unpackhi_epi8(src_a, zero128); //00 A15 00 A14 00 A13 00 A12 00 A11 00 A10 00 A9 00 A8
alpha = _mm256_set_m128i(a_hi, a_lo); // a15 .. a0 at low of each m128i part
alpha = _mm256_permute4x64_epi64(alpha, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6));
}
else
alpha = _mm256_cmpeq_epi32(result_r, result_r); // FF FF FF FF ... default alpha: 255 (opaque)

__m256i result_ra = _mm256_unpacklo_epi8(result_r, alpha); //a7 r7 a6 r6 a5 r5 a4 r4 a3 r3 a2 r2 a1 r1 a0 r0

__m256i result_lo = _mm256_unpacklo_epi16(result_bg, result_ra);
__m256i result_hi = _mm256_unpackhi_epi16(result_bg, result_ra);

if constexpr (rgb_pixel_step == 4) {
//rgb32
_mm256_store_si256(reinterpret_cast<__m256i*>(dstp + x * 4), result_lo);

DTL2020 commented on May 18, 2023:

If you store 64 bytes (a cache line) or more (better 8 ymm registers, a 256-byte single store burst), it is good to add a cache-control option and use _mm256_stream_si256() for an uncached store. With frame sizes much larger than the L2/L3 cache (multiplied by the number of AVS threads in use) it may not be good to touch the caches, and streaming may roughly double store performance. See the example at https://github.com/DTL2020/ConvertYV12toRGB/blob/bd88a2b5be6c84bb5650860eafb14fa6a4ea2716/DecodeYV12toRGB.cpp#L512

pinterf (author) commented on May 18, 2023:

I've tested it (_stream), but nothing changed for the usual 4K test script speed.
Unfortunately the avs+ core must work somewhat like "one size fits all".
Can you imagine scenarios (real-world scripts) where using _stream would be slower than a simple _store?

DTL2020 commented on May 18, 2023:

Using a non-cached store may make total script processing slower if the user processes a very small frame size, because a store to L2/L3 cache can be read back by the CPU core (into L1D or the register file) faster than a read-back from RAM. But processing of small frame sizes is typically very fast anyway. Maybe add compile definitions so AVS can be built with cached stores for users with SD frame sizes and with non-cached stores for users with FHD/UHD1/2 frame sizes? That would produce two different builds, and users could select which .dll to install for their typical workflow.

Also, maybe add a templated function with run-time selection based on frame size?

DTL2020 commented on May 18, 2023:

"tested it (_stream) but nothing has been changed for the usual 4k test script speed."

It looks like the current implementation is still slow enough (too small a workunit size for SIMD and so on) that going through all the caches (L1+L2/L3) and down to RAM does not yet show a benefit over the non-cached store. If you make the processing faster, you will eventually see the cached store become slower than streaming directly to RAM (the caches are really not free, and pushing data through them costs performance at high data rates, around 50% of maximum RAM speed, because of coherency broadcasts across the per-core caches, cache updates and so on). Direct-to-RAM streaming can simply invalidate the cache lines without copying data and may be faster: the write-combining buffers write directly to RAM without touching the complex L1+L2+... caches. If an implementation produces data at about 5..10% of the maximum RAM bandwidth or less, there may be no visible difference between a cached and a direct RAM store.
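
// Hypothetical sketch of the non-temporal variant discussed above (not part of
// this commit); the frame-size threshold is an assumed, illustrative value.
#if 0
if (width * height >= (size_t)1920 * 1080) { // large frame: bypass the caches
  _mm256_stream_si256(reinterpret_cast<__m256i*>(dstp + x * 4), result_lo);
  _mm256_stream_si256(reinterpret_cast<__m256i*>(dstp + x * 4 + 32), result_hi);
  // an _mm_sfence() after the frame loop would be needed to order the streaming stores
} else {                                     // small frame: keep the cached stores
  _mm256_store_si256(reinterpret_cast<__m256i*>(dstp + x * 4), result_lo);
  _mm256_store_si256(reinterpret_cast<__m256i*>(dstp + x * 4 + 32), result_hi);
}
#endif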

_mm256_store_si256(reinterpret_cast<__m256i*>(dstp + x * 4 + 32), result_hi);
}
else {
// rgb24
// 16*4 bytes to 16*3 bytes
__m256i perm10 = _mm256_set_epi32(0, 0, 6, 5, 4, 2, 1, 0);
__m256i shuffle_lo = _mm256_set_epi8(
0, 0, 0, 0, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0,
0, 0, 0, 0, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0
);
__m128i shuffle_hi_lo = _mm_set_epi8(9, 8, 6, 5, 4, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0);

__m128i result_hi_lo = _mm256_extracti128_si256(result_hi, 0); // aBbBgBrB aAbAgArA a9b9g9r9 a8b8g8r8
__m128i result_hi_hi = _mm256_extracti128_si256(result_hi, 1); // aFbFgFrF aEbEgErE aDbDgDrD aCbCgCrC

__m256i result_lo_reorg = _mm256_shuffle_epi8(result_lo, shuffle_lo);
// x x x x b7g7r7 b6g6r6 b5g5r5 b4g4r4 x x x x b3g3r3 b2g2r2 b1g1r1 b0g0r0
result_lo_reorg = _mm256_permutevar8x32_epi32(result_lo_reorg, perm10);
// x x x x x x x x b7g7r7 b6g6r6 b5g5r5 b4g4r4 b3g3r3 b2g2r2 b1g1r1 b0g0r0

__m128i result_hi_lo_reorg = _mm_shuffle_epi8(result_hi_lo, shuffle_hi_lo);
// gA rA b9 g9 r9 b8 g8 r8 x x x x x x x x

__m256i dummy_y0 = _mm256_undefined_si256();
auto result_hi_lo_reorg_2 = _mm256_inserti128_si256(dummy_y0, result_hi_lo_reorg, 1);
//
// gA rA b9 g9|r9 b8 g8 r8|x x x x|x x x x|x x x x |x x x x|x x x x|x x x x // result_hi_lo_reorg_2
// x x x x x x x x b7g7r7 b6g6r6 b5g5r5 b4g4r4 b3g3r3 b2g2r2 b1g1r1 b0g0r0 // result_lo_reorg
auto result_0_15 = _mm256_blend_epi32(result_lo_reorg, result_hi_lo_reorg_2, 0xC0); // 11000000
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dstp + x * 3), result_0_15); // not necessarily 32 bytes aligned

// => x x x x x x x x x x x x bB gB rB bA
__m128i shuffle_hi_lo_2 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 13, 12, 10);
__m128i xmm4 = _mm_shuffle_epi8(result_hi_lo, shuffle_hi_lo_2);

// => bF gF rF bE gE rE bD gD rD bC gC rC x x x x
__m128i shuffle_hi_hi = _mm_set_epi8(14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0, 0x80, 0x80, 0x80, 0x80);
__m128i xmm5 = _mm_shuffle_epi8(result_hi_hi, shuffle_hi_hi);

// => bF gF rF bE gE rE bD gD rD bC gC rC bB gB rB bA
__m128i result_16_23 = _mm_or_si128(xmm4, xmm5);
_mm_store_si128(reinterpret_cast<__m128i*>(dstp + x * 3 + 32), result_16_23);
#if 0
// Scalar reference; the Intel compiler can optimize this fully: no store, just shuffles and blends as above.
alignas(32) BYTE temp[64];
_mm256_store_si256(reinterpret_cast<__m256i*>(temp), result_lo);
_mm256_store_si256(reinterpret_cast<__m256i*>(temp + 32), result_hi);
for (int i = 0; i < 16; ++i) {
dstp[(x + i) * 3 + 0] = temp[i * 4 + 0];
dstp[(x + i) * 3 + 1] = temp[i * 4 + 1];
dstp[(x + i) * 3 + 2] = temp[i * 4 + 2];
}
#endif
}
}

if constexpr (rgb_pixel_step == 3) {
// for rgb32 (pixel_step == 4) we processed full width and more, including padded bytes
for (size_t x = mod16_width; x < width; ++x) {
int Y = srcY[x] + matrix.offset_y;
int U = srcU[x] - 128;
int V = srcV[x] - 128;
int b = (((int)matrix.y_b * Y + (int)matrix.u_b * U + (int)matrix.v_b * V + 4096) >> 13);
int g = (((int)matrix.y_g * Y + (int)matrix.u_g * U + (int)matrix.v_g * V + 4096) >> 13);
int r = (((int)matrix.y_r * Y + (int)matrix.u_r * U + (int)matrix.v_r * V + 4096) >> 13);
dstp[x * rgb_pixel_step + 0] = PixelClip(b);
dstp[x * rgb_pixel_step + 1] = PixelClip(g);
dstp[x * rgb_pixel_step + 2] = PixelClip(r);
if constexpr (rgb_pixel_step == 4) { // n/a
dstp[x * 4 + 3] = 255;
}
}
}
dstp -= dst_pitch;
srcY += src_pitch_y;
srcU += src_pitch_uv;
srcV += src_pitch_uv;
if constexpr(hasAlpha)
srcA += src_pitch_a;
}
}

//instantiate
//template<int rgb_pixel_step, bool hasAlpha>
template void convert_yv24_to_rgb_avx2<3, false>(BYTE* dstp, const BYTE* srcY, const BYTE* srcU, const BYTE* srcV, const BYTE* srcA, size_t dst_pitch, size_t src_pitch_y, size_t src_pitch_uv, size_t src_pitch_a, size_t width, size_t height, const ConversionMatrix& matrix);
template void convert_yv24_to_rgb_avx2<4, false>(BYTE* dstp, const BYTE* srcY, const BYTE* srcU, const BYTE* srcV, const BYTE* srcA, size_t dst_pitch, size_t src_pitch_y, size_t src_pitch_uv, size_t src_pitch_a, size_t width, size_t height, const ConversionMatrix& matrix);
template void convert_yv24_to_rgb_avx2<3, true>(BYTE* dstp, const BYTE* srcY, const BYTE* srcU, const BYTE* srcV, const BYTE* srcA, size_t dst_pitch, size_t src_pitch_y, size_t src_pitch_uv, size_t src_pitch_a, size_t width, size_t height, const ConversionMatrix& matrix);
template void convert_yv24_to_rgb_avx2<4, true>(BYTE* dstp, const BYTE* srcY, const BYTE* srcU, const BYTE* srcV, const BYTE* srcA, size_t dst_pitch, size_t src_pitch_y, size_t src_pitch_uv, size_t src_pitch_a, size_t width, size_t height, const ConversionMatrix& matrix);


template<int bits_per_pixel>
#if defined(GCC) || defined(CLANG)
__attribute__((__target__("avx2")))
@@ -129,3 +336,7 @@ template void convert_planarrgb_to_yuv_uint16_avx2<10>(BYTE* (&dstp)[3], int(&ds
template void convert_planarrgb_to_yuv_uint16_avx2<12>(BYTE* (&dstp)[3], int(&dstPitch)[3], const BYTE* (&srcp)[3], const int(&srcPitch)[3], int width, int height, const ConversionMatrix& m);
template void convert_planarrgb_to_yuv_uint16_avx2<14>(BYTE* (&dstp)[3], int(&dstPitch)[3], const BYTE* (&srcp)[3], const int(&srcPitch)[3], int width, int height, const ConversionMatrix& m);
template void convert_planarrgb_to_yuv_uint16_avx2<16>(BYTE* (&dstp)[3], int(&dstPitch)[3], const BYTE* (&srcp)[3], const int(&srcPitch)[3], int width, int height, const ConversionMatrix& m);

#ifdef _MSC_VER
#pragma warning(pop)
#endif
6 changes: 6 additions & 0 deletions avs_core/convert/intel/convert_planar_avx2.h
@@ -45,4 +45,10 @@ __attribute__((__target__("avx2")))
#endif
void convert_planarrgb_to_yuv_uint16_avx2(BYTE *(&dstp)[3], int (&dstPitch)[3], const BYTE *(&srcp)[3], const int (&srcPitch)[3], int width, int height, const ConversionMatrix &m);

template<int rgb_pixel_step, bool hasAlpha>
#if defined(GCC) || defined(CLANG)
__attribute__((__target__("avx2")))
#endif
void convert_yv24_to_rgb_avx2(BYTE* dstp, const BYTE* srcY, const BYTE* srcU, const BYTE* srcV, const BYTE* srcA, size_t dst_pitch, size_t src_pitch_y, size_t src_pitch_uv, size_t src_pitch_a, size_t width, size_t height, const ConversionMatrix& matrix);

#endif
2 changes: 1 addition & 1 deletion avs_core/convert/intel/convert_planar_sse.cpp
@@ -1467,7 +1467,7 @@ void convert_yv24_to_rgb_sse2(BYTE* dstp, const BYTE* srcY, const BYTE* srcU, co
_mm_store_si128(reinterpret_cast<__m128i*>(temp), result_lo);
_mm_store_si128(reinterpret_cast<__m128i*>(temp + 16), result_hi);

for (int i = 0; i < 8; ++i) {
for (int i = 0; i < 7; ++i) {
*reinterpret_cast<int*>(dstp + (x + i) * 3) = *reinterpret_cast<int*>(temp + i * 4);
}
//last pixel
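// (each int store copies 4 bytes, so doing it for the 8th pixel as well would
// both duplicate the dedicated last-pixel handling and reach one byte past
// this 8-pixel, 24-byte rgb24 group)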
