Speedup: YV24->RGB32/24: add AVX2 code path.
pinterf committed May 18, 2023
1 parent 735f8f7 commit 3c26fd5
Showing 4 changed files with 235 additions and 4 deletions.
20 changes: 17 additions & 3 deletions avs_core/convert/convert_planar.cpp
@@ -719,10 +719,24 @@ PVideoFrame __stdcall ConvertYUV444ToRGB::GetFrame(int n, IScriptEnvironment* en


#ifdef INTEL_INTRINSICS
// todo: SSE for not only 8 bit RGB
// todo: SIMD for not only 8 bit RGB
// packed RGB24 and RGB32
if ((env->GetCPUFlags() & CPUF_SSE2) && (pixel_step==3 || pixel_step==4)) {
//we load using movq so no need to check for alignment

if ((env->GetCPUFlags() & CPUF_AVX2) && (pixel_step == 3 || pixel_step == 4)) {
if (pixel_step == 4) {
// RGB32
if (src_pitch_a) // move alpha channel from YUVA
convert_yv24_to_rgb_avx2<4, true>(dstp, srcY, srcU, srcV, srcA, dst_pitch, src_pitch_y, src_pitch_uv, src_pitch_a, vi.width, vi.height, matrix);
else
convert_yv24_to_rgb_avx2<4, false>(dstp, srcY, srcU, srcV, srcA, dst_pitch, src_pitch_y, src_pitch_uv, src_pitch_a, vi.width, vi.height, matrix);
}
else {
// RGB24
convert_yv24_to_rgb_avx2<3, false>(dstp, srcY, srcU, srcV, srcA, dst_pitch, src_pitch_y, src_pitch_uv, src_pitch_a, vi.width, vi.height, matrix);
}
return dst;
}
else if ((env->GetCPUFlags() & CPUF_SSE2) && (pixel_step==3 || pixel_step==4)) {
if (pixel_step == 4) {
if(src_pitch_a) // move alpha channel from YUVA
convert_yv24_to_rgb_sse2<4, true>(dstp, srcY, srcU, srcV, srcA, dst_pitch, src_pitch_y, src_pitch_uv, src_pitch_a, vi.width, vi.height, matrix);
211 changes: 211 additions & 0 deletions avs_core/convert/intel/convert_planar_avx2.cpp
@@ -44,6 +44,213 @@

#include "convert_planar_avx2.h"

#ifndef _mm256_set_m128i
#define _mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
#endif

#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable:4309)
#endif

// packed rgb helper
static AVS_FORCEINLINE __m256i convert_yuv_to_rgb_avx2_core(const __m256i& px0189, const __m256i& px23AB, const __m256i& px45CD, const __m256i& px67EF, const __m256i& zero, const __m256i& matrix, const __m256i& round_mask) {
//int b = (((int)m[0] * Y + (int)m[1] * U + (int)m[ 2] * V + 4096)>>13);

//px01 - xx xx 00 V1 00 U1 00 Y1 xx xx 00 V0 00 U0 00 Y0

auto low_lo = _mm256_madd_epi16(px0189, matrix); //xx*0 + v1*m2 | u1*m1 + y1*m0 | xx*0 + v0*m2 | u0*m1 + y0*m0
auto low_hi = _mm256_madd_epi16(px23AB, matrix); //xx*0 + v3*m2 | u3*m1 + y3*m0 | xx*0 + v2*m2 | u2*m1 + y2*m0
auto high_lo = _mm256_madd_epi16(px45CD, matrix);
auto high_hi = _mm256_madd_epi16(px67EF, matrix);

auto low_v = _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(low_lo), _mm256_castsi256_ps(low_hi), _MM_SHUFFLE(3, 1, 3, 1))); // v3*m2 | v2*m2 | v1*m2 | v0*m2

DTL2020 commented on May 18, 2023:

Casting to float and back to int is not good and adds a penalty. Better to rewrite the _MM_SHUFFLE()-based shufps shuffle into the SSE2 (and later) integer shuffle instruction (pshufd) for the 128i/256i registers and stop using the casts.

pinterf (author) commented on May 18, 2023:

Does it really have a penalty (shuffle_ps vs. shuffle_epi32)? The cast is only a syntactical element of the SIMD intrinsics.

DTL2020 replied via email on May 18, 2023 (comment body not shown).

pinterf (author) commented on May 18, 2023:

Yep, now I remember faintly; I have read about this already, I just forgot it.

DTL2020 commented on May 18, 2023:

Oh, I see: _MM_SHUFFLE is only a macro that generates the immediate. The real instruction to complain about is _mm256_shuffle_ps surrounded by the casts. Better to use an integer shuffle and skip all the casting.

pinterf (author) commented on May 19, 2023:

In https://www.agner.org/optimize/microarchitecture.pdf, page 144, "Data bypass delays": since Haswell (e.g. i7-4770) no penalty was observed for using SHUFPS on integer data, so I think I'll leave this part as is; anyway, this kind of shuffle with two input register parameters is quite convenient here. I think there is no such instruction in the integer domain.

auto high_v = _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(high_lo), _mm256_castsi256_ps(high_hi), _MM_SHUFFLE(3, 1, 3, 1)));

auto low_yu = _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(low_lo), _mm256_castsi256_ps(low_hi), _MM_SHUFFLE(2, 0, 2, 0))); // u3*m1 + y3*m0 | u2*m1 + y2*m0 | u1*m1 + y1*m0 | u0*m1 + y0*m0
auto high_yu = _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(high_lo), _mm256_castsi256_ps(high_hi), _MM_SHUFFLE(2, 0, 2, 0)));

auto t_lo = _mm256_add_epi32(low_v, low_yu); // v3*m2 + u3*m1 + y3*m0...
auto t_hi = _mm256_add_epi32(high_v, high_yu);

t_lo = _mm256_add_epi32(t_lo, round_mask); // v3*m2 + u3*m1 + y3*m0 + 4096...
t_hi = _mm256_add_epi32(t_hi, round_mask);

t_lo = _mm256_srai_epi32(t_lo, 13); // (v3*m2 + u3*m1 + y3*m0 + 4096) >> 13...
t_hi = _mm256_srai_epi32(t_hi, 13);

auto result = _mm256_packs_epi32(t_lo, t_hi);
result = _mm256_packus_epi16(result, zero); //00 00 00 00 00 00 00 00 b7 b6 b5 b4 b3 b2 b1 b0
return result;
}

template<int rgb_pixel_step, bool hasAlpha>
#if defined(GCC) || defined(CLANG)
__attribute__((__target__("avx2")))
#endif
void convert_yv24_to_rgb_avx2(BYTE* dstp, const BYTE* srcY, const BYTE* srcU, const BYTE* srcV, const BYTE* srcA, size_t dst_pitch, size_t src_pitch_y, size_t src_pitch_uv, size_t src_pitch_a, size_t width, size_t height, const ConversionMatrix& matrix) {
dstp += dst_pitch * (height - 1); // We start at last line

size_t mod16_width = rgb_pixel_step == 3 ? width / 16 * 16 : width;
// for the rgb32 target we can process pixels beyond width: the target rows are 64-byte aligned and 16 source pixels map to 16*4 = 64 RGB bytes
// if the alignment were only 32 bytes, the last cycle would have to process only 8 source pixels
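// (illustration: with width = 1927 and an rgb24 target, mod16_width is 1920 and
// pixels 1920..1926 go through the scalar tail loop at the end of the row;
// for rgb32, mod16_width == width and the last SIMD iteration may spill into
// the 64-byte aligned padding of the destination row)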

auto matrix_b = _mm256_set_epi16(0, matrix.v_b, matrix.u_b, matrix.y_b, 0, matrix.v_b, matrix.u_b, matrix.y_b, 0, matrix.v_b, matrix.u_b, matrix.y_b, 0, matrix.v_b, matrix.u_b, matrix.y_b);
auto matrix_g = _mm256_set_epi16(0, matrix.v_g, matrix.u_g, matrix.y_g, 0, matrix.v_g, matrix.u_g, matrix.y_g, 0, matrix.v_g, matrix.u_g, matrix.y_g, 0, matrix.v_g, matrix.u_g, matrix.y_g);
auto matrix_r = _mm256_set_epi16(0, matrix.v_r, matrix.u_r, matrix.y_r, 0, matrix.v_r, matrix.u_r, matrix.y_r, 0, matrix.v_r, matrix.u_r, matrix.y_r, 0, matrix.v_r, matrix.u_r, matrix.y_r);

auto zero128 = _mm_setzero_si128();
auto zero = _mm256_setzero_si256();
auto round_mask = _mm256_set1_epi32(4096);
auto offset = _mm256_set_epi16(0, -128, -128, matrix.offset_y, 0, -128, -128, matrix.offset_y, 0, -128, -128, matrix.offset_y, 0, -128, -128, matrix.offset_y);

// 16 YUV(A) pixels --> 4x16 RGB quads = 64 bytes. Avisynth's alignment is 64 fortunately.

for (size_t y = 0; y < height; ++y) {
for (size_t x = 0; x < mod16_width; x += 16) {

DTL2020 commented on May 18, 2023:

For better performance it is good to increase the 'workunit size' of each SIMD loop pass to about half the register file or more. When you chain several identical instructions without data dependencies, the CPU core can use superscalar execution (more than one dispatch port in the same clock) and deliver results at the throughput rate instead of waiting the full latency for each result. Total clocks for the same amount of data become lower, and the task is divided into fewer SIMD passes, so it finishes faster.
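
// A hypothetical sketch of that suggestion (not implemented in this commit):
// unroll the x loop to consume 32 source pixels per pass, giving two
// independent dependency chains whose latencies can overlap; mod32_width is
// an assumed 32-pixel-aligned bound.
#if 0
// for (size_t x = 0; x < mod32_width; x += 32) {
__m128i src_y0 = _mm_load_si128(reinterpret_cast<const __m128i*>(srcY + x));
__m128i src_y1 = _mm_load_si128(reinterpret_cast<const __m128i*>(srcY + x + 16));
// ... duplicate the rest of the loop body for the +16 half; the two halves
// share no data, so their madds and shuffles can issue in parallel ...
// }
#endif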

__m128i src_y = _mm_load_si128(reinterpret_cast<const __m128i*>(srcY + x)); //Y15 .. Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
__m128i src_u = _mm_load_si128(reinterpret_cast<const __m128i*>(srcU + x)); //U15 .. U7 U6 U5 U4 U3 U2 U1 U0
__m128i src_v = _mm_load_si128(reinterpret_cast<const __m128i*>(srcV + x)); //V15 .. V7 V6 V5 V4 V3 V2 V1 V0
__m128i src_a;
if constexpr(hasAlpha)
src_a = _mm_load_si128(reinterpret_cast<const __m128i*>(srcA + x)); //A15 .. A7 A6 A5 A4 A3 A2 A1 A0

__m128i t1_lo = _mm_unpacklo_epi8(src_y, src_u); //U7 Y7 U6 Y6 U5 Y5 U4 Y4 U3 Y3 U2 Y2 U1 Y1 U0 Y0
__m128i t1_hi = _mm_unpackhi_epi8(src_y, src_u); //U15 Y15 U14 Y14 U13 Y13 U12 Y12 U11 Y11 U10 Y10 U9 Y9 U8 Y8
__m128i t2_lo = _mm_unpacklo_epi8(src_v, zero128); //00 V7 00 V6 00 V5 00 V4 00 V3 00 V2 00 V1 00 V0
__m128i t2_hi = _mm_unpackhi_epi8(src_v, zero128); //00 V15 00 V14 00 V13 00 V12 00 V11 00 V10 00 V9 00 V8

__m256i t1 = _mm256_set_m128i(t1_hi, t1_lo);
__m256i t2 = _mm256_set_m128i(t2_hi, t2_lo);
t1 = _mm256_permute4x64_epi64(t1, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6));
t2 = _mm256_permute4x64_epi64(t2, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6));

__m256i low = _mm256_unpacklo_epi16(t1, t2); //xx V11 U11 Y11 xx V10 U10 Y10 xx V9 U9 Y9 xx V8 U8 Y8 xx V3 U3 Y3 xx V2 U2 Y2 xx V1 U1 Y1 xx V0 U0 Y0
__m256i high = _mm256_unpackhi_epi16(t1, t2); //xx V15 U15 Y15 xx V14 U14 Y14 xx V13 U13 Y13 xx V12 U12 Y12 xx V7 U7 Y7 xx V6 U6 Y6 xx V5 U5 Y5 xx V4 U4 Y4

__m256i px0189 = _mm256_unpacklo_epi8(low, zero); //xx xx 00 V1 00 U1 00 Y1 xx xx 00 V0 00 U0 00 Y0
__m256i px23AB = _mm256_unpackhi_epi8(low, zero); //xx xx 00 V3 00 U3 00 Y3 xx xx 00 V2 00 U2 00 Y2
__m256i px45CD = _mm256_unpacklo_epi8(high, zero); //xx xx 00 V5 00 U5 00 Y5 xx xx 00 V4 00 U4 00 Y4
__m256i px67EF = _mm256_unpackhi_epi8(high, zero); //xx xx 00 V7 00 U7 00 Y7 xx xx 00 V6 00 U6 00 Y6

px0189 = _mm256_add_epi16(px0189, offset);
px23AB = _mm256_add_epi16(px23AB, offset);
px45CD = _mm256_add_epi16(px45CD, offset);
px67EF = _mm256_add_epi16(px67EF, offset);

__m256i result_b = convert_yuv_to_rgb_avx2_core(px0189, px23AB, px45CD, px67EF, zero, matrix_b, round_mask); //00 00 00 00 00 00 00 00 b7 b6 b5 b4 b3 b2 b1 b0
__m256i result_g = convert_yuv_to_rgb_avx2_core(px0189, px23AB, px45CD, px67EF, zero, matrix_g, round_mask); //00 00 00 00 00 00 00 00 g7 g6 g5 g4 g3 g2 g1 g0
__m256i result_r = convert_yuv_to_rgb_avx2_core(px0189, px23AB, px45CD, px67EF, zero, matrix_r, round_mask); //00 00 00 00 00 00 00 00 r7 r6 r5 r4 r3 r2 r1 r0

__m256i result_bg = _mm256_unpacklo_epi8(result_b, result_g); //g7 b7 g6 b6 g5 b5 g4 b4 g3 b3 g2 b2 g1 b1 g0 b0
__m256i alpha;
if (hasAlpha) {
__m128i a_lo = _mm_unpacklo_epi8(src_a, zero128); //00 A7 00 A6 00 A5 00 A4 00 A3 00 A2 00 A1 00 A0
__m128i a_hi = _mm_unpackhi_epi8(src_a, zero128); //00 A15 00 A14 00 A13 00 A12 00 A11 00 A10 00 A9 00 A8
alpha = _mm256_set_m128i(a_hi, a_lo); // a15 .. a0 at low of each m128i part
alpha = _mm256_permute4x64_epi64(alpha, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6));
}
else
alpha = _mm256_cmpeq_epi32(result_r, result_r); // FF FF FF FF ... default alpha: 255 (opaque)

__m256i result_ra = _mm256_unpacklo_epi8(result_r, alpha); //a7 r7 a6 r6 a5 r5 a4 r4 a3 r3 a2 r2 a1 r1 a0 r0

__m256i result_lo = _mm256_unpacklo_epi16(result_bg, result_ra);
__m256i result_hi = _mm256_unpackhi_epi16(result_bg, result_ra);

if constexpr (rgb_pixel_step == 4) {
//rgb32
_mm256_store_si256(reinterpret_cast<__m256i*>(dstp + x * 4), result_lo);

DTL2020 commented on May 18, 2023:

If you store 64 bytes (a cache line) or more (better 8 ymm registers, a 256-byte single store burst), it is good to add a cache-control option and use _mm256_stream_si256() for an uncached store. With frame sizes much larger than the L2/L3 cache (multiplied by the number of AVS threads in use) it may not be good to touch the caches, and streaming may roughly double store performance. See the example at https://github.com/DTL2020/ConvertYV12toRGB/blob/bd88a2b5be6c84bb5650860eafb14fa6a4ea2716/DecodeYV12toRGB.cpp#L512

pinterf (author) commented on May 18, 2023:

I've tested it (_stream), but nothing changed for the usual 4K test script speed.
Unfortunately the avs+ core must work somewhat like "one size fits all".
Can you imagine scenarios (real-world scripts) where using _stream would be slower than a simple _store?

DTL2020 commented on May 18, 2023:

Using a non-cached store may make total script processing slower if the user processes a very small frame size, because a store to L2/L3 cache can be read back by the CPU core (into L1D or the register file) faster than a read-back from RAM. But processing of small frame sizes is typically very fast anyway. Maybe add compile definitions so AVS can be built with cached stores for users with SD frame sizes and with non-cached stores for users with FHD/UHD1/2 frame sizes? That would produce two different builds, and users could select which .dll to install for their typical workflow.

Also, maybe add a templated function with run-time selection based on frame size?

DTL2020 commented on May 18, 2023:

"tested it (_stream) but nothing has been changed for the usual 4k test script speed."

It looks like the current implementation is still slow enough (too small a workunit size for SIMD and so on) that going through all the caches (L1+L2/L3) and down to RAM does not yet show a benefit over the non-cached store. If you make the processing faster, you will eventually see the cached store become slower than streaming directly to RAM (the caches are really not free, and pushing data through them costs performance at high data rates, around 50% of maximum RAM speed, because of coherency broadcasts across the per-core caches, cache updates and so on). Direct-to-RAM streaming can simply invalidate the cache lines without copying data and may be faster: the write-combining buffers write directly to RAM without touching the complex L1+L2+... caches. If an implementation produces data at about 5..10% of the maximum RAM bandwidth or less, there may be no visible difference between a cached and a direct RAM store.
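
// Hypothetical sketch of the non-temporal variant discussed above (not part of
// this commit); the frame-size threshold is an assumed, illustrative value.
#if 0
if (width * height >= (size_t)1920 * 1080) { // large frame: bypass the caches
  _mm256_stream_si256(reinterpret_cast<__m256i*>(dstp + x * 4), result_lo);
  _mm256_stream_si256(reinterpret_cast<__m256i*>(dstp + x * 4 + 32), result_hi);
  // an _mm_sfence() after the frame loop would be needed to order the streaming stores
} else {                                     // small frame: keep the cached stores
  _mm256_store_si256(reinterpret_cast<__m256i*>(dstp + x * 4), result_lo);
  _mm256_store_si256(reinterpret_cast<__m256i*>(dstp + x * 4 + 32), result_hi);
}
#endif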

_mm256_store_si256(reinterpret_cast<__m256i*>(dstp + x * 4 + 32), result_hi);
}
else {
// rgb24
// 16*4 bytes to 16*3 bytes
__m256i perm10 = _mm256_set_epi32(0, 0, 6, 5, 4, 2, 1, 0);
__m256i shuffle_lo = _mm256_set_epi8(
0, 0, 0, 0, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0,
0, 0, 0, 0, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0
);
__m128i shuffle_hi_lo = _mm_set_epi8(9, 8, 6, 5, 4, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0);

__m128i result_hi_lo = _mm256_extracti128_si256(result_hi, 0); // aBbBgBrB aAbAgArA a9b9g9r9 a8b8g8r8
__m128i result_hi_hi = _mm256_extracti128_si256(result_hi, 1); // aFbFgFrF aEbEgErE aDbDgDrD aCbCgCrC

__m256i result_lo_reorg = _mm256_shuffle_epi8(result_lo, shuffle_lo);
// x x x x b7g7r7 b6g6r6 b5g5r5 b4g4r4 x x x x b3g3r3 b2g2r2 b1g1r1 b0g0r0
result_lo_reorg = _mm256_permutevar8x32_epi32(result_lo_reorg, perm10);
// x x x x x x x x b7g7r7 b6g6r6 b5g5r5 b4g4r4 b3g3r3 b2g2r2 b1g1r1 b0g0r0

__m128i result_hi_lo_reorg = _mm_shuffle_epi8(result_hi_lo, shuffle_hi_lo);
// gA rA b9 g9 r9 b8 g8 r8 x x x x x x x x

__m256i dummy_y0 = _mm256_undefined_si256();
auto result_hi_lo_reorg_2 = _mm256_inserti128_si256(dummy_y0, result_hi_lo_reorg, 1);
//
// gA rA b9 g9|r9 b8 g8 r8|x x x x|x x x x|x x x x |x x x x|x x x x|x x x x // result_hi_lo_reorg_2
// x x x x x x x x b7g7r7 b6g6r6 b5g5r5 b4g4r4 b3g3r3 b2g2r2 b1g1r1 b0g0r0 // result_lo_reorg
auto result_0_15 = _mm256_blend_epi32(result_lo_reorg, result_hi_lo_reorg_2, 0xC0); // 11000000
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dstp + x * 3), result_0_15); // not necessarily 32 bytes aligned

// => x x x x x x x x x x x x bB gB rB bA
__m128i shuffle_hi_lo_2 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 13, 12, 10);
__m128i xmm4 = _mm_shuffle_epi8(result_hi_lo, shuffle_hi_lo_2);

// => bF gF rF bE gE rE bD gD rD bC gC rC x x x x
__m128i shuffle_hi_hi = _mm_set_epi8(14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0, 0x80, 0x80, 0x80, 0x80);
__m128i xmm5 = _mm_shuffle_epi8(result_hi_hi, shuffle_hi_hi);

// => bF gF rF bE gE rE bD gD rD bC gC rC bB gB rB bA
__m128i result_16_23 = _mm_or_si128(xmm4, xmm5);
_mm_store_si128(reinterpret_cast<__m128i*>(dstp + x * 3 + 32), result_16_23);
#if 0
// Scalar reference; the Intel compiler can optimize this fully: no store, just shuffles and blends as above.
alignas(32) BYTE temp[64];
_mm256_store_si256(reinterpret_cast<__m256i*>(temp), result_lo);
_mm256_store_si256(reinterpret_cast<__m256i*>(temp + 32), result_hi);
for (int i = 0; i < 16; ++i) {
dstp[(x + i) * 3 + 0] = temp[i * 4 + 0];
dstp[(x + i) * 3 + 1] = temp[i * 4 + 1];
dstp[(x + i) * 3 + 2] = temp[i * 4 + 2];
}
#endif
}
}

if constexpr (rgb_pixel_step == 3) {
// for rgb32 (pixel_step == 4) we processed full width and more, including padded bytes
for (size_t x = mod16_width; x < width; ++x) {
int Y = srcY[x] + matrix.offset_y;
int U = srcU[x] - 128;
int V = srcV[x] - 128;
int b = (((int)matrix.y_b * Y + (int)matrix.u_b * U + (int)matrix.v_b * V + 4096) >> 13);
int g = (((int)matrix.y_g * Y + (int)matrix.u_g * U + (int)matrix.v_g * V + 4096) >> 13);
int r = (((int)matrix.y_r * Y + (int)matrix.u_r * U + (int)matrix.v_r * V + 4096) >> 13);
dstp[x * rgb_pixel_step + 0] = PixelClip(b);
dstp[x * rgb_pixel_step + 1] = PixelClip(g);
dstp[x * rgb_pixel_step + 2] = PixelClip(r);
if constexpr (rgb_pixel_step == 4) { // n/a
dstp[x * 4 + 3] = 255;
}
}
}
dstp -= dst_pitch;
srcY += src_pitch_y;
srcU += src_pitch_uv;
srcV += src_pitch_uv;
if constexpr(hasAlpha)
srcA += src_pitch_a;
}
}

//instantiate
//template<int rgb_pixel_step, bool hasAlpha>
template void convert_yv24_to_rgb_avx2<3, false>(BYTE* dstp, const BYTE* srcY, const BYTE* srcU, const BYTE* srcV, const BYTE* srcA, size_t dst_pitch, size_t src_pitch_y, size_t src_pitch_uv, size_t src_pitch_a, size_t width, size_t height, const ConversionMatrix& matrix);
template void convert_yv24_to_rgb_avx2<4, false>(BYTE* dstp, const BYTE* srcY, const BYTE* srcU, const BYTE* srcV, const BYTE* srcA, size_t dst_pitch, size_t src_pitch_y, size_t src_pitch_uv, size_t src_pitch_a, size_t width, size_t height, const ConversionMatrix& matrix);
template void convert_yv24_to_rgb_avx2<3, true>(BYTE* dstp, const BYTE* srcY, const BYTE* srcU, const BYTE* srcV, const BYTE* srcA, size_t dst_pitch, size_t src_pitch_y, size_t src_pitch_uv, size_t src_pitch_a, size_t width, size_t height, const ConversionMatrix& matrix);
template void convert_yv24_to_rgb_avx2<4, true>(BYTE* dstp, const BYTE* srcY, const BYTE* srcU, const BYTE* srcV, const BYTE* srcA, size_t dst_pitch, size_t src_pitch_y, size_t src_pitch_uv, size_t src_pitch_a, size_t width, size_t height, const ConversionMatrix& matrix);


template<int bits_per_pixel>
#if defined(GCC) || defined(CLANG)
__attribute__((__target__("avx2")))
@@ -129,3 +336,7 @@ template void convert_planarrgb_to_yuv_uint16_avx2<10>(BYTE* (&dstp)[3], int(&ds
template void convert_planarrgb_to_yuv_uint16_avx2<12>(BYTE* (&dstp)[3], int(&dstPitch)[3], const BYTE* (&srcp)[3], const int(&srcPitch)[3], int width, int height, const ConversionMatrix& m);
template void convert_planarrgb_to_yuv_uint16_avx2<14>(BYTE* (&dstp)[3], int(&dstPitch)[3], const BYTE* (&srcp)[3], const int(&srcPitch)[3], int width, int height, const ConversionMatrix& m);
template void convert_planarrgb_to_yuv_uint16_avx2<16>(BYTE* (&dstp)[3], int(&dstPitch)[3], const BYTE* (&srcp)[3], const int(&srcPitch)[3], int width, int height, const ConversionMatrix& m);

#ifdef _MSC_VER
#pragma warning(pop)
#endif
6 changes: 6 additions & 0 deletions avs_core/convert/intel/convert_planar_avx2.h
@@ -45,4 +45,10 @@ __attribute__((__target__("avx2")))
#endif
void convert_planarrgb_to_yuv_uint16_avx2(BYTE *(&dstp)[3], int (&dstPitch)[3], const BYTE *(&srcp)[3], const int (&srcPitch)[3], int width, int height, const ConversionMatrix &m);

template<int rgb_pixel_step, bool hasAlpha>
#if defined(GCC) || defined(CLANG)
__attribute__((__target__("avx2")))
#endif
void convert_yv24_to_rgb_avx2(BYTE* dstp, const BYTE* srcY, const BYTE* srcU, const BYTE* srcV, const BYTE* srcA, size_t dst_pitch, size_t src_pitch_y, size_t src_pitch_uv, size_t src_pitch_a, size_t width, size_t height, const ConversionMatrix& matrix);

#endif
2 changes: 1 addition & 1 deletion avs_core/convert/intel/convert_planar_sse.cpp
@@ -1467,7 +1467,7 @@ void convert_yv24_to_rgb_sse2(BYTE* dstp, const BYTE* srcY, const BYTE* srcU, co
_mm_store_si128(reinterpret_cast<__m128i*>(temp), result_lo);
_mm_store_si128(reinterpret_cast<__m128i*>(temp + 16), result_hi);

for (int i = 0; i < 8; ++i) {
for (int i = 0; i < 7; ++i) {
*reinterpret_cast<int*>(dstp + (x + i) * 3) = *reinterpret_cast<int*>(temp + i * 4);
}
//last pixel
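// (each int store copies 4 bytes, so doing it for the 8th pixel as well would
// both duplicate the dedicated last-pixel handling and reach one byte past
// this 8-pixel, 24-byte rgb24 group)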
