Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SSE2 fillers #2566

Merged
merged 5 commits into from
Dec 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion buildconfig/Setup.Android.SDL2.in
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ key src_c/key.c $(SDL) $(DEBUG)
mouse src_c/mouse.c $(SDL) $(DEBUG)
rect src_c/rect.c src_c/pgcompat_rect.c $(SDL) $(DEBUG)
rwobject src_c/rwobject.c $(SDL) $(DEBUG)
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG)
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c src_c/simd_surface_fill_sse2.c $(SDL) $(DEBUG)
surflock src_c/surflock.c $(SDL) $(DEBUG)
time src_c/time.c $(SDL) $(DEBUG)
joystick src_c/joystick.c $(SDL) $(DEBUG)
Expand Down
2 changes: 1 addition & 1 deletion buildconfig/Setup.Emscripten.SDL2.in
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ _sdl2.controller_old src_c/void.c
#_sdl2.touch src_c/_sdl2/touch.c $(SDL) $(DEBUG) -Isrc_c
_sdl2.touch src_c/void.c

#transform src_c/simd_transform_sse2.c src_c/simd_transform_avx2.c src_c/transform.c src_c/rotozoom.c src_c/scale2x.c src_c/scale_mmx.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG) -D_NO_MMX_FOR_X86_64
#transform src_c/simd_transform_sse2.c src_c/simd_transform_avx2.c src_c/transform.c src_c/rotozoom.c src_c/scale2x.c src_c/scale_mmx.c src_c/simd_surface_fill_avx2.c src_c/simd_surface_fill_sse2.c $(SDL) $(DEBUG) -D_NO_MMX_FOR_X86_64
transform src_c/void.c


2 changes: 1 addition & 1 deletion buildconfig/Setup.SDL2.in
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ key src_c/key.c $(SDL) $(DEBUG)
mouse src_c/mouse.c $(SDL) $(DEBUG)
rect src_c/rect.c src_c/pgcompat_rect.c $(SDL) $(DEBUG)
rwobject src_c/rwobject.c $(SDL) $(DEBUG)
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG)
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c src_c/simd_surface_fill_sse2.c $(SDL) $(DEBUG)
surflock src_c/surflock.c $(SDL) $(DEBUG)
time src_c/time.c $(SDL) $(DEBUG)
joystick src_c/joystick.c $(SDL) $(DEBUG)
Expand Down
71 changes: 71 additions & 0 deletions src_c/simd_fill.h
Original file line number Diff line number Diff line change
@@ -1,16 +1,56 @@
#define NO_PYGAME_C_API
#include "_surface.h"

#if !defined(PG_ENABLE_ARM_NEON) && defined(__aarch64__)
// arm64 has neon optimisations enabled by default, even when fpu=neon is not
// passed
#define PG_ENABLE_ARM_NEON 1
#endif

/* See if we are compiled 64 bit on GCC or MSVC */
#if _WIN32 || _WIN64
#if _WIN64
#define ENV64BIT
#endif
#endif

// Check GCC
#if __GNUC__
#if __x86_64__ || __ppc64__ || __aarch64__
#define ENV64BIT
#endif
#endif

#if PG_ENABLE_ARM_NEON
// sse2neon.h is from here: https://github.com/DLTcollab/sse2neon
#include "include/sse2neon.h"
#endif /* PG_ENABLE_ARM_NEON */

/* PG_ENABLE_SSE_NEON is always defined after this point:
 *   1 when the compiler defines __SSE2__,
 *   1 when PG_ENABLE_ARM_NEON is non-zero (sse2neon.h is included above in
 *     that case),
 *   0 otherwise.
 * Test it by value (#if PG_ENABLE_SSE_NEON), not with defined(). */
#if defined(__SSE2__)
#define PG_ENABLE_SSE_NEON 1
#elif PG_ENABLE_ARM_NEON
#define PG_ENABLE_SSE_NEON 1
#else
#define PG_ENABLE_SSE_NEON 0
#endif

/* NOTE(review): presumably reports whether the AVX2 fillers may be used; the
 * definition is not visible here, so confirm against its source file.
 *
 * Declared with (void): in C an empty () leaves the parameter list
 * unspecified, so the compiler would not reject calls that pass arguments. */
int
_pg_has_avx2(void);

/* Returns non-zero if either SSE2 or NEON is present at runtime.
 * Relevant because both use the same code paths. Only the runtime SDL CPU
 * feature check that matches the compile-time target is compiled in; builds
 * with neither SSE2 nor NEON support always get 0.
 *
 * Declared with (void): in C an empty () leaves the parameter list
 * unspecified, so the compiler would not reject calls that pass arguments. */
int
_pg_HasSSE_NEON(void);

// AVX2 functions
int
surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);

int
surface_fill_blend_sub_avx2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
Expand All @@ -35,3 +75,34 @@ surface_fill_blend_max_avx2(SDL_Surface *surface, SDL_Rect *rect,
int
surface_fill_blend_rgba_max_avx2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
// SSE2 functions
//
// One pair of fillers per blend mode (add, sub, mult, min, max), defined in
// simd_surface_fill_sse2.c. Each takes the target surface, the rectangle to
// fill and the fill color, and returns 0.
//
// The plain variant adjusts the alpha bits of `color` before filling when the
// surface format has a non-zero Amask; the `rgba_` variant uses `color`
// unmodified.
//
// The implementations index pixels as Uint32 and use pitch / 4 as the row
// stride, and they do no clipping of `rect` themselves.
// NOTE(review): callers are presumably expected to pass a 32-bit surface and
// an already clipped rect -- confirm at the call sites.
//
// In builds without SSE2/NEON support these symbols still exist, but calling
// them prints a fatal error and invokes PG_EXIT(1).
int
surface_fill_blend_add_sse2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_rgba_add_sse2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_sub_sse2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_rgba_sub_sse2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_mult_sse2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_rgba_mult_sse2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_min_sse2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_rgba_min_sse2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_max_sse2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_rgba_max_sse2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
168 changes: 168 additions & 0 deletions src_c/simd_surface_fill_sse2.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
#include "simd_fill.h"

/* Prints a fatal-error message and then invokes PG_EXIT(1). Used as the body
 * of the stub fillers that are compiled when no SSE2/NEON code is built.
 *
 * The two statements are wrapped in do { ... } while (0) so the macro expands
 * to exactly one statement. With the bare two-statement form,
 * `if (cond) BAD_SSE2_FUNCTION_CALL;` would guard only the printf and run
 * PG_EXIT(1) unconditionally. Invoke it with a trailing semicolon. */
#define BAD_SSE2_FUNCTION_CALL                                                \
    do {                                                                      \
        printf(                                                               \
            "Fatal Error: Attempted calling an SSE2 function when both "      \
            "compile "                                                        \
            "time and runtime support is missing. If you are seeing this "    \
            "message, you have stumbled across a pygame bug, please report " \
            "it "                                                             \
            "to the devs!");                                                  \
        PG_EXIT(1);                                                           \
    } while (0)

/* Runtime check for the instruction set the SSE2 fillers were compiled for.
 *
 * Returns the result of SDL_HasSSE2() when built with __SSE2__, the result of
 * SDL_HasNEON() when PG_ENABLE_ARM_NEON is non-zero, and 0 when neither is
 * compiled in. Only one of the SDL checks is ever compiled.
 *
 * Defined with (void) rather than the obsolescent empty parameter list. */
int
_pg_HasSSE_NEON(void)
{
#if defined(__SSE2__)
    return SDL_HasSSE2();
#elif PG_ENABLE_ARM_NEON
    return SDL_HasNEON();
#else
    return 0;
#endif
}

/* Prologue shared by every SSE2 filler. Expands inside a function that has
 * `surface`, `rect` and `color` in scope and declares the locals that
 * RUN_SSE2_FILLER later uses by name:
 *
 *   width, height  - rect->w and rect->h
 *   skip           - Uint32 elements between the end of one rect row and the
 *                    start of the next (pitch / 4 - width)
 *   n_iters_4      - number of whole 4-pixel groups per row
 *   pxl_excess     - leftover pixels per row (width % 4)
 *   pixels         - pointer to the rect's top-left pixel; the surface is
 *                    addressed as Uint32 with a row stride of pitch / 4
 *   mm128_dst      - scratch register for destination pixels
 *   mm128_color    - `color` replicated into all four 32-bit lanes
 *
 * COLOR_PROCESS_CODE runs only when surface->format->Amask is non-zero and
 * may modify `color` (and read `amask`) before it is replicated into
 * mm128_color. No clipping of rect is done here.
 * NOTE(review): assumes a 4-byte-per-pixel surface -- confirm callers
 * guarantee this. */
#define SETUP_SSE2_FILLER(COLOR_PROCESS_CODE) \
/* initialize surface data */ \
int width = rect->w, height = rect->h; \
int skip = surface->pitch / 4 - width; \
/* indicates the number of pixels that can't be processed in 4-pixel \
* blocks */ \
int pxl_excess = width % 4; \
/* indicates the number of 4-pixel blocks that can be processed */ \
int n_iters_4 = width / 4; \
int i, j; \
/* load pixel data */ \
Uint32 *pixels = \
(Uint32 *)surface->pixels + rect->y * (surface->pitch / 4) + rect->x; \
\
__m128i mm128_dst; \
/* prep and load the color */ \
Uint32 amask = surface->format->Amask; \
if (amask) { \
{ \
COLOR_PROCESS_CODE \
} \
} \
__m128i mm128_color = _mm_set1_epi32(color);

/* Main loop shared by every SSE2 filler. Relies on the locals declared by
 * SETUP_SSE2_FILLER (height, n_iters_4, pxl_excess, skip, i, j, pixels,
 * mm128_dst) and applies FILL_CODE to every pixel of the rectangle.
 *
 * FILL_CODE must read and update mm128_dst in place. For each row:
 *   - whole groups of 4 pixels are loaded and stored with unaligned 128-bit
 *     accesses;
 *   - the remaining 0-3 pixels are handled one at a time through the low
 *     32-bit lane, so nothing past the end of the row is read or written;
 *   - `pixels` is then advanced by `skip` to the start of the next row.
 *
 * `height` is consumed by the loop. The tail loop needs no separate
 * `if (pxl_excess)` guard because it runs zero times when there is no
 * excess. */
#define RUN_SSE2_FILLER(FILL_CODE)                                  \
    while (height--) {                                              \
        for (i = 0; i < n_iters_4; i++) {                           \
            /* load 4 pixels */                                     \
            mm128_dst = _mm_loadu_si128((__m128i *)pixels);         \
                                                                    \
            {FILL_CODE}                                             \
                                                                    \
            /* store 4 pixels */                                    \
            _mm_storeu_si128((__m128i *)pixels, mm128_dst);         \
                                                                    \
            pixels += 4;                                            \
        }                                                           \
                                                                    \
        /* leftover pixels of the row, one per iteration */         \
        for (j = 0; j < pxl_excess; j++, pixels++) {                \
            mm128_dst = _mm_cvtsi32_si128(*pixels);                 \
                                                                    \
            {FILL_CODE}                                             \
                                                                    \
            *pixels = _mm_cvtsi128_si32(mm128_dst);                 \
        }                                                           \
        pixels += skip;                                             \
    }

/* Prologue for RUN_16BIT_SHUFFLE_OUT; expand after SETUP_SSE2_FILLER.
 * Declares the two scratch registers (shuff_dst, _shuff16_temp) and widens
 * mm128_color to 16 bits per byte: mm128_colorA receives the low 8 bytes and
 * mm128_colorB the high 8 bytes, each byte zero-extended. */
#define SETUP_SHUFFLE                                            \
    __m128i shuff_dst, _shuff16_temp;                            \
    __m128i mm128_colorA =                                       \
        _mm_unpacklo_epi8(mm128_color, _mm_setzero_si128());     \
    __m128i mm128_colorB =                                       \
        _mm_unpackhi_epi8(mm128_color, _mm_setzero_si128());

/* Runs FILL_CODE with 16 bits of headroom per color byte. Requires the
 * locals from SETUP_SHUFFLE and is meant to be passed as the FILL_CODE of
 * RUN_SSE2_FILLER.
 *
 * The bytes of mm128_dst are zero-extended into 16-bit lanes in two halves.
 * For each half, shuff_dst holds the widened destination and mm128_color is
 * overwritten with the matching widened color half (mm128_colorA, then
 * mm128_colorB), after which FILL_CODE runs. FILL_CODE must therefore
 * operate on shuff_dst and mm128_color, not on mm128_dst.
 *
 * The two results are packed back to bytes with unsigned saturation and
 * stored in mm128_dst. Side effect: mm128_color is left equal to
 * mm128_colorB when the macro finishes. */
#define RUN_16BIT_SHUFFLE_OUT(FILL_CODE) \
/* ==== shuffle pixels out into two registers each, src */ \
/* and dst set up for 16 bit math, like 0A0R0G0B ==== */ \
shuff_dst = _mm_unpacklo_epi8(mm128_dst, _mm_setzero_si128()); \
mm128_color = mm128_colorA; \
\
{FILL_CODE} \
\
_shuff16_temp = shuff_dst; \
\
shuff_dst = _mm_unpackhi_epi8(mm128_dst, _mm_setzero_si128()); \
mm128_color = mm128_colorB; \
\
{FILL_CODE} \
\
/* ==== recombine A and B pixels ==== */ \
mm128_dst = _mm_packus_epi16(_shuff16_temp, shuff_dst);

/* Defines the two fillers for blend mode NAME whose per-pixel operation works
 * directly on packed 8-bit channels:
 *
 *   surface_fill_blend_NAME_sse2       - passes COLOR_PROCESS_CODE to
 *       SETUP_SSE2_FILLER, which runs it only when the surface format has a
 *       non-zero Amask;
 *   surface_fill_blend_rgba_NAME_sse2  - passes an empty block instead, so
 *       `color` is used unmodified.
 *
 * Both then run RUN_SSE2_FILLER(FILL_CODE) and return 0. FILL_CODE must
 * update mm128_dst using mm128_color. */
#define FILLERS(NAME, COLOR_PROCESS_CODE, FILL_CODE) \
int surface_fill_blend_##NAME##_sse2(SDL_Surface *surface, \
SDL_Rect *rect, Uint32 color) \
{ \
SETUP_SSE2_FILLER(COLOR_PROCESS_CODE) \
RUN_SSE2_FILLER(FILL_CODE) \
return 0; \
} \
int surface_fill_blend_rgba_##NAME##_sse2(SDL_Surface *surface, \
SDL_Rect *rect, Uint32 color) \
{ \
SETUP_SSE2_FILLER({}) \
RUN_SSE2_FILLER(FILL_CODE) \
return 0; \
}

/* Same pair of functions as FILLERS (surface_fill_blend_NAME_sse2 and
 * surface_fill_blend_rgba_NAME_sse2, both returning 0), but for operations
 * that need 16-bit intermediate precision.
 *
 * SETUP_SHUFFLE is expanded after SETUP_SSE2_FILLER, and FILL_CODE is wrapped
 * in RUN_16BIT_SHUFFLE_OUT, so FILL_CODE must update shuff_dst (16-bit lanes)
 * using mm128_color rather than touching mm128_dst.
 *
 * As in FILLERS, only the non-rgba variant applies COLOR_PROCESS_CODE, and
 * only when the surface format has a non-zero Amask. */
#define FILLERS_SHUFF(NAME, COLOR_PROCESS_CODE, FILL_CODE) \
int surface_fill_blend_##NAME##_sse2(SDL_Surface *surface, \
SDL_Rect *rect, Uint32 color) \
{ \
SETUP_SSE2_FILLER(COLOR_PROCESS_CODE) \
SETUP_SHUFFLE \
RUN_SSE2_FILLER(RUN_16BIT_SHUFFLE_OUT(FILL_CODE)) \
return 0; \
} \
int surface_fill_blend_rgba_##NAME##_sse2(SDL_Surface *surface, \
SDL_Rect *rect, Uint32 color) \
{ \
SETUP_SSE2_FILLER({}) \
SETUP_SHUFFLE \
RUN_SSE2_FILLER(RUN_16BIT_SHUFFLE_OUT(FILL_CODE)) \
return 0; \
}

/* Body shared by every stub filler: report the bad call through
 * BAD_SSE2_FUNCTION_CALL, then return -1. */
#define INVALID_SSE2_FILLER(FN_NAME)                                  \
    int FN_NAME(SDL_Surface *surface, SDL_Rect *rect, Uint32 color)   \
    {                                                                 \
        BAD_SSE2_FUNCTION_CALL;                                       \
        return -1;                                                    \
    }

/* Stub replacements for the two fillers of blend mode NAME
 * (surface_fill_blend_NAME_sse2 and surface_fill_blend_rgba_NAME_sse2), used
 * when the real SSE2/NEON implementations are not compiled. They keep the
 * symbols defined but treat any call as a fatal error. */
#define INVALID_DEFS(NAME)                                     \
    INVALID_SSE2_FILLER(surface_fill_blend_##NAME##_sse2)      \
    INVALID_SSE2_FILLER(surface_fill_blend_rgba_##NAME##_sse2)

/* Per-pixel operations passed as FILL_CODE. The first four work on packed
 * unsigned bytes in mm128_dst and are used with FILLERS. */

/* dst = min(dst + color, 255) for every byte (saturating add) */
#define ADD_CODE mm128_dst = _mm_adds_epu8(mm128_dst, mm128_color);
/* dst = max(dst - color, 0) for every byte (saturating subtract) */
#define SUB_CODE mm128_dst = _mm_subs_epu8(mm128_dst, mm128_color);
/* dst = min(dst, color) for every byte */
#define MIN_CODE mm128_dst = _mm_min_epu8(mm128_dst, mm128_color);
/* dst = max(dst, color) for every byte */
#define MAX_CODE mm128_dst = _mm_max_epu8(mm128_dst, mm128_color);
/* dst = (dst * color + 255) >> 8 for every 16-bit lane. Works on shuff_dst,
 * so it must run inside RUN_16BIT_SHUFFLE_OUT (i.e. through FILLERS_SHUFF),
 * where both operands are bytes zero-extended to 16 bits. */
#define MULT_CODE \
{ \
shuff_dst = _mm_mullo_epi16(shuff_dst, mm128_color); \
shuff_dst = _mm_adds_epu16(shuff_dst, _mm_set1_epi16(255)); \
shuff_dst = _mm_srli_epi16(shuff_dst, 8); \
}

#if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
FILLERS(add, color &= ~amask;, ADD_CODE)
FILLERS(sub, color &= ~amask;, SUB_CODE)
FILLERS(min, color |= amask;, MIN_CODE)
FILLERS(max, color &= ~amask;, MAX_CODE)
FILLERS_SHUFF(mult, color |= amask;, MULT_CODE)
#else
INVALID_DEFS(add)
INVALID_DEFS(sub)
INVALID_DEFS(min)
INVALID_DEFS(max)
INVALID_DEFS(mult)
#endif /* defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) */
Loading
Loading