Skip to content

Commit

Permalink
Merge pull request #2566 from itzpr3d4t0r/add-sse2-fillers
Browse files Browse the repository at this point in the history
Add SSE2 fillers
  • Loading branch information
Starbuck5 authored Dec 11, 2023
2 parents 5482b63 + ecce2b0 commit 6111af4
Show file tree
Hide file tree
Showing 6 changed files with 364 additions and 38 deletions.
2 changes: 1 addition & 1 deletion buildconfig/Setup.Android.SDL2.in
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ key src_c/key.c $(SDL) $(DEBUG)
mouse src_c/mouse.c $(SDL) $(DEBUG)
rect src_c/rect.c src_c/pgcompat_rect.c $(SDL) $(DEBUG)
rwobject src_c/rwobject.c $(SDL) $(DEBUG)
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG)
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c src_c/simd_surface_fill_sse2.c $(SDL) $(DEBUG)
surflock src_c/surflock.c $(SDL) $(DEBUG)
time src_c/time.c $(SDL) $(DEBUG)
joystick src_c/joystick.c $(SDL) $(DEBUG)
Expand Down
2 changes: 1 addition & 1 deletion buildconfig/Setup.Emscripten.SDL2.in
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ _sdl2.controller_old src_c/void.c
#_sdl2.touch src_c/_sdl2/touch.c $(SDL) $(DEBUG) -Isrc_c
_sdl2.touch src_c/void.c

#transform src_c/simd_transform_sse2.c src_c/simd_transform_avx2.c src_c/transform.c src_c/rotozoom.c src_c/scale2x.c src_c/scale_mmx.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG) -D_NO_MMX_FOR_X86_64
#transform src_c/simd_transform_sse2.c src_c/simd_transform_avx2.c src_c/transform.c src_c/rotozoom.c src_c/scale2x.c src_c/scale_mmx.c src_c/simd_surface_fill_avx2.c src_c/simd_surface_fill_sse2.c $(SDL) $(DEBUG) -D_NO_MMX_FOR_X86_64
transform src_c/void.c


2 changes: 1 addition & 1 deletion buildconfig/Setup.SDL2.in
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ key src_c/key.c $(SDL) $(DEBUG)
mouse src_c/mouse.c $(SDL) $(DEBUG)
rect src_c/rect.c src_c/pgcompat_rect.c $(SDL) $(DEBUG)
rwobject src_c/rwobject.c $(SDL) $(DEBUG)
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG)
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c src_c/simd_surface_fill_sse2.c $(SDL) $(DEBUG)
surflock src_c/surflock.c $(SDL) $(DEBUG)
time src_c/time.c $(SDL) $(DEBUG)
joystick src_c/joystick.c $(SDL) $(DEBUG)
Expand Down
71 changes: 71 additions & 0 deletions src_c/simd_fill.h
Original file line number Diff line number Diff line change
@@ -1,16 +1,56 @@
#define NO_PYGAME_C_API
#include "_surface.h"

#if !defined(PG_ENABLE_ARM_NEON) && defined(__aarch64__)
// arm64 has neon optimisations enabled by default, even when fpu=neon is not
// passed
#define PG_ENABLE_ARM_NEON 1
#endif

/* See if we are compiled 64 bit on GCC or MSVC */
#if _WIN32 || _WIN64
#if _WIN64
#define ENV64BIT
#endif
#endif

// Check GCC
#if __GNUC__
#if __x86_64__ || __ppc64__ || __aarch64__
#define ENV64BIT
#endif
#endif

#if PG_ENABLE_ARM_NEON
// sse2neon.h is from here: https://github.com/DLTcollab/sse2neon
#include "include/sse2neon.h"
#endif /* PG_ENABLE_ARM_NEON */

#if defined(__SSE2__)
#define PG_ENABLE_SSE_NEON 1
#elif PG_ENABLE_ARM_NEON
#define PG_ENABLE_SSE_NEON 1
#else
#define PG_ENABLE_SSE_NEON 0
#endif

/* NOTE(review): presumably the runtime AVX2 availability check used by the
 * AVX2 fillers; its definition is not part of this header - confirm.
 * Declared with (void) so the compiler checks calls against a real
 * prototype instead of an unspecified parameter list. */
int
_pg_has_avx2(void);

/* This returns True if either SSE2 or NEON is present at runtime.
 * Relevant because they use the same codepaths. Only the relevant runtime
 * SDL cpu feature check is compiled in.*/
int
_pg_HasSSE_NEON(void);

// AVX2 functions
int
surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);

int
surface_fill_blend_sub_avx2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
Expand All @@ -35,3 +75,34 @@ surface_fill_blend_max_avx2(SDL_Surface *surface, SDL_Rect *rect,
int
surface_fill_blend_rgba_max_avx2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
// SSE2 functions
//
// One pair of fillers per blend operation (add, sub, mult, min, max): the
// plain variant and the `rgba` variant. All of them take the target surface,
// the rectangle to fill and the packed fill color, and return an int status.
// The definitions are generated by macros in simd_surface_fill_sse2.c.
int
surface_fill_blend_add_sse2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_rgba_add_sse2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_sub_sse2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_rgba_sub_sse2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_mult_sse2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_rgba_mult_sse2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_min_sse2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_rgba_min_sse2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_max_sse2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_rgba_max_sse2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
168 changes: 168 additions & 0 deletions src_c/simd_surface_fill_sse2.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
#include "simd_fill.h"

/* Prints a diagnostic for an impossible dispatch (an SSE2 filler reached in a
 * build without SSE2/NEON support) and then invokes PG_EXIT(1).
 *
 * The two statements are wrapped in do { ... } while (0) so the macro expands
 * to exactly one statement: `if (cond) BAD_SSE2_FUNCTION_CALL; else ...`
 * stays well-formed and PG_EXIT cannot escape an unbraced conditional.
 * Invoke it with a trailing semicolon. */
#define BAD_SSE2_FUNCTION_CALL                                               \
    do {                                                                     \
        printf(                                                              \
            "Fatal Error: Attempted calling an SSE2 function when both compile " \
            "time and runtime support is missing. If you are seeing this "   \
            "message, you have stumbled across a pygame bug, please report it " \
            "to the devs!");                                                 \
        PG_EXIT(1);                                                          \
    } while (0)

/* Runtime check for the SIMD backend used by the fillers in this file.
 *
 * Returns the result of SDL_HasSSE2() when compiled with __SSE2__, the result
 * of SDL_HasNEON() when PG_ENABLE_ARM_NEON is set, and 0 when neither code
 * path is compiled in. Only one of the SDL checks is ever compiled.
 *
 * Defined with (void) so the definition doubles as a real prototype and
 * callers passing arguments are diagnosed. */
int
_pg_HasSSE_NEON(void)
{
#if defined(__SSE2__)
    return SDL_HasSSE2();
#elif PG_ENABLE_ARM_NEON
    return SDL_HasNEON();
#else
    return 0;
#endif
}

/* Declares and initialises the locals shared by every SSE2 filler body.
 *
 * Must be expanded at the top of a function that has `surface`
 * (SDL_Surface *), `rect` (SDL_Rect *) and `color` (Uint32) in scope.
 * It introduces, by name, the variables the other macros rely on:
 *   width, height - fill size taken from rect->w / rect->h
 *   skip          - Uint32 elements between the end of one filled row and
 *                   the start of the next (pitch / 4 - width)
 *   n_iters_4     - number of full 4-pixel blocks per row
 *   pxl_excess    - leftover pixels per row (width % 4)
 *   pixels        - pointer to the first pixel of the fill area
 *   mm128_dst     - scratch register for destination pixels
 *   amask         - the surface format's alpha mask
 *   mm128_color   - `color` broadcast to all four 32-bit lanes
 *
 * COLOR_PROCESS_CODE is expanded only inside `if (amask)`, i.e. it runs only
 * for surfaces whose format has a non-zero alpha mask, and it may modify
 * `color` before it is broadcast.
 *
 * NOTE(review): pixels are addressed as Uint32 and the pitch is divided by
 * 4, so this presumably expects 4-byte-per-pixel surfaces and a rect already
 * clipped to the surface - confirm with the caller. */
#define SETUP_SSE2_FILLER(COLOR_PROCESS_CODE) \
/* initialize surface data */ \
int width = rect->w, height = rect->h; \
int skip = surface->pitch / 4 - width; \
/* indicates the number of pixels that can't be processed in 4-pixel \
* blocks */ \
int pxl_excess = width % 4; \
/* indicates the number of 4-pixel blocks that can be processed */ \
int n_iters_4 = width / 4; \
int i, j; \
/* load pixel data */ \
Uint32 *pixels = \
(Uint32 *)surface->pixels + rect->y * (surface->pitch / 4) + rect->x; \
\
__m128i mm128_dst; \
/* prep and load the color */ \
Uint32 amask = surface->format->Amask; \
if (amask) { \
{ \
COLOR_PROCESS_CODE \
} \
} \
__m128i mm128_color = _mm_set1_epi32(color);

/* Row-by-row fill loop; relies on the locals declared by SETUP_SSE2_FILLER.
 *
 * For each of the `height` rows it:
 *   1. processes n_iters_4 blocks of 4 pixels with unaligned 128-bit
 *      load/store,
 *   2. processes the remaining pxl_excess pixels one at a time, moving each
 *      through the low 32 bits of mm128_dst,
 *   3. advances `pixels` by `skip` to reach the start of the next row.
 *
 * FILL_CODE is expanded twice (block path and single-pixel path) and must
 * update mm128_dst in place. `height` is consumed by the loop. */
#define RUN_SSE2_FILLER(FILL_CODE) \
while (height--) { \
for (i = 0; i < n_iters_4; i++) { \
/* load 4 pixels */ \
mm128_dst = _mm_loadu_si128((__m128i *)pixels); \
\
{FILL_CODE} \
\
/* store 4 pixels */ \
_mm_storeu_si128((__m128i *)pixels, mm128_dst); \
\
pixels += 4; \
} \
\
if (pxl_excess) { \
for (j = 0; j < pxl_excess; j++, pixels++) { \
mm128_dst = _mm_cvtsi32_si128(*pixels); \
\
{FILL_CODE} \
\
*pixels = _mm_cvtsi128_si32(mm128_dst); \
} \
} \
pixels += skip; \
}

/* Setup for RUN_16BIT_SHUFFLE_OUT.
 *
 * Must follow SETUP_SSE2_FILLER: it reads mm128_color once and widens its
 * bytes to 16-bit lanes by interleaving with zero. mm128_colorA holds the
 * low 8 bytes, mm128_colorB the high 8 bytes. It also declares the scratch
 * registers shuff_dst and _shuff16_temp used by RUN_16BIT_SHUFFLE_OUT. */
#define SETUP_SHUFFLE \
__m128i shuff_dst, _shuff16_temp, mm128_colorA, mm128_colorB; \
mm128_colorA = _mm_unpacklo_epi8(mm128_color, _mm_setzero_si128()); \
mm128_colorB = _mm_unpackhi_epi8(mm128_color, _mm_setzero_si128());

/* Runs FILL_CODE with 16-bit precision on the pixels held in mm128_dst.
 *
 * The low and high 8 bytes of mm128_dst are widened in turn to 16-bit lanes
 * in shuff_dst. mm128_color is overwritten with the matching widened color
 * half (mm128_colorA, then mm128_colorB) before each expansion of FILL_CODE.
 * FILL_CODE must therefore operate on shuff_dst and mm128_color. The two
 * results are packed back to bytes with unsigned saturation into mm128_dst.
 *
 * Statement order matters: the low-half result is saved in _shuff16_temp
 * before shuff_dst is reused for the high half. Requires SETUP_SHUFFLE. */
#define RUN_16BIT_SHUFFLE_OUT(FILL_CODE) \
/* ==== shuffle pixels out into two registers each, src */ \
/* and dst set up for 16 bit math, like 0A0R0G0B ==== */ \
shuff_dst = _mm_unpacklo_epi8(mm128_dst, _mm_setzero_si128()); \
mm128_color = mm128_colorA; \
\
{FILL_CODE} \
\
_shuff16_temp = shuff_dst; \
\
shuff_dst = _mm_unpackhi_epi8(mm128_dst, _mm_setzero_si128()); \
mm128_color = mm128_colorB; \
\
{FILL_CODE} \
\
/* ==== recombine A and B pixels ==== */ \
mm128_dst = _mm_packus_epi16(_shuff16_temp, shuff_dst);

/* Generates the two public fillers for a bytewise blend operation:
 *   surface_fill_blend_<NAME>_sse2      - applies COLOR_PROCESS_CODE to the
 *                                         color when the surface has an
 *                                         alpha mask
 *   surface_fill_blend_rgba_<NAME>_sse2 - uses the color unchanged
 * Both run FILL_CODE over the rect through RUN_SSE2_FILLER and return 0. */
#define FILLERS(NAME, COLOR_PROCESS_CODE, FILL_CODE) \
int surface_fill_blend_##NAME##_sse2(SDL_Surface *surface, \
SDL_Rect *rect, Uint32 color) \
{ \
SETUP_SSE2_FILLER(COLOR_PROCESS_CODE) \
RUN_SSE2_FILLER(FILL_CODE) \
return 0; \
} \
int surface_fill_blend_rgba_##NAME##_sse2(SDL_Surface *surface, \
SDL_Rect *rect, Uint32 color) \
{ \
SETUP_SSE2_FILLER({}) \
RUN_SSE2_FILLER(FILL_CODE) \
return 0; \
}

/* Same as FILLERS, but for operations that need 16-bit intermediate math:
 * FILL_CODE is wrapped in RUN_16BIT_SHUFFLE_OUT, so it must operate on
 * shuff_dst / mm128_color rather than on mm128_dst. SETUP_SHUFFLE is
 * expanded after SETUP_SSE2_FILLER so the widened color halves are built
 * from the already processed color. Both generated functions return 0. */
#define FILLERS_SHUFF(NAME, COLOR_PROCESS_CODE, FILL_CODE) \
int surface_fill_blend_##NAME##_sse2(SDL_Surface *surface, \
SDL_Rect *rect, Uint32 color) \
{ \
SETUP_SSE2_FILLER(COLOR_PROCESS_CODE) \
SETUP_SHUFFLE \
RUN_SSE2_FILLER(RUN_16BIT_SHUFFLE_OUT(FILL_CODE)) \
return 0; \
} \
int surface_fill_blend_rgba_##NAME##_sse2(SDL_Surface *surface, \
SDL_Rect *rect, Uint32 color) \
{ \
SETUP_SSE2_FILLER({}) \
SETUP_SHUFFLE \
RUN_SSE2_FILLER(RUN_16BIT_SHUFFLE_OUT(FILL_CODE)) \
return 0; \
}

/* Generates stub definitions of surface_fill_blend_<NAME>_sse2 and
 * surface_fill_blend_rgba_<NAME>_sse2 for builds without the SIMD code path,
 * so the symbols declared in simd_fill.h still exist. Each stub reports the
 * error through BAD_SSE2_FUNCTION_CALL; the `return -1` after it is only
 * reached if PG_EXIT returns. */
#define INVALID_DEFS(NAME) \
int surface_fill_blend_##NAME##_sse2(SDL_Surface *surface, \
SDL_Rect *rect, Uint32 color) \
{ \
BAD_SSE2_FUNCTION_CALL; \
return -1; \
} \
int surface_fill_blend_rgba_##NAME##_sse2(SDL_Surface *surface, \
SDL_Rect *rect, Uint32 color) \
{ \
BAD_SSE2_FUNCTION_CALL; \
return -1; \
}

/* Per-operation kernels passed as FILL_CODE to the filler generators.
 *
 * ADD/SUB/MIN/MAX act on the 16 unsigned bytes of mm128_dst against
 * mm128_color: saturating add, saturating subtract, bytewise minimum and
 * bytewise maximum respectively. */
#define ADD_CODE mm128_dst = _mm_adds_epu8(mm128_dst, mm128_color);
#define SUB_CODE mm128_dst = _mm_subs_epu8(mm128_dst, mm128_color);
#define MIN_CODE mm128_dst = _mm_min_epu8(mm128_dst, mm128_color);
#define MAX_CODE mm128_dst = _mm_max_epu8(mm128_dst, mm128_color);
/* MULT works on the widened 16-bit lanes (shuff_dst), so it is only valid
 * inside RUN_16BIT_SHUFFLE_OUT. Per lane it computes
 * (dst * color + 255) >> 8, which approximates dst * color / 255. */
#define MULT_CODE \
{ \
shuff_dst = _mm_mullo_epi16(shuff_dst, mm128_color); \
shuff_dst = _mm_adds_epu16(shuff_dst, _mm_set1_epi16(255)); \
shuff_dst = _mm_srli_epi16(shuff_dst, 8); \
}

/* Instantiate the public fillers.
 *
 * The guard tests the VALUE of PG_ENABLE_ARM_NEON, matching the checks in
 * _pg_HasSSE_NEON() and simd_fill.h. The previous defined() test was also
 * true for a build that sets PG_ENABLE_ARM_NEON to 0, which would expand the
 * intrinsic-based bodies although sse2neon.h was never included.
 *
 * Color pre-processing for the non-rgba variants (applied only when the
 * surface has an alpha mask) makes the alpha channel a no-op for each
 * operation: add/sub/max use alpha 0, min/mult use alpha 255. */
#if defined(__SSE2__) || PG_ENABLE_ARM_NEON
FILLERS(add, color &= ~amask;, ADD_CODE)
FILLERS(sub, color &= ~amask;, SUB_CODE)
FILLERS(min, color |= amask;, MIN_CODE)
FILLERS(max, color &= ~amask;, MAX_CODE)
FILLERS_SHUFF(mult, color |= amask;, MULT_CODE)
#else
INVALID_DEFS(add)
INVALID_DEFS(sub)
INVALID_DEFS(min)
INVALID_DEFS(max)
INVALID_DEFS(mult)
#endif /* defined(__SSE2__) || PG_ENABLE_ARM_NEON */
Loading

0 comments on commit 6111af4

Please sign in to comment.