Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

etcpak: Sync with upstream #87984

Merged
merged 1 commit into from
Feb 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion modules/etcpak/SCsub
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ thirdparty_dir = "#thirdparty/etcpak/"
thirdparty_sources = [
"Dither.cpp",
"ProcessDxtc.cpp",
"ProcessRgtc.cpp",
"ProcessRGB.cpp",
"Tables.cpp",
]
Expand Down
9 changes: 4 additions & 5 deletions modules/etcpak/image_compress_etcpak.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@

#include <ProcessDxtc.hpp>
#include <ProcessRGB.hpp>
#include <ProcessRgtc.hpp>

EtcpakType _determine_etc_type(Image::UsedChannels p_channels) {
switch (p_channels) {
Expand Down Expand Up @@ -246,11 +245,11 @@ void _compress_etcpak(EtcpakType p_compresstype, Image *r_img) {
break;

case EtcpakType::ETCPAK_TYPE_ETC2_R:
CompressEtc2R8(src_mip_read, dest_mip_write, blocks, mip_w);
CompressEacR(src_mip_read, dest_mip_write, blocks, mip_w);
break;

case EtcpakType::ETCPAK_TYPE_ETC2_RG:
CompressEtc2RG8(src_mip_read, dest_mip_write, blocks, mip_w);
CompressEacRg(src_mip_read, dest_mip_write, blocks, mip_w);
break;

case EtcpakType::ETCPAK_TYPE_DXT1:
Expand All @@ -263,11 +262,11 @@ void _compress_etcpak(EtcpakType p_compresstype, Image *r_img) {
break;

case EtcpakType::ETCPAK_TYPE_RGTC_R:
CompressRgtcR(src_mip_read, dest_mip_write, blocks, mip_w);
CompressBc4(src_mip_read, dest_mip_write, blocks, mip_w);
break;

case EtcpakType::ETCPAK_TYPE_RGTC_RG:
CompressRgtcRG(src_mip_read, dest_mip_write, blocks, mip_w);
CompressBc5(src_mip_read, dest_mip_write, blocks, mip_w);
break;

default:
Expand Down
8 changes: 1 addition & 7 deletions thirdparty/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ its functionality to IPv4 only.
## etcpak

- Upstream: https://github.com/wolfpld/etcpak
- Version: 1.0 (153f0e04a18b93c277684b577365210adcf8e11c, 2022)
- Version: git (5380688660a3801aec4b25483366027fe0442d7b, 2024)
- License: BSD-3-Clause

Files extracted from upstream source:
Expand All @@ -225,12 +225,6 @@ Files extracted from upstream source:
```
- `AUTHORS.txt` and `LICENSE.txt`

Two files (`ProcessRGB.{cpp,hpp}`) have been modified to provide ETC2_R and ETC2_RG compression;
the changes are based on the existing code.

Two files (`ProcessRgtc.{cpp,hpp}`) have been added to provide an RGTC compression implementation
based on the library's `ProcessDxtc.{cpp,hpp}`.

## fonts

- `DroidSans*.woff2`:
Expand Down
152 changes: 141 additions & 11 deletions thirdparty/etcpak/ProcessDxtc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -739,18 +739,8 @@ static etcpak_force_inline uint64_t ProcessRGB_SSE( __m128i px0, __m128i px1, __
return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) );
}

static etcpak_force_inline uint64_t ProcessAlpha_SSE( __m128i px0, __m128i px1, __m128i px2, __m128i px3 )
static etcpak_force_inline uint64_t ProcessOneChannel_SSE( __m128i a )
{
__m128i mask = _mm_setr_epi32( 0x0f0b0703, -1, -1, -1 );

__m128i m0 = _mm_shuffle_epi8( px0, mask );
__m128i m1 = _mm_shuffle_epi8( px1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
__m128i m2 = _mm_shuffle_epi8( px2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
__m128i m3 = _mm_shuffle_epi8( px3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
__m128i m4 = _mm_or_si128( m0, m1 );
__m128i m5 = _mm_or_si128( m2, m3 );
__m128i a = _mm_or_si128( m4, m5 );

__m128i solidCmp = _mm_shuffle_epi8( a, _mm_setzero_si128() );
__m128i cmpRes = _mm_cmpeq_epi8( a, solidCmp );
if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) )
Expand Down Expand Up @@ -800,6 +790,21 @@ static etcpak_force_inline uint64_t ProcessAlpha_SSE( __m128i px0, __m128i px1,
}
return (uint64_t)(uint16_t)_mm_cvtsi128_si32( minmax ) | ( data << 16 );
}

// Gathers byte 3 of every 32-bit pixel in a 4x4 block (one __m128i per row of
// four pixels) into a single 16-byte vector, row 0 in the lowest dword and
// row 3 in the highest, and forwards it to ProcessOneChannel_SSE.
static etcpak_force_inline uint64_t ProcessAlpha_SSE( __m128i px0, __m128i px1, __m128i px2, __m128i px3 )
{
    // pshufb control dword: pick source bytes 3, 7, 11 and 15. Control bytes
    // of 0xff (the -1 dwords) have their high bit set, so they produce zero.
    const int pick = 0x0f0b0703;

    // One control vector per row, each placing its four bytes in a different dword.
    const __m128i sel0 = _mm_setr_epi32( pick, -1, -1, -1 );
    const __m128i sel1 = _mm_setr_epi32( -1, pick, -1, -1 );
    const __m128i sel2 = _mm_setr_epi32( -1, -1, pick, -1 );
    const __m128i sel3 = _mm_setr_epi32( -1, -1, -1, pick );

    const __m128i row0 = _mm_shuffle_epi8( px0, sel0 );
    const __m128i row1 = _mm_shuffle_epi8( px1, sel1 );
    const __m128i row2 = _mm_shuffle_epi8( px2, sel2 );
    const __m128i row3 = _mm_shuffle_epi8( px3, sel3 );

    // The rows occupy disjoint dwords, so OR-ing merges them without overlap.
    const __m128i channel = _mm_or_si128( _mm_or_si128( row0, row1 ), _mm_or_si128( row2, row3 ) );

    return ProcessOneChannel_SSE( channel );
}
#endif

void CompressDxt1( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
Expand Down Expand Up @@ -954,3 +959,128 @@ void CompressDxt5( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t w
}
while( --blocks );
}

// Encodes one channel of a 32-bit-per-pixel image into 64-bit blocks (BC4).
//
// The channel is the lowest byte of each source pixel (held in `r` in the
// scalar path). Each loop iteration consumes one 4x4 pixel tile and writes one
// uint64_t to `dst`.
//
//   src    - first pixel of the image; consecutive rows are `width` pixels apart.
//   dst    - output buffer, must hold `blocks` 64-bit words.
//   blocks - number of 4x4 tiles to encode; 0 is a no-op.
//   width  - row pitch in pixels. NOTE(review): presumably a multiple of 4,
//            since tiles are walked in steps of 4 pixels - confirm with callers.
void CompressBc4( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
{
    // The do/while below decrements before testing, so a zero count would
    // wrap around and run ~2^32 iterations. Bail out early instead.
    if( blocks == 0 ) return;

    const size_t tilesPerRow = width / 4;
    size_t col = 0;
    auto ptr = dst;
    do
    {
#ifdef __SSE4_1__
        __m128i px0 = _mm_loadu_si128( (const __m128i*)( src + width * 0 ) );
        __m128i px1 = _mm_loadu_si128( (const __m128i*)( src + width * 1 ) );
        __m128i px2 = _mm_loadu_si128( (const __m128i*)( src + width * 2 ) );
        __m128i px3 = _mm_loadu_si128( (const __m128i*)( src + width * 3 ) );

        // Step to the next tile; at the end of a tile row skip the three
        // pixel rows that were already consumed.
        src += 4;
        if( ++col == tilesPerRow )
        {
            src += width * 3;
            col = 0;
        }

        // pshufb control: pick bytes 0, 4, 8, 12 (lowest byte of each pixel);
        // the -1 lanes yield zero.
        __m128i mask = _mm_setr_epi32( 0x0c080400, -1, -1, -1 );

        // Each row's four bytes land in a different dword, then get OR-ed together.
        __m128i m0 = _mm_shuffle_epi8( px0, mask );
        __m128i m1 = _mm_shuffle_epi8( px1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
        __m128i m2 = _mm_shuffle_epi8( px2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
        __m128i m3 = _mm_shuffle_epi8( px3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
        __m128i m4 = _mm_or_si128( m0, m1 );
        __m128i m5 = _mm_or_si128( m2, m3 );

        *ptr++ = ProcessOneChannel_SSE( _mm_or_si128( m4, m5 ) );
#else
        // Scalar path: copy the lowest byte of the 16 tile pixels, row by row.
        uint8_t r[4*4];
        auto rgba = src;
        for( int row=0; row<4; row++ )
        {
            r[row*4] = rgba[0] & 0xff;
            r[row*4+1] = rgba[1] & 0xff;
            r[row*4+2] = rgba[2] & 0xff;
            r[row*4+3] = rgba[3] & 0xff;

            rgba += width;
        }

        // Step to the next tile; at the end of a tile row skip the three
        // pixel rows that were already consumed.
        src += 4;
        if( ++col == tilesPerRow )
        {
            src += width * 3;
            col = 0;
        }

        *ptr++ = ProcessAlpha( r );
#endif
    } while( --blocks );
}

// Encodes two channels of a 32-bit-per-pixel image into 64-bit blocks (BC5).
//
// For every 4x4 pixel tile two uint64_t words are written to `dst`: first the
// encoding of the lowest byte of each pixel, then the encoding of the second
// byte (bits 8..15).
//
//   src    - first pixel of the image; consecutive rows are `width` pixels apart.
//   dst    - output buffer, must hold 2 * `blocks` 64-bit words.
//   blocks - number of 4x4 tiles to encode; 0 is a no-op.
//   width  - row pitch in pixels. NOTE(review): presumably a multiple of 4,
//            since tiles are walked in steps of 4 pixels - confirm with callers.
void CompressBc5( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
{
    // The do/while below decrements before testing, so a zero count would
    // wrap around and run ~2^32 iterations. Bail out early instead.
    if( blocks == 0 ) return;

    const size_t tilesPerRow = width / 4;
    size_t col = 0;
    auto ptr = dst;
    do
    {
#ifdef __SSE4_1__
        __m128i px0 = _mm_loadu_si128( (const __m128i*)( src + width * 0 ) );
        __m128i px1 = _mm_loadu_si128( (const __m128i*)( src + width * 1 ) );
        __m128i px2 = _mm_loadu_si128( (const __m128i*)( src + width * 2 ) );
        __m128i px3 = _mm_loadu_si128( (const __m128i*)( src + width * 3 ) );

        // Step to the next tile; at the end of a tile row skip the three
        // pixel rows that were already consumed.
        src += 4;
        if( ++col == tilesPerRow )
        {
            src += width*3;
            col = 0;
        }

        // First channel: pshufb control picks bytes 0, 4, 8, 12 of each row;
        // the -1 lanes yield zero.
        __m128i mask = _mm_setr_epi32( 0x0c080400, -1, -1, -1 );

        __m128i m0 = _mm_shuffle_epi8( px0, mask );
        __m128i m1 = _mm_shuffle_epi8( px1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
        __m128i m2 = _mm_shuffle_epi8( px2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
        __m128i m3 = _mm_shuffle_epi8( px3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
        __m128i m4 = _mm_or_si128( m0, m1 );
        __m128i m5 = _mm_or_si128( m2, m3 );

        *ptr++ = ProcessOneChannel_SSE( _mm_or_si128( m4, m5 ) );

        // Second channel: same gather, one byte higher (bytes 1, 5, 9, 13).
        mask = _mm_setr_epi32( 0x0d090501, -1, -1, -1 );

        m0 = _mm_shuffle_epi8( px0, mask );
        m1 = _mm_shuffle_epi8( px1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
        m2 = _mm_shuffle_epi8( px2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
        m3 = _mm_shuffle_epi8( px3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
        m4 = _mm_or_si128( m0, m1 );
        m5 = _mm_or_si128( m2, m3 );

        *ptr++ = ProcessOneChannel_SSE( _mm_or_si128( m4, m5 ) );
#else
        // Scalar path: rg[0..15] holds the lowest byte of the 16 tile pixels,
        // rg[16..31] holds their second byte.
        uint8_t rg[4*4*2];
        auto rgba = src;
        for( int row=0; row<4; row++ )
        {
            rg[row*4] = rgba[0] & 0xff;
            rg[row*4+1] = rgba[1] & 0xff;
            rg[row*4+2] = rgba[2] & 0xff;
            rg[row*4+3] = rgba[3] & 0xff;

            rg[16+row*4] = (rgba[0] & 0xff00) >> 8;
            rg[16+row*4+1] = (rgba[1] & 0xff00) >> 8;
            rg[16+row*4+2] = (rgba[2] & 0xff00) >> 8;
            rg[16+row*4+3] = (rgba[3] & 0xff00) >> 8;

            rgba += width;
        }

        // Step to the next tile; at the end of a tile row skip the three
        // pixel rows that were already consumed.
        src += 4;
        if( ++col == tilesPerRow )
        {
            src += width*3;
            col = 0;
        }

        *ptr++ = ProcessAlpha( rg );
        *ptr++ = ProcessAlpha( &rg[16] );
#endif
    } while( --blocks );
}
3 changes: 3 additions & 0 deletions thirdparty/etcpak/ProcessDxtc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,7 @@ void CompressDxt1( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t w
void CompressDxt1Dither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
void CompressDxt5( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );

void CompressBc4( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
void CompressBc5( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );

#endif
52 changes: 29 additions & 23 deletions thirdparty/etcpak/ProcessRGB.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3266,16 +3266,21 @@ etcpak_force_inline static int16x8_t WidenMultiplier_EAC_NEON( int16x8_t multipl

#endif

template<bool checkSolid = true>
static etcpak_force_inline uint64_t ProcessAlpha_ETC2( const uint8_t* src )
{
#if defined __SSE4_1__
// Check solid
__m128i s = _mm_loadu_si128( (__m128i*)src );
__m128i solidCmp = _mm_set1_epi8( src[0] );
__m128i cmpRes = _mm_cmpeq_epi8( s, solidCmp );
if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) )

if( checkSolid )
{
return src[0];
// Check solid
__m128i solidCmp = _mm_set1_epi8( src[0] );
__m128i cmpRes = _mm_cmpeq_epi8( s, solidCmp );
if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) )
{
return src[0];
}
}

// Calculate min, max
Expand Down Expand Up @@ -3684,12 +3689,15 @@ static etcpak_force_inline uint64_t ProcessAlpha_ETC2( const uint8_t* src )
int srcMid;
uint8x16_t srcAlphaBlock = vld1q_u8( src );
{
uint8_t ref = src[0];
uint8x16_t a0 = vdupq_n_u8( ref );
uint8x16_t r = vceqq_u8( srcAlphaBlock, a0 );
int64x2_t m = vreinterpretq_s64_u8( r );
if( m[0] == -1 && m[1] == -1 )
return ref;
if( checkSolid )
{
uint8_t ref = src[0];
uint8x16_t a0 = vdupq_n_u8( ref );
uint8x16_t r = vceqq_u8( srcAlphaBlock, a0 );
int64x2_t m = vreinterpretq_s64_u8( r );
if( m[0] == -1 && m[1] == -1 )
return ref;
}

// srcRange
#ifdef __aarch64__
Expand Down Expand Up @@ -3759,6 +3767,7 @@ static etcpak_force_inline uint64_t ProcessAlpha_ETC2( const uint8_t* src )
#undef EAC_RECONSTRUCT_VALUE

#else
if( checkSolid )
{
bool solid = true;
const uint8_t* ptr = src + 1;
Expand Down Expand Up @@ -3849,7 +3858,6 @@ static etcpak_force_inline uint64_t ProcessAlpha_ETC2( const uint8_t* src )
#endif
}


void CompressEtc1Alpha( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
{
int w = 0;
Expand Down Expand Up @@ -4176,14 +4184,13 @@ void CompressEtc2Rgba( const uint32_t* src, uint64_t* dst, uint32_t blocks, size
src += width * 3;
w = 0;
}
*dst++ = ProcessAlpha_ETC2( alpha );
*dst++ = ProcessAlpha_ETC2<true>( alpha );
*dst++ = ProcessRGB_ETC2( (uint8_t*)rgba, useHeuristics );
}
while( --blocks );
}

// -- GODOT start --
void CompressEtc2R8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
void CompressEacR( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
{
int w = 0;
uint8_t r[4*4];
Expand Down Expand Up @@ -4239,12 +4246,12 @@ void CompressEtc2R8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t
src += width * 3;
w = 0;
}
*dst++ = ProcessAlpha_ETC2( r );
*dst++ = ProcessAlpha_ETC2<false>( r );
}
while( --blocks );
}

void CompressEtc2RG8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
void CompressEacRg( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
{
int w = 0;
uint8_t rg[4*4*2];
Expand Down Expand Up @@ -4300,15 +4307,15 @@ void CompressEtc2RG8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_
src += width;
v = *src;
*ptrr++ = (v & 0xff0000) >> 16;
*ptrg++ = (v & 0xff00) >> 8;
*ptrg++ = (v & 0xff00) >> 8;
src += width;
v = *src;
*ptrr++ = (v & 0xff0000) >> 16;
*ptrg++ = (v & 0xff00) >> 8;
*ptrg++ = (v & 0xff00) >> 8;
src += width;
v = *src;
*ptrr++ = (v & 0xff0000) >> 16;
*ptrg++ = (v & 0xff00) >> 8;
*ptrg++ = (v & 0xff00) >> 8;
src -= width * 3 - 1;
}
#endif
Expand All @@ -4317,9 +4324,8 @@ void CompressEtc2RG8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_
src += width * 3;
w = 0;
}
*dst++ = ProcessAlpha_ETC2( rg );
*dst++ = ProcessAlpha_ETC2( &rg[16] );
*dst++ = ProcessAlpha_ETC2<false>( rg );
*dst++ = ProcessAlpha_ETC2<false>( &rg[16] );
}
while( --blocks );
}
// -- GODOT end --
8 changes: 4 additions & 4 deletions thirdparty/etcpak/ProcessRGB.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ void CompressEtc1Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_
void CompressEtc1RgbDither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
void CompressEtc2Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics );
void CompressEtc2Rgba( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics );
// -- GODOT start --
void CompressEtc2R8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
void CompressEtc2RG8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
// -- GODOT end --

void CompressEacR( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
void CompressEacRg( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );

#endif
Loading
Loading