godotengine · akien-mga · Feb 7, 2024 · Feb 5, 2024
@@ -13,7 +13,6 @@ thirdparty_dir = "#thirdparty/etcpak/"
 thirdparty_sources = [
     "Dither.cpp",
     "ProcessDxtc.cpp",
-    "ProcessRgtc.cpp",
     "ProcessRGB.cpp",
     "Tables.cpp",
 ]

@@ -35,7 +35,6 @@
 
 #include <ProcessDxtc.hpp>
 #include <ProcessRGB.hpp>
-#include <ProcessRgtc.hpp>
 
 EtcpakType _determine_etc_type(Image::UsedChannels p_channels) {
 	switch (p_channels) {
@@ -246,11 +245,11 @@ void _compress_etcpak(EtcpakType p_compresstype, Image *r_img) {
 				break;
 
 			case EtcpakType::ETCPAK_TYPE_ETC2_R:
-				CompressEtc2R8(src_mip_read, dest_mip_write, blocks, mip_w);
+				CompressEacR(src_mip_read, dest_mip_write, blocks, mip_w);
 				break;
 
 			case EtcpakType::ETCPAK_TYPE_ETC2_RG:
-				CompressEtc2RG8(src_mip_read, dest_mip_write, blocks, mip_w);
+				CompressEacRg(src_mip_read, dest_mip_write, blocks, mip_w);
 				break;
 
 			case EtcpakType::ETCPAK_TYPE_DXT1:
@@ -263,11 +262,11 @@ void _compress_etcpak(EtcpakType p_compresstype, Image *r_img) {
 				break;
 
 			case EtcpakType::ETCPAK_TYPE_RGTC_R:
-				CompressRgtcR(src_mip_read, dest_mip_write, blocks, mip_w);
+				CompressBc4(src_mip_read, dest_mip_write, blocks, mip_w);
 				break;
 
 			case EtcpakType::ETCPAK_TYPE_RGTC_RG:
-				CompressRgtcRG(src_mip_read, dest_mip_write, blocks, mip_w);
+				CompressBc5(src_mip_read, dest_mip_write, blocks, mip_w);
 				break;
 
 			default:

@@ -213,7 +213,7 @@ its functionality to IPv4 only.
 ## etcpak
 
 - Upstream: https://github.com/wolfpld/etcpak
-- Version: 1.0 (153f0e04a18b93c277684b577365210adcf8e11c, 2022)
+- Version: git (5380688660a3801aec4b25483366027fe0442d7b, 2024)
 - License: BSD-3-Clause
 
 Files extracted from upstream source:
@@ -225,12 +225,6 @@ Files extracted from upstream source:
   ```
 - `AUTHORS.txt` and `LICENSE.txt`
 
-Two files (`ProcessRGB.{cpp,hpp}`) have been modified to provide ETC2_R and ETC2_RG compression,
-the changes are based on the existing code.
-
-Two files (`ProcessRgtc.{cpp,hpp}`) have been added to provide RGTC compression implementation,
-based on library's `ProcessDxtc.{cpp,hpp}`.
-
 ## fonts
 
 - `DroidSans*.woff2`:

@@ -739,18 +739,8 @@ static etcpak_force_inline uint64_t ProcessRGB_SSE( __m128i px0, __m128i px1, __
     return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) );
 }
 
-static etcpak_force_inline uint64_t ProcessAlpha_SSE( __m128i px0, __m128i px1, __m128i px2, __m128i px3 )
+static etcpak_force_inline uint64_t ProcessOneChannel_SSE( __m128i a )
 {
-    __m128i mask = _mm_setr_epi32( 0x0f0b0703, -1, -1, -1 );
-
-    __m128i m0 = _mm_shuffle_epi8( px0, mask );
-    __m128i m1 = _mm_shuffle_epi8( px1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
-    __m128i m2 = _mm_shuffle_epi8( px2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
-    __m128i m3 = _mm_shuffle_epi8( px3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
-    __m128i m4 = _mm_or_si128( m0, m1 );
-    __m128i m5 = _mm_or_si128( m2, m3 );
-    __m128i a = _mm_or_si128( m4, m5 );
-
     __m128i solidCmp = _mm_shuffle_epi8( a, _mm_setzero_si128() );
     __m128i cmpRes = _mm_cmpeq_epi8( a, solidCmp );
     if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) )
@@ -800,6 +790,21 @@ static etcpak_force_inline uint64_t ProcessAlpha_SSE( __m128i px0, __m128i px1,
     }
     return (uint64_t)(uint16_t)_mm_cvtsi128_si32( minmax ) | ( data << 16 );
 }
+
+static etcpak_force_inline uint64_t ProcessAlpha_SSE( __m128i px0, __m128i px1, __m128i px2, __m128i px3 )
+{ // Gathers byte 3 of every 32-bit lane of px0..px3 into one 16-byte vector and returns ProcessOneChannel_SSE() of it.
+    __m128i mask = _mm_setr_epi32( 0x0f0b0703, -1, -1, -1 ); // selector picks source bytes 3, 7, 11, 15; the -1 lanes zero their output bytes
+    // Each input's four gathered bytes are steered into a different 32-bit lane, so the ORs below merge them without overlap.
+    __m128i m0 = _mm_shuffle_epi8( px0, mask ); // px0 bytes -> output bytes 0..3
+    __m128i m1 = _mm_shuffle_epi8( px1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) ); // selector moved to lane 1: px1 -> bytes 4..7
+    __m128i m2 = _mm_shuffle_epi8( px2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) ); // selector moved to lane 2: px2 -> bytes 8..11
+    __m128i m3 = _mm_shuffle_epi8( px3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) ); // selector moved to lane 3: px3 -> bytes 12..15
+    __m128i m4 = _mm_or_si128( m0, m1 );
+    __m128i m5 = _mm_or_si128( m2, m3 );
+    __m128i a = _mm_or_si128( m4, m5 ); // all 16 gathered bytes, px0's first
+
+    return ProcessOneChannel_SSE( a );
+}
 #endif
 
 void CompressDxt1( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
@@ -954,3 +959,128 @@ void CompressDxt5( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t w
     }
     while( --blocks );
 }
+
+void CompressBc4( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
+{ // Writes one 64-bit word to dst per 4x4 block, built from the low 8 bits of each 32-bit source pixel.
+    int i = 0; // blocks emitted so far in the current row of blocks
+    auto ptr = dst;
+    do // do/while: the body runs before the count is tested, so blocks must be non-zero
+    {
+#ifdef __SSE4_1__
+        __m128i px0 = _mm_loadu_si128( (__m128i*)( src + width * 0 ) ); // unaligned 16-byte load: four pixels at src
+        __m128i px1 = _mm_loadu_si128( (__m128i*)( src + width * 1 ) ); // same four columns, width pixels further on (presumably the next image row)
+        __m128i px2 = _mm_loadu_si128( (__m128i*)( src + width * 2 ) );
+        __m128i px3 = _mm_loadu_si128( (__m128i*)( src + width * 3 ) );
+
+        src += 4; // step four pixels right to the next block
+        if( ++i == width/4 ) // width/4 blocks done: this row of blocks is finished
+        {
+            src += width * 3; // skip the three further pixel rows this row of blocks already covered
+            i = 0;
+        }
+
+        __m128i mask = _mm_setr_epi32( 0x0c080400, -1, -1, -1 ); // selector picks source bytes 0, 4, 8, 12; the -1 lanes zero their output bytes
+        // Each row's four gathered bytes are steered into a different 32-bit lane, so the ORs below merge them without overlap.
+        __m128i m0 = _mm_shuffle_epi8( px0, mask ); // px0 -> output bytes 0..3
+        __m128i m1 = _mm_shuffle_epi8( px1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) ); // px1 -> bytes 4..7
+        __m128i m2 = _mm_shuffle_epi8( px2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) ); // px2 -> bytes 8..11
+        __m128i m3 = _mm_shuffle_epi8( px3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) ); // px3 -> bytes 12..15
+        __m128i m4 = _mm_or_si128( m0, m1 );
+        __m128i m5 = _mm_or_si128( m2, m3 );
+
+        *ptr++ = ProcessOneChannel_SSE( _mm_or_si128( m4, m5 ) ); // one output word per block
+#else
+        uint8_t r[4*4]; // the block's 16 low bytes, four per source row
+        auto rgba = src;
+        for( int i=0; i<4; i++ ) // NOTE: this i shadows the outer block counter; here it indexes the four rows
+        {
+            r[i*4] = rgba[0] & 0xff; // keep only the low 8 bits of each pixel
+            r[i*4+1] = rgba[1] & 0xff;
+            r[i*4+2] = rgba[2] & 0xff;
+            r[i*4+3] = rgba[3] & 0xff;
+
+            rgba += width; // advance by width pixels to the same columns of the next row
+        }
+
+        src += 4; // step four pixels right to the next block
+        if( ++i == width/4 ) // outer i again: width/4 blocks done, this row of blocks is finished
+        {
+            src += width * 3; // skip the three further pixel rows this row of blocks already covered
+            i = 0;
+        }
+
+        *ptr++ = ProcessAlpha( r ); // one output word per block
+#endif
+    } while( --blocks );
+}
+
+void CompressBc5( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
+{ // Writes two 64-bit words to dst per 4x4 block: first from bits 0..7 of each 32-bit source pixel, then from bits 8..15.
+    int i = 0; // blocks emitted so far in the current row of blocks
+    auto ptr = dst;
+    do // do/while: the body runs before the count is tested, so blocks must be non-zero
+    {
+#ifdef __SSE4_1__
+        __m128i px0 = _mm_loadu_si128( (__m128i*)( src + width * 0 ) ); // unaligned 16-byte load: four pixels at src
+        __m128i px1 = _mm_loadu_si128( (__m128i*)( src + width * 1 ) ); // same four columns, width pixels further on (presumably the next image row)
+        __m128i px2 = _mm_loadu_si128( (__m128i*)( src + width * 2 ) );
+        __m128i px3 = _mm_loadu_si128( (__m128i*)( src + width * 3 ) );
+
+        src += 4; // step four pixels right to the next block
+        if( ++i == width/4 ) // width/4 blocks done: this row of blocks is finished
+        {
+            src += width*3; // skip the three further pixel rows this row of blocks already covered
+            i = 0;
+        }
+
+        __m128i mask = _mm_setr_epi32( 0x0c080400, -1, -1, -1 ); // first pass: selector picks source bytes 0, 4, 8, 12; -1 lanes zero their output bytes
+        // Each row's four gathered bytes are steered into a different 32-bit lane, so the ORs below merge them without overlap.
+        __m128i m0 = _mm_shuffle_epi8( px0, mask ); // px0 -> output bytes 0..3
+        __m128i m1 = _mm_shuffle_epi8( px1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) ); // px1 -> bytes 4..7
+        __m128i m2 = _mm_shuffle_epi8( px2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) ); // px2 -> bytes 8..11
+        __m128i m3 = _mm_shuffle_epi8( px3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) ); // px3 -> bytes 12..15
+        __m128i m4 = _mm_or_si128( m0, m1 );
+        __m128i m5 = _mm_or_si128( m2, m3 );
+
+        *ptr++ = ProcessOneChannel_SSE( _mm_or_si128( m4, m5 ) ); // first output word of the block
+
+        mask = _mm_setr_epi32( 0x0d090501, -1, -1, -1 ); // second pass: selector picks source bytes 1, 5, 9, 13
+        // Same lane-steering gather as above, reusing the temporaries.
+        m0 = _mm_shuffle_epi8( px0, mask );
+        m1 = _mm_shuffle_epi8( px1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
+        m2 = _mm_shuffle_epi8( px2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
+        m3 = _mm_shuffle_epi8( px3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
+        m4 = _mm_or_si128( m0, m1 );
+        m5 = _mm_or_si128( m2, m3 );
+
+        *ptr++ = ProcessOneChannel_SSE( _mm_or_si128( m4, m5 ) ); // second output word of the block
+#else
+        uint8_t rg[4*4*2]; // rg[0..15]: bits 0..7 of each pixel; rg[16..31]: bits 8..15
+        auto rgba = src;
+        for( int i=0; i<4; i++ ) // NOTE: this i shadows the outer block counter; here it indexes the four rows
+        {
+            rg[i*4] = rgba[0] & 0xff; // low 8 bits of each pixel
+            rg[i*4+1] = rgba[1] & 0xff;
+            rg[i*4+2] = rgba[2] & 0xff;
+            rg[i*4+3] = rgba[3] & 0xff;
+
+            rg[16+i*4] = (rgba[0] & 0xff00) >> 8; // bits 8..15 of each pixel
+            rg[16+i*4+1] = (rgba[1] & 0xff00) >> 8;
+            rg[16+i*4+2] = (rgba[2] & 0xff00) >> 8;
+            rg[16+i*4+3] = (rgba[3] & 0xff00) >> 8;
+
+            rgba += width; // advance by width pixels to the same columns of the next row
+        }
+
+        src += 4; // step four pixels right to the next block
+        if( ++i == width/4 ) // outer i again: width/4 blocks done, this row of blocks is finished
+        {
+            src += width*3; // skip the three further pixel rows this row of blocks already covered
+            i = 0;
+        }
+
+        *ptr++ = ProcessAlpha( rg ); // first output word: from the low-byte half
+        *ptr++ = ProcessAlpha( &rg[16] ); // second output word: from the bits-8..15 half
+#endif
+    } while( --blocks );
+}
@@ -8,4 +8,7 @@ void CompressDxt1( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t w
 void CompressDxt1Dither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
 void CompressDxt5( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
 
+void CompressBc4( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
+void CompressBc5( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
+
 #endif
@@ -3266,16 +3266,21 @@ etcpak_force_inline static int16x8_t WidenMultiplier_EAC_NEON( int16x8_t multipl
 
 #endif
 
+template<bool checkSolid = true>
 static etcpak_force_inline uint64_t ProcessAlpha_ETC2( const uint8_t* src )
 {
 #if defined __SSE4_1__
-    // Check solid
     __m128i s = _mm_loadu_si128( (__m128i*)src );
-    __m128i solidCmp = _mm_set1_epi8( src[0] );
-    __m128i cmpRes = _mm_cmpeq_epi8( s, solidCmp );
-    if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) )
+
+    if( checkSolid )
     {
-        return src[0];
+        // Check solid
+        __m128i solidCmp = _mm_set1_epi8( src[0] );
+        __m128i cmpRes = _mm_cmpeq_epi8( s, solidCmp );
+        if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) )
+        {
+            return src[0];
+        }
     }
 
     // Calculate min, max
@@ -3684,12 +3689,15 @@ static etcpak_force_inline uint64_t ProcessAlpha_ETC2( const uint8_t* src )
     int srcMid;
     uint8x16_t srcAlphaBlock = vld1q_u8( src );
     {
-        uint8_t ref = src[0];
-        uint8x16_t a0 = vdupq_n_u8( ref );
-        uint8x16_t r = vceqq_u8( srcAlphaBlock, a0 );
-        int64x2_t m = vreinterpretq_s64_u8( r );
-        if( m[0] == -1 && m[1] == -1 )
-            return ref;
+        if( checkSolid )
+        {
+            uint8_t ref = src[0];
+            uint8x16_t a0 = vdupq_n_u8( ref );
+            uint8x16_t r = vceqq_u8( srcAlphaBlock, a0 );
+            int64x2_t m = vreinterpretq_s64_u8( r );
+            if( m[0] == -1 && m[1] == -1 )
+                return ref;
+        }
 
         // srcRange
 #ifdef __aarch64__
@@ -3759,6 +3767,7 @@ static etcpak_force_inline uint64_t ProcessAlpha_ETC2( const uint8_t* src )
 #undef EAC_RECONSTRUCT_VALUE
 
 #else
+    if( checkSolid )
     {
         bool solid = true;
         const uint8_t* ptr = src + 1;
@@ -3849,7 +3858,6 @@ static etcpak_force_inline uint64_t ProcessAlpha_ETC2( const uint8_t* src )
 #endif
 }
 
-
 void CompressEtc1Alpha( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
 {
     int w = 0;
@@ -4176,14 +4184,13 @@ void CompressEtc2Rgba( const uint32_t* src, uint64_t* dst, uint32_t blocks, size
             src += width * 3;
             w = 0;
         }
-        *dst++ = ProcessAlpha_ETC2( alpha );
+        *dst++ = ProcessAlpha_ETC2<true>( alpha );
         *dst++ = ProcessRGB_ETC2( (uint8_t*)rgba, useHeuristics );
     }
     while( --blocks );
 }
 
-// -- GODOT start --
-void CompressEtc2R8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
+void CompressEacR( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
 {
     int w = 0;
     uint8_t r[4*4];
@@ -4239,12 +4246,12 @@ void CompressEtc2R8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t
             src += width * 3;
             w = 0;
         }
-        *dst++ = ProcessAlpha_ETC2( r );
+        *dst++ = ProcessAlpha_ETC2<false>( r );
     }
     while( --blocks );
 }
 
-void CompressEtc2RG8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
+void CompressEacRg( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
 {
     int w = 0;
     uint8_t rg[4*4*2];
@@ -4300,15 +4307,15 @@ void CompressEtc2RG8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_
             src += width;
             v = *src;
             *ptrr++ = (v & 0xff0000) >> 16;
-			*ptrg++ = (v & 0xff00) >> 8;
+            *ptrg++ = (v & 0xff00) >> 8;
             src += width;
             v = *src;
             *ptrr++ = (v & 0xff0000) >> 16;
-			*ptrg++ = (v & 0xff00) >> 8;
+            *ptrg++ = (v & 0xff00) >> 8;
             src += width;
             v = *src;
             *ptrr++ = (v & 0xff0000) >> 16;
-			*ptrg++ = (v & 0xff00) >> 8;
+            *ptrg++ = (v & 0xff00) >> 8;
             src -= width * 3 - 1;
         }
 #endif
@@ -4317,9 +4324,8 @@ void CompressEtc2RG8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_
             src += width * 3;
             w = 0;
         }
-        *dst++ = ProcessAlpha_ETC2( rg );
-        *dst++ = ProcessAlpha_ETC2( &rg[16] );
+        *dst++ = ProcessAlpha_ETC2<false>( rg );
+        *dst++ = ProcessAlpha_ETC2<false>( &rg[16] );
     }
     while( --blocks );
 }
-// -- GODOT end --
@@ -9,8 +9,8 @@ void CompressEtc1Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_
 void CompressEtc1RgbDither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
 void CompressEtc2Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics );
 void CompressEtc2Rgba( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics );
-// -- GODOT start --
-void CompressEtc2R8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
-void CompressEtc2RG8( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
-// -- GODOT end --
+
+void CompressEacR( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
+void CompressEacRg( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
+
 #endif