source/YCoCgDXT.cpp

/*
 YCoCgDXT.c
 Hap Codec

 Copyright (c) 2012-2013, Tom Butterworth and Vidvox LLC. All rights reserved. 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 
 * Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 
 * Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in the
 documentation and/or other materials provided with the distribution.
 
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


    Based on code by J.M.P. van Waveren / id Software, Inc.
    and changes by Chris Sidhall / Electronic Arts

    My changes are trivial:
      - Remove dependencies on other EAWebKit files
      - Mark unexported functions as static
      - Refactor to eliminate use of a global variable
      - Correct spelling of NVIDIA_7X_HARDWARE_BUG_FIX macro
      - Remove single usage of an assert macro


 Copyright (C) 2009-2011 Electronic Arts, Inc.  All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:
 
 1.  Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2.  Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in the
 documentation and/or other materials provided with the distribution.
 3.  Neither the name of Electronic Arts, Inc. ("EA") nor the names of
 its contributors may be used to endorse or promote products derived
 from this software without specific prior written permission.
 
 THIS SOFTWARE IS PROVIDED BY ELECTRONIC ARTS AND ITS CONTRIBUTORS "AS IS" AND ANY
 EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL ELECTRONIC ARTS OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

///////////////////////////////////////////////////////////////////////////////
// BCImageCompressionEA.cpp
// File created by Chrs Sidhall
// Also please see Copyright (C) 2007 Id Software, Inc used in this file.
///////////////////////////////////////////////////////////////////////////////

#include "YCoCgDXT.h"
#include <string.h>
#include <stdlib.h>

/* ALWAYS_INLINE */
/* Derived from EAWebKit's AlwaysInline.h, losing some of its support for other compilers */

#ifndef ALWAYS_INLINE
#if (defined(__GNUC__) || defined(__clang__)) && !defined(DEBUG)
#define ALWAYS_INLINE inline __attribute__((__always_inline__))
#elif defined(_MSC_VER) && defined(NDEBUG)
#define ALWAYS_INLINE __forceinline
#else
#define ALWAYS_INLINE inline
#endif
#endif

// CSidhall Note: The compression code is directly from http://developer.nvidia.com/object/real-time-ycocg-dxt-compression.html
// It was missing some Emit functions but have tried to keep it as close as possible to the orignal version.
// Also removed some alpha handling which was never used and added a few overloaded functions (like ExtractBlock).


/*
 Real-Time YCoCg DXT Compression
 Copyright (C) 2007 Id Software, Inc.
 Written by J.M.P. van Waveren
 
 This code is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.
 
 This code is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 Lesser General Public License for more details.
 */

/*
 * This code was modified by Electronic Arts Inc Copyright � 2009
 */

#ifndef word
typedef unsigned short  word;
#endif 

#ifndef dword
typedef unsigned int    dword;
#endif

#define INSET_COLOR_SHIFT       4       // inset color bounding box
#define INSET_ALPHA_SHIFT       5       // inset alpha bounding box

#define C565_5_MASK             0xF8    // 0xFF minus last three bits
#define C565_6_MASK             0xFC    // 0xFF minus last two bits

#define NVIDIA_G7X_HARDWARE_BUG_FIX     // keep the colors sorted as: max, min

#if defined(__LITTLE_ENDIAN__) || defined(_WIN32)
#define EA_SYSTEM_LITTLE_ENDIAN
#endif

static ALWAYS_INLINE word ColorTo565( const byte *color ) {
    return ( ( color[ 0 ] >> 3 ) << 11 ) | ( ( color[ 1 ] >> 2 ) << 5 ) | ( color[ 2 ] >> 3 );
}

static ALWAYS_INLINE void EmitByte( byte b, byte **outData ) {
    (*outData)[0] = b;
    *outData += 1;
}

static ALWAYS_INLINE void EmitUInt( unsigned int s, byte **outData ){
    (*outData)[0] = ( s >>  0 ) & 255;
    (*outData)[1] = ( s >>  8 ) & 255;
    (*outData)[2] = ( s >>  16 ) & 255;
    (*outData)[3] = ( s >>  24 ) & 255;
    *outData += 4;
}

static ALWAYS_INLINE void EmitUShort( unsigned short s, byte **outData ){
    (*outData)[0] = ( s >>  0 ) & 255;
    (*outData)[1] = ( s >>  8 ) & 255;
    *outData += 2;
}

static ALWAYS_INLINE void ExtractBlock( const byte *inPtr, const int stride, byte *colorBlock ) {
    for ( int j = 0; j < 4; j++ ) {
        memcpy( &colorBlock[j*4*4], inPtr, 4*4 );
        inPtr += stride;
    }
}

// This box extract replicates the last rows and columns if the row or columns are not 4 texels aligned
// This is so we don't get random pixels which could affect the color interpolation
static void ExtractBlock( const byte *inPtr, const int stride, const int widthRemain, const int heightRemain, byte *colorBlock ) {
    int *pBlock32 = (int *) colorBlock;  // Since we are using ARGA, we assume 4 byte alignment is already being used
    int *pSource32 = (int*) inPtr; 
    
    int hIndex=0;
    for(int j =0; j < 4; j++) {
        int wIndex = 0;
        for(int i=0; i < 4; i++) {
            pBlock32[i] = pSource32[wIndex];    
            // Set up offset for next column source (keep existing if we are at the end)         
            if(wIndex < (widthRemain - 1)) {
                wIndex++;
            }
        }
        
        // Set up offset for next texel row source (keep existing if we are at the end)
        pBlock32 +=4;    
        if(hIndex < (heightRemain-1)) {
            pSource32 +=(stride >> 2);
            hIndex++;
        }
    }
}

static void GetMinMaxYCoCg( byte *colorBlock, byte *minColor, byte *maxColor ) {
    minColor[0] = minColor[1] = minColor[2] = minColor[3] = 255;
    maxColor[0] = maxColor[1] = maxColor[2] = maxColor[3] = 0;
    
    for ( int i = 0; i < 16; i++ ) {
        if ( colorBlock[i*4+0] < minColor[0] ) {
            minColor[0] = colorBlock[i*4+0];
        }
        if ( colorBlock[i*4+1] < minColor[1] ) {
            minColor[1] = colorBlock[i*4+1];
        }
        // Note: the alpha is not used so no point in checking for it
        //        if ( colorBlock[i*4+2] < minColor[2] ) {
        //            minColor[2] = colorBlock[i*4+2];
        //        }
        if ( colorBlock[i*4+3] < minColor[3] ) {
            minColor[3] = colorBlock[i*4+3];
        }
        if ( colorBlock[i*4+0] > maxColor[0] ) {
            maxColor[0] = colorBlock[i*4+0];
        }
        if ( colorBlock[i*4+1] > maxColor[1] ) {
            maxColor[1] = colorBlock[i*4+1];
        }
        // Note: the alpha is not used so no point in checking for it
        //        if ( colorBlock[i*4+2] > maxColor[2] ) {
        //            maxColor[2] = colorBlock[i*4+2];
        //        }
        if ( colorBlock[i*4+3] > maxColor[3] ) {
            maxColor[3] = colorBlock[i*4+3];
        }
    }
}


// EA/Alex Mole: abs isn't inlined and gets called a *lot* in this code :)
// Let's make us an inlined one!
static ALWAYS_INLINE int absEA( int liArg )
{
    return ( liArg >= 0 ) ? liArg : -liArg;
}


static void ScaleYCoCg( byte *colorBlock, byte *minColor, byte *maxColor ) {
    int m0 = absEA( minColor[0] - 128 );      // (the 128 is to center to color to grey (128,128) )
    int m1 = absEA( minColor[1] - 128 );
    int m2 = absEA( maxColor[0] - 128 );
    int m3 = absEA( maxColor[1] - 128 );
    
    if ( m1 > m0 ) m0 = m1;
    if ( m3 > m2 ) m2 = m3;
    if ( m2 > m0 ) m0 = m2;
    
    const int s0 = 128 / 2 - 1;
    const int s1 = 128 / 4 - 1;
    
    int mask0 = -( m0 <= s0 );
    int mask1 = -( m0 <= s1 );
    int scale = 1 + ( 1 & mask0 ) + ( 2 & mask1 );
    
    minColor[0] = ( minColor[0] - 128 ) * scale + 128;
    minColor[1] = ( minColor[1] - 128 ) * scale + 128;
    minColor[2] = ( scale - 1 ) << 3;
    
    maxColor[0] = ( maxColor[0] - 128 ) * scale + 128;
    maxColor[1] = ( maxColor[1] - 128 ) * scale + 128;
    maxColor[2] = ( scale - 1 ) << 3;
    
    for ( int i = 0; i < 16; i++ ) {
        colorBlock[i*4+0] = ( colorBlock[i*4+0] - 128 ) * scale + 128;
        colorBlock[i*4+1] = ( colorBlock[i*4+1] - 128 ) * scale + 128;
    }
}

static void InsetYCoCgBBox( byte *minColor, byte *maxColor ) {
    int inset[4];
    int mini[4];
    int maxi[4];
    
    inset[0] = ( maxColor[0] - minColor[0] ) - ((1<<(INSET_COLOR_SHIFT-1))-1);
    inset[1] = ( maxColor[1] - minColor[1] ) - ((1<<(INSET_COLOR_SHIFT-1))-1);
    inset[3] = ( maxColor[3] - minColor[3] ) - ((1<<(INSET_ALPHA_SHIFT-1))-1);
    
    mini[0] = ( ( minColor[0] << INSET_COLOR_SHIFT ) + inset[0] ) >> INSET_COLOR_SHIFT;
    mini[1] = ( ( minColor[1] << INSET_COLOR_SHIFT ) + inset[1] ) >> INSET_COLOR_SHIFT;
    mini[3] = ( ( minColor[3] << INSET_ALPHA_SHIFT ) + inset[3] ) >> INSET_ALPHA_SHIFT;
    
    maxi[0] = ( ( maxColor[0] << INSET_COLOR_SHIFT ) - inset[0] ) >> INSET_COLOR_SHIFT;
    maxi[1] = ( ( maxColor[1] << INSET_COLOR_SHIFT ) - inset[1] ) >> INSET_COLOR_SHIFT;
    maxi[3] = ( ( maxColor[3] << INSET_ALPHA_SHIFT ) - inset[3] ) >> INSET_ALPHA_SHIFT;
    
    mini[0] = ( mini[0] >= 0 ) ? mini[0] : 0;
    mini[1] = ( mini[1] >= 0 ) ? mini[1] : 0;
    mini[3] = ( mini[3] >= 0 ) ? mini[3] : 0;
    
    maxi[0] = ( maxi[0] <= 255 ) ? maxi[0] : 255;
    maxi[1] = ( maxi[1] <= 255 ) ? maxi[1] : 255;
    maxi[3] = ( maxi[3] <= 255 ) ? maxi[3] : 255;
    
    minColor[0] = ( mini[0] & C565_5_MASK ) | ( mini[0] >> 5 );
    minColor[1] = ( mini[1] & C565_6_MASK ) | ( mini[1] >> 6 );
    minColor[3] = mini[3];
    
    maxColor[0] = ( maxi[0] & C565_5_MASK ) | ( maxi[0] >> 5 );
    maxColor[1] = ( maxi[1] & C565_6_MASK ) | ( maxi[1] >> 6 );
    maxColor[3] = maxi[3];
}

static void SelectYCoCgDiagonal( const byte *colorBlock, byte *minColor, byte *maxColor ) {
    byte mid0 = ( (int) minColor[0] + maxColor[0] + 1 ) >> 1;
    byte mid1 = ( (int) minColor[1] + maxColor[1] + 1 ) >> 1;
    
    byte side = 0;
    for ( int i = 0; i < 16; i++ ) {
        byte b0 = colorBlock[i*4+0] >= mid0;
        byte b1 = colorBlock[i*4+1] >= mid1;
        side += ( b0 ^ b1 );
    }
    
    byte mask = -( side > 8 );
    
#ifdef NVIDIA_G7X_HARDWARE_BUG_FIX
    mask &= -( minColor[0] != maxColor[0] );
#endif
    
    byte c0 = minColor[1];
    byte c1 = maxColor[1];
    
    // PlayStation 3 compiler warning fix:    
    // c0 ^= c1 ^= mask &= c0 ^= c1;    // Orignial code
    byte c2 = c0 ^ c1;
    c0 = c2;
    c0 ^= c1 ^= mask &=c2;
    
    minColor[1] = c0;
    maxColor[1] = c1;
}

static void EmitAlphaIndices( const byte *colorBlock, const byte minAlpha, const byte maxAlpha, byte **outData ) {
        
    const int ALPHA_RANGE = 7;
    
    byte mid, ab1, ab2, ab3, ab4, ab5, ab6, ab7;
    byte indexes[16];
    
    mid = ( maxAlpha - minAlpha ) / ( 2 * ALPHA_RANGE );
    
    ab1 = minAlpha + mid;
    ab2 = ( 6 * maxAlpha + 1 * minAlpha ) / ALPHA_RANGE + mid;
    ab3 = ( 5 * maxAlpha + 2 * minAlpha ) / ALPHA_RANGE + mid;
    ab4 = ( 4 * maxAlpha + 3 * minAlpha ) / ALPHA_RANGE + mid;
    ab5 = ( 3 * maxAlpha + 4 * minAlpha ) / ALPHA_RANGE + mid;
    ab6 = ( 2 * maxAlpha + 5 * minAlpha ) / ALPHA_RANGE + mid;
    ab7 = ( 1 * maxAlpha + 6 * minAlpha ) / ALPHA_RANGE + mid;
    
    for ( int i = 0; i < 16; i++ ) {
        byte a = colorBlock[i*4+3]; // Here it seems to be using the Y (luna) for the alpha
        int b1 = ( a <= ab1 );
        int b2 = ( a <= ab2 );
        int b3 = ( a <= ab3 );
        int b4 = ( a <= ab4 );
        int b5 = ( a <= ab5 );
        int b6 = ( a <= ab6 );
        int b7 = ( a <= ab7 );
        int index = ( b1 + b2 + b3 + b4 + b5 + b6 + b7 + 1 ) & 7;
        indexes[i] = index ^ ( 2 > index );
    }
    
    EmitByte( (indexes[ 0] >> 0) | (indexes[ 1] << 3) | (indexes[ 2] << 6),  outData );
    EmitByte( (indexes[ 2] >> 2) | (indexes[ 3] << 1) | (indexes[ 4] << 4) | (indexes[ 5] << 7), outData );
    EmitByte( (indexes[ 5] >> 1) | (indexes[ 6] << 2) | (indexes[ 7] << 5), outData );
    
    EmitByte( (indexes[ 8] >> 0) | (indexes[ 9] << 3) | (indexes[10] << 6), outData );
    EmitByte( (indexes[10] >> 2) | (indexes[11] << 1) | (indexes[12] << 4) | (indexes[13] << 7), outData );
    EmitByte( (indexes[13] >> 1) | (indexes[14] << 2) | (indexes[15] << 5), outData );
}

static void EmitColorIndices( const byte *colorBlock, const byte *minColor, const byte *maxColor, byte **outData ) {
    word colors[4][4];
    unsigned int result = 0;
    
    colors[0][0] = ( maxColor[0] & C565_5_MASK ) | ( maxColor[0] >> 5 );
    colors[0][1] = ( maxColor[1] & C565_6_MASK ) | ( maxColor[1] >> 6 );
    colors[0][2] = ( maxColor[2] & C565_5_MASK ) | ( maxColor[2] >> 5 );
    colors[0][3] = 0;
    colors[1][0] = ( minColor[0] & C565_5_MASK ) | ( minColor[0] >> 5 );
    colors[1][1] = ( minColor[1] & C565_6_MASK ) | ( minColor[1] >> 6 );
    colors[1][2] = ( minColor[2] & C565_5_MASK ) | ( minColor[2] >> 5 );
    colors[1][3] = 0;
    colors[2][0] = ( 2 * colors[0][0] + 1 * colors[1][0] ) / 3;
    colors[2][1] = ( 2 * colors[0][1] + 1 * colors[1][1] ) / 3;
    colors[2][2] = ( 2 * colors[0][2] + 1 * colors[1][2] ) / 3;
    colors[2][3] = 0;
    colors[3][0] = ( 1 * colors[0][0] + 2 * colors[1][0] ) / 3;
    colors[3][1] = ( 1 * colors[0][1] + 2 * colors[1][1] ) / 3;
    colors[3][2] = ( 1 * colors[0][2] + 2 * colors[1][2] ) / 3;
    colors[3][3] = 0;
    
    for ( int i = 15; i >= 0; i-- ) {
        int c0, c1;
        
        c0 = colorBlock[i*4+0];
        c1 = colorBlock[i*4+1];
        
        int d0 = absEA( colors[0][0] - c0 ) + absEA( colors[0][1] - c1 );
        int d1 = absEA( colors[1][0] - c0 ) + absEA( colors[1][1] - c1 );
        int d2 = absEA( colors[2][0] - c0 ) + absEA( colors[2][1] - c1 );
        int d3 = absEA( colors[3][0] - c0 ) + absEA( colors[3][1] - c1 );
        
        bool b0 = d0 > d3;
        bool b1 = d1 > d2;
        bool b2 = d0 > d2;
        bool b3 = d1 > d3;
        bool b4 = d2 > d3;
        
        int x0 = b1 & b2;
        int x1 = b0 & b3;
        int x2 = b0 & b4;
        
        int indexFinal =  ( x2 | ( ( x0 | x1 ) << 1 ) ) << ( i << 1 );
        result |= indexFinal;
    }
    
    EmitUInt( result, outData );
}

/*F*************************************************************************************************/
/*!
 \Function    CompressYCoCgDXT5( const byte *inBuf, byte *outBuf, const int width, const int height, const int stride ) 
 
 \Description        This is the C version of the YcoCgDXT5.  
 
 Input data needs to be converted from ARGB to YCoCg before calling this function.
 
 Does not support alpha at all since it uses the alpha channel to store the Y (luma).
 
 The output size is 4:1 but will be based on rounded up texture sizes on 4 texel boundaries 
 So for example if the source texture is 33 x 32, the compressed size will be 36x32. 
 
 The DXT5 compresses groups of 4x4 texels into 16 bytes (4:1 saving) 
 
 The compressed format:
 2 bytes of min and max Y luma values (these are used to rebuild an 8 element Luma table)
 6 bytes of indexes into the luma table
 3 bits per index so 16 indexes total 
 2 shorts of min and max color values (these are used to rebuild a 4 element chroma table)
 5 bits Co
 6 bits Cg
 5 bits Scale. The scale can only be 1, 2 or 4. 
 4 bytes of indexes into the Chroma CocG table 
 2 bits per index so 16 indexes total
 
 \Input              const byte *inBuf   Input buffer of the YCoCG textel data
 \Input              const byte *outBuf  Output buffer for the compressed data
 \Input              int width           in source width 
 \Input              int height          in source height
 \Input              int stride          in source in buffer stride in bytes
 
 \Output             int ouput size
 
 \Version    1.1     CSidhall 01/12/09 modified to account for non aligned textures
 1.2     1/10/10 Added stride
 */
/*************************************************************************************************F*/
extern "C" int CompressYCoCgDXT5( const byte *inBuf, byte *outBuf, const int width, const int height , const int stride) {
    
    int outputBytes =0;
    
    byte block[64];
    byte minColor[4];
    byte maxColor[4];
    
    byte *outData = outBuf;
    
    int blockLineSize = stride * 4;  // 4 lines per loop
    
    for ( int j = 0; j < height; j += 4, inBuf +=blockLineSize ) {
        int heightRemain = height - j;    
        for ( int i = 0; i < width; i += 4 ) {
            
            // Note: Modified from orignal source so that it can handle the edge blending better with non aligned 4x textures
            int widthRemain = width - i;
            if ((heightRemain < 4) || (widthRemain < 4) ) {
                ExtractBlock( inBuf + i * 4, stride, widthRemain, heightRemain,  block );  
            }
            else {
                ExtractBlock( inBuf + i * 4, stride, block );
            }
            // A simple min max extract for each color channel including alpha             
            GetMinMaxYCoCg( block, minColor, maxColor );
            ScaleYCoCg( block, minColor, maxColor );    // Sets the scale in the min[2] and max[2] offset
            InsetYCoCgBBox( minColor, maxColor );
            SelectYCoCgDiagonal( block, minColor, maxColor );
            
            EmitByte( maxColor[3], &outData );    // Note: the luma is stored in the alpha channel
            EmitByte( minColor[3], &outData );
            
            EmitAlphaIndices( block, minColor[3], maxColor[3], &outData );
            
            EmitUShort( ColorTo565( maxColor ), &outData );
            EmitUShort( ColorTo565( minColor ), &outData );
            
            EmitColorIndices( block, minColor, maxColor, &outData );
        }
    }
    
    outputBytes = (int)(outData - outBuf);
    
    return outputBytes;
}


//--- YCoCgDXT5 Decompression ---
static void RestoreLumaAlphaBlock(  const void * pSource, byte * colorBlock){
    byte *pS=(unsigned char *) pSource;
    byte luma[8];     
    
    // Grabbed this standard table building from undxt.cpp UnInterpolatedAlphaBlock() 
    luma[0] = *pS++;                                                      
    luma[1] = *pS++;                                                      
    luma[2] = (byte)((6 * luma[0] + 1 * luma[1] + 3) / 7);    
    luma[3] = (byte)((5 * luma[0] + 2 * luma[1] + 3) / 7);    
    luma[4] = (byte)((4 * luma[0] + 3 * luma[1] + 3) / 7);    
    luma[5] = (byte)((3 * luma[0] + 4 * luma[1] + 3) / 7);    
    luma[6] = (byte)((2 * luma[0] + 5 * luma[1] + 3) / 7);    
    luma[7] = (byte)((1 * luma[0] + 6 * luma[1] + 3) / 7);    
    
    int rawIndexes;
    int raw;
    int colorIndex=3;
    // We have 6 bytes of indexes (3 bits * 16 texels)
    // Easier to process in 2 groups of 8 texels... 
    for(int j=0; j < 2; j++) {
        // Pack the indexes so we can shift out the indexes as a group 
        rawIndexes = *pS++;
        raw = *pS++;
        rawIndexes |= raw << 8;
        raw = *pS++;
        rawIndexes |= raw << 16;  
        
        // Since we still have to operate on the texels, just store it in a linear array workspace     
        for(int i=0; i < 8; i++) {       
            static const int LUMA_INDEX_FILTER = 0x7;   // To isolate the 3 bit luma index
            
            byte index = (byte)(rawIndexes & LUMA_INDEX_FILTER);
            colorBlock[colorIndex] = luma[index];
            colorIndex += 4;                                            
            rawIndexes  >>=3;      
        }
    }   
}


// Converts a 5.6.5 short back into 3 bytes 
static ALWAYS_INLINE void Convert565ToColor( const unsigned short value , byte *pOutColor ) 
{
    int c = value >> (5+6);
    pOutColor[0] = c << 3;  // Was a 5 bit so scale back up 
    
    c = value >> 5;
    c &=0x3f;               // Filter out the top value
    pOutColor[1] = c << 2;  // Was a 6 bit 
    
    c = value & 0x1f;         // Filter out the top values
    pOutColor[2] = c << 3;  // was a 5 bit so scale back up 
}

#ifndef EA_SYSTEM_LITTLE_ENDIAN
// Flip around the 2 bytes in a short
static ALWAYS_INLINE short ShortFlipBytes( short raw ) 
{
    return ((raw >> 8) & 0xff) | (raw << 8);
}
#endif

static void RestoreChromaBlock( const void * pSource, byte *colorBlock)
{
    unsigned short *pS =(unsigned short *) pSource;
    pS +=4;  // Color info stars after 8 bytes (first 8 is the Y/alpha channel info)
    
    unsigned short rawColor = *pS++;     
#ifndef EA_SYSTEM_LITTLE_ENDIAN  
    rawColor = ShortFlipBytes(rawColor);
#endif
    
    
    byte color[4][4];   // Color workspace 
    
    // Build the color lookup table 
    // The luma should have already been extracted and sitting at offset[3]
    Convert565ToColor( rawColor , &color[0][0] );     
    rawColor = *pS++;     
#ifndef EA_SYSTEM_LITTLE_ENDIAN  
    rawColor = ShortFlipBytes(rawColor);
#endif
    
    Convert565ToColor( rawColor , &color[1][0] ); 
    
    // EA/Alex Mole: mixing float & int operations is horrifyingly slow on some platforms, so we do it different!
#if defined(__PPU__) || defined(_XBOX)
    color[2][0] = (byte) ( ( ((int)color[0][0] * 3) + ((int)color[1][0]    ) ) >> 2 );
    color[2][1] = (byte) ( ( ((int)color[0][1] * 3) + ((int)color[1][1]    ) ) >> 2 );
    color[3][0] = (byte) ( ( ((int)color[0][0]    ) + ((int)color[1][0] * 3) ) >> 2 );
    color[3][1] = (byte) ( ( ((int)color[0][1]    ) + ((int)color[1][1] * 3) ) >> 2 );
#else
    color[2][0] = (byte) ( (color[0][0] * 0.75f) + (color[1][0] * 0.25f) );
    color[2][1] = (byte) ( (color[0][1] * 0.75f) + (color[1][1] * 0.25f) );
    color[3][0] = (byte) ( (color[0][0] * 0.25f) + (color[1][0] * 0.75f) );
    color[3][1] = (byte) ( (color[0][1] * 0.25f) + (color[1][1] * 0.75f) );
#endif
    
    byte scale = ((color[0][2] >> 3) + 1) >> 1; // Adjust for shifts instead of divide
    
    // Scale back values here so we don't have to do it for all 16 texels
    // Note: This is really only for the software version.  In hardware, the scale would need to be restored during the YCoCg to RGB conversion.
    for(int i=0; i < 4; i++) {
        color[i][0] = ((color[i][0] - 128) >> scale) + 128;
        color[i][1] = ((color[i][1] - 128) >> scale) + 128;
    }
    
    // Rebuild the color block using the indexes (2 bits per texel)
    int rawIndexes;
    int colorIndex=0;
    
    // We have 2 shorts of indexes (2 bits * 16 texels = 32 bits). (If can confirm 4x alignment, can grab it as a word with single loop) 
    for(int j=0; j < 2; j++) {
        rawIndexes = *pS++;
#ifndef EA_SYSTEM_LITTLE_ENDIAN  
        rawIndexes = ShortFlipBytes(rawIndexes);
#endif
        
        // Since we still have to operate on block, just store it in a linear array workspace     
        for(int i=0; i < 8; i++) {       
            static const int COCG_INDEX_FILTER = 0x3;   // To isolate the 2 bit chroma index
            
            unsigned char index = (unsigned char)(rawIndexes & COCG_INDEX_FILTER);
            colorBlock[colorIndex] = color[index][0];
            colorBlock[colorIndex+1] = color[index][1];
            colorBlock[colorIndex+2] = 255;  
            colorIndex += 4;                                            
            rawIndexes  >>=2;      
        }
    }   
}

// This stores a 4x4 texel block but can overflow the output rectangle size if it is not 4 texels aligned in size
static int ALWAYS_INLINE StoreBlock( const byte *colorBlock, const int stride, byte *outPtr ) {
    
    for ( int j = 0; j < 4; j++ ) {
        memcpy( (void*) outPtr,&colorBlock[j*4*4], 4*4 );
        outPtr += stride;
    }
    return 64;
}

// This store only the texels that are within the width and height boundaries so does not overflow
static int StoreBlock( const byte *colorBlock , const int stride, const int widthRemain, const int heightRemain,  byte *outPtr) 
{
    int outCount =0;
    int width = stride >> 2;    // Convert to int offsets
    
    int *pBlock32 = (int *) colorBlock;  // Since we are using ARGB, we assume 4 byte alignment is already being used
    int *pOutput32 = (int*) outPtr; 
    
    int widthMax = 4;
    if(widthRemain < 4) {
        widthMax = widthRemain;
    }
    
    int heightMax = 4;
    if(heightRemain < 4) {
        heightMax = heightRemain;    
    }
    
    for(int j =0; j < heightMax; j++) {
        for(int i=0; i < widthMax; i++) {
            pOutput32[i] = pBlock32[i];    
            outCount +=4;       
        }
        
        // Set up offset for next texel row source (keep existing if we are at the end)
        pBlock32 +=4;    
        pOutput32 +=width;
    }
    return outCount;
}


/*F*************************************************************************************************/
/*!
 \Function    DeCompressYCoCgDXT5( const byte *inBuf, byte *outBuf, const int width, const int height, const int stride ) 
 
 \Description  Decompression for YCoCgDXT5  
 Bascially does the reverse order of he compression.  
 
 Ouptut data still needs to be converted from YCoCg to ARGB after this function has completed
 (probably more efficient to convert it inside here but have not done so to stay closer to the orginal
 sample code and just make it easier to follow).
 
 16 bytes get unpacked into a 4x4 texel block (64 bytes output).
 
 The compressed format:
 2 bytes of min and max Y luma values (these are used to rebuild an 8 element Luma table)
 6 bytes of indexes into the luma table
 3 bits per index so 16 indexes total 
 2 shorts of min and max color values (these are used to rebuild a 4 element chroma table)
 5 bits Co
 6 bits Cg
 5 bits Scale. The scale can only be 1, 2 or 4. 
 4 bytes of indexes into the Chroma CocG table 
 2 bits per index so 16 indexes total
 
 
 \Input          const byte *inBuf
 \Input          byte *outBuf, 
 \Input          const int width
 \input          const int height 
 \input          const int stride for inBuf
 
 \Output         int size output in bytes
 
 
 \Version    1.0        01/12/09 Created
 1.1        12/21/09 Alex Mole: removed branches from tight inner loop
 1.2        11/10/10 CSidhall: Added stride for textures with different image and canvas sizes.
 */
/*************************************************************************************************F*/
extern "C" int DeCompressYCoCgDXT5( const byte *inBuf, byte *outBuf, const int width, const int height, const int stride )
{
    byte colorBlock[64];    // 4x4 texel work space a linear array 
    int outByteCount =0;
    const byte *pCurInBuffer = inBuf;
    
    int blockLineSize = stride * 4;  // 4 lines per loop
    for( int j = 0; j < ( height & ~3 ); j += 4, outBuf += blockLineSize )
    {
        int i;
        for( i = 0; i < ( width & ~3 ); i += 4 )
        {
            RestoreLumaAlphaBlock(pCurInBuffer, colorBlock);
            RestoreChromaBlock(pCurInBuffer, colorBlock);           
            outByteCount += StoreBlock(colorBlock, stride, outBuf + i * 4);
            pCurInBuffer += 16; // 16 bytes per block of compressed data
        }
        
        // Do we have some leftover columns?
        if( width & 3 )
        {
            int widthRemain = width & 3;
            
            RestoreLumaAlphaBlock(pCurInBuffer, colorBlock);
            RestoreChromaBlock(pCurInBuffer, colorBlock);
            
            outByteCount += StoreBlock(colorBlock , stride, widthRemain, 4 /* heightRemain >= 4 */, outBuf + i * 4);
            
            pCurInBuffer += 16; // 16 bytes per block of compressed data
        }
    }
    
    // Do we have some leftover lines?
    if( height & 3 )
    {
        int heightRemain = height & 3;
        
        for( int i = 0; i < width; i += 4 )
        {
            RestoreLumaAlphaBlock(pCurInBuffer, colorBlock);
            RestoreChromaBlock(pCurInBuffer, colorBlock);
            
            int widthRemain = width - i;
            outByteCount += StoreBlock(colorBlock , stride, widthRemain, heightRemain,  outBuf + i * 4);
            
            pCurInBuffer += 16; // 16 bytes per block of compressed data
        }
    }
    
    return outByteCount;
}