Skip to content

Commit

Permalink
Merge pull request #2965 from facebook/offbase
Browse files Browse the repository at this point in the history
Converge sumtype (offset | repcode) numeric representation towards offBase
  • Loading branch information
Cyan4973 authored Jan 24, 2022
2 parents 70df5de + 03903f5 commit cc7d23b
Show file tree
Hide file tree
Showing 12 changed files with 275 additions and 218 deletions.
53 changes: 52 additions & 1 deletion contrib/seekable_format/zstdseek_decompress.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,64 @@
# endif
#endif

/* ************************************************************
* Detect POSIX version
* PLATFORM_POSIX_VERSION = 0 for non-Unix e.g. Windows
* PLATFORM_POSIX_VERSION = 1 for Unix-like but non-POSIX
* PLATFORM_POSIX_VERSION > 1 is equal to found _POSIX_VERSION
* Value of PLATFORM_POSIX_VERSION can be forced on command line
***************************************************************/
#ifndef PLATFORM_POSIX_VERSION

# if (defined(__APPLE__) && defined(__MACH__)) || defined(__SVR4) || defined(_AIX) || defined(__hpux) /* POSIX.1-2001 (SUSv3) conformant */ \
|| defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) /* BSD distros */
/* exception rule : force posix version to 200112L,
* note: it's better to use unistd.h's _POSIX_VERSION whenever possible */
# define PLATFORM_POSIX_VERSION 200112L

/* try to determine posix version through official unistd.h's _POSIX_VERSION (http://pubs.opengroup.org/onlinepubs/7908799/xsh/unistd.h.html).
* note : there is no simple way to know in advance if <unistd.h> is present or not on target system,
* Posix specification mandates its presence and its content, but target system must respect this spec.
* It's necessary to _not_ #include <unistd.h> whenever target OS is not unix-like
* otherwise it will block preprocessing stage.
* The following list of build macros tries to "guess" if target OS is likely unix-like, and therefore can #include <unistd.h>
*/
# elif !defined(_WIN32) \
&& ( defined(__unix__) || defined(__unix) \
|| defined(__midipix__) || defined(__VMS) || defined(__HAIKU__) )

# if defined(__linux__) || defined(__linux) || defined(__CYGWIN__)
# ifndef _POSIX_C_SOURCE
# define _POSIX_C_SOURCE 200809L /* feature test macro : https://www.gnu.org/software/libc/manual/html_node/Feature-Test-Macros.html */
# endif
# endif
# include <unistd.h> /* declares _POSIX_VERSION */
# if defined(_POSIX_VERSION) /* POSIX compliant */
# define PLATFORM_POSIX_VERSION _POSIX_VERSION
# else
# define PLATFORM_POSIX_VERSION 1
# endif

# ifdef __UCLIBC__
# ifndef __USE_MISC
# define __USE_MISC /* enable st_mtim on uclibc */
# endif
# endif

# else /* non-unix target platform (like Windows) */
# define PLATFORM_POSIX_VERSION 0
# endif

#endif /* PLATFORM_POSIX_VERSION */


/* ************************************************************
* Avoid fseek()'s 2GiB barrier with MSVC, macOS, *BSD, MinGW
***************************************************************/
#if defined(_MSC_VER) && _MSC_VER >= 1400
# define LONG_SEEK _fseeki64
#elif !defined(__64BIT__) && (PLATFORM_POSIX_VERSION >= 200112L) /* No point defining Large file for 64 bit */
# define LONG_SEEK fseeko
# define LONG_SEEK fseeko
#elif defined(__MINGW32__) && !defined(__STRICT_ANSI__) && !defined(__NO_MINGW_LFS) && defined(__MSVCRT__)
# define LONG_SEEK fseeko64
#elif defined(_WIN32) && !defined(__DJGPP__)
Expand Down
72 changes: 37 additions & 35 deletions lib/compress/zstd_compress.c
Original file line number Diff line number Diff line change
Expand Up @@ -2940,7 +2940,7 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
/* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode
so we provide seqStoreSeqs[i].offset - 1 */
ZSTD_updateRep(updatedRepcodes.rep,
seqStoreSeqs[i].offBase - 1,
seqStoreSeqs[i].offBase,
seqStoreSeqs[i].litLength == 0);
literalsRead += outSeqs[i].litLength;
}
Expand Down Expand Up @@ -3433,20 +3433,22 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
}

/**
* Returns the raw offset represented by the combination of offCode, ll0, and repcode history.
* offCode must represent a repcode in the numeric representation of ZSTD_storeSeq().
* Returns the raw offset represented by the combination of offBase, ll0, and repcode history.
* offBase must represent a repcode in the numeric representation of ZSTD_storeSeq().
*/
static U32
ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0)
ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0)
{
U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */
assert(STORED_IS_REPCODE(offCode));
if (adjustedOffCode == ZSTD_REP_NUM) {
/* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */
assert(rep[0] > 0);
U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */
assert(OFFBASE_IS_REPCODE(offBase));
if (adjustedRepCode == ZSTD_REP_NUM) {
/* litlength == 0 and offCode == 2 implies selection of first repcode - 1
* This is only valid if it results in a valid offset value, aka > 0.
*/
assert(rep[0] > 1);
return rep[0] - 1;
}
return rep[adjustedOffCode];
return rep[adjustedRepCode];
}

/**
Expand All @@ -3468,11 +3470,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_
for (; idx < nbSeq; ++idx) {
seqDef* const seq = seqStore->sequencesStart + idx;
U32 const ll0 = (seq->litLength == 0);
U32 const offCode = OFFBASE_TO_STORED(seq->offBase);
U32 const offBase = seq->offBase;
assert(seq->offBase > 0);
if (STORED_IS_REPCODE(offCode)) {
U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0);
U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0);
if (OFFBASE_IS_REPCODE(offBase)) {
U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0);
U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0);
/* Adjust simulated decompression repcode history if we come across a mismatch. Replace
* the repcode with the offset it actually references, determined by the compression
* repcode history.
Expand All @@ -3484,8 +3486,8 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_
/* Compression repcode history is always updated with values directly from the unmodified seqStore.
* Decompression repcode history may use modified seq->offset value taken from compression repcode history.
*/
ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0);
ZSTD_updateRep(cRepcodes->rep, offCode, ll0);
ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0);
ZSTD_updateRep(cRepcodes->rep, offBase, ll0);
}
}

Expand Down Expand Up @@ -5770,26 +5772,26 @@ ZSTD_validateSequence(U32 offCode, U32 matchLength,
* window size. After output surpasses windowSize, we're limited to windowSize offsets again.
*/
size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize;
RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!");
RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), corruption_detected, "Offset too large!");
RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small");
return 0;
}

/* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */
static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0)
static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0)
{
U32 offCode = STORE_OFFSET(rawOffset);
U32 offBase = OFFSET_TO_OFFBASE(rawOffset);

if (!ll0 && rawOffset == rep[0]) {
offCode = STORE_REPCODE_1;
offBase = REPCODE1_TO_OFFBASE;
} else if (rawOffset == rep[1]) {
offCode = STORE_REPCODE(2 - ll0);
offBase = REPCODE_TO_OFFBASE(2 - ll0);
} else if (rawOffset == rep[2]) {
offCode = STORE_REPCODE(3 - ll0);
offBase = REPCODE_TO_OFFBASE(3 - ll0);
} else if (ll0 && rawOffset == rep[0] - 1) {
offCode = STORE_REPCODE_3;
offBase = REPCODE3_TO_OFFBASE;
}
return offCode;
return offBase;
}

/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of
Expand Down Expand Up @@ -5819,19 +5821,19 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
U32 const litLength = inSeqs[idx].litLength;
U32 const ll0 = (litLength == 0);
U32 const matchLength = inSeqs[idx].matchLength;
U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0);
ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0);
U32 const offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0);
ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0);

DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength);
DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength);
if (cctx->appliedParams.validateSequences) {
seqPos->posInSrc += litLength + matchLength;
FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc,
FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, seqPos->posInSrc,
cctx->appliedParams.cParams.windowLog, dictSize),
"Sequence validation failed");
}
RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation,
"Not enough memory allocated. Try adjusting ZSTD_c_minMatch.");
ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength);
ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength);
ip += matchLength + litLength;
}
ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t));
Expand Down Expand Up @@ -5888,7 +5890,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
U32 litLength = currSeq.litLength;
U32 matchLength = currSeq.matchLength;
U32 const rawOffset = currSeq.offset;
U32 offCode;
U32 offBase;

/* Modify the sequence depending on where endPosInSequence lies */
if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) {
Expand Down Expand Up @@ -5942,20 +5944,20 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
}
/* Check if this offset can be represented with a repcode */
{ U32 const ll0 = (litLength == 0);
offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0);
ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0);
offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0);
ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0);
}

if (cctx->appliedParams.validateSequences) {
seqPos->posInSrc += litLength + matchLength;
FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc,
FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, seqPos->posInSrc,
cctx->appliedParams.cParams.windowLog, dictSize),
"Sequence validation failed");
}
DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength);
DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength);
RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation,
"Not enough memory allocated. Try adjusting ZSTD_c_minMatch.");
ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength);
ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength);
ip += matchLength + litLength;
}
DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength);
Expand Down
56 changes: 27 additions & 29 deletions lib/compress/zstd_compress_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -577,29 +577,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con
while (ip < iend) *op++ = *ip++;
}

#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1)
#define STORE_REPCODE_1 STORE_REPCODE(1)
#define STORE_REPCODE_2 STORE_REPCODE(2)
#define STORE_REPCODE_3 STORE_REPCODE(3)
#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1)
#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE)
#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE)
#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE)
#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE)
#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* returns ID 1,2,3 */
#define STORED_TO_OFFBASE(o) ((o)+1)
#define OFFBASE_TO_STORED(o) ((o)-1)

#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1)
#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2)
#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3)
#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */
#define OFFSET_TO_OFFBASE(o) (assert((o)>0), o + ZSTD_REP_NUM)
#define OFFBASE_IS_OFFSET(o) ((o) > ZSTD_REP_NUM)
#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM)
#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM)
#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */

/*! ZSTD_storeSeq() :
* Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t.
* @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET().
* Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t.
* @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE().
* @matchLength : must be >= MINMATCH
* Allowed to overread literals up to litLimit.
* Allowed to over-read literals up to litLimit.
*/
HINT_INLINE UNUSED_ATTR void
ZSTD_storeSeq(seqStore_t* seqStorePtr,
size_t litLength, const BYTE* literals, const BYTE* litLimit,
U32 offBase_minus1,
U32 offBase,
size_t matchLength)
{
BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH;
Expand All @@ -608,8 +606,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
static const BYTE* g_start = NULL;
if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */
{ U32 const pos = (U32)((const BYTE*)literals - g_start);
DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u",
pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1);
DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u",
pos, (U32)litLength, (U32)matchLength, (U32)offBase);
}
#endif
assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq);
Expand All @@ -619,9 +617,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
assert(literals + litLength <= litLimit);
if (litEnd <= litLimit_w) {
/* Common case we can use wildcopy.
* First copy 16 bytes, because literals are likely short.
*/
assert(WILDCOPY_OVERLENGTH >= 16);
* First copy 16 bytes, because literals are likely short.
*/
ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16);
ZSTD_copy16(seqStorePtr->lit, literals);
if (litLength > 16) {
ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap);
Expand All @@ -640,7 +638,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
seqStorePtr->sequences[0].litLength = (U16)litLength;

/* match offset */
seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1);
seqStorePtr->sequences[0].offBase = offBase;

/* match Length */
assert(matchLength >= MINMATCH);
Expand All @@ -658,17 +656,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,

/* ZSTD_updateRep() :
* updates in-place @rep (array of repeat offsets)
* @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq()
* @offBase : sum-type, using numeric representation of ZSTD_storeSeq()
*/
MEM_STATIC void
ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0)
ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0)
{
if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */
if (OFFBASE_IS_OFFSET(offBase)) { /* full offset */
rep[2] = rep[1];
rep[1] = rep[0];
rep[0] = STORED_OFFSET(offBase_minus1);
rep[0] = OFFBASE_TO_OFFSET(offBase);
} else { /* repcode */
U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0;
U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0;
if (repCode > 0) { /* note : if repCode==0, no change */
U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
rep[2] = (repCode >= 2) ? rep[1] : rep[2];
Expand All @@ -685,11 +683,11 @@ typedef struct repcodes_s {
} repcodes_t;

MEM_STATIC repcodes_t
ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0)
ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0)
{
repcodes_t newReps;
ZSTD_memcpy(&newReps, rep, sizeof(newReps));
ZSTD_updateRep(newReps.rep, offBase_minus1, ll0);
ZSTD_updateRep(newReps.rep, offBase, ll0);
return newReps;
}

Expand Down
Loading

0 comments on commit cc7d23b

Please sign in to comment.