From bd4b2c0f19fc4518010ecb592047d833a4c1a741 Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Wed, 23 Feb 2022 18:01:35 -0500 Subject: [PATCH 01/10] prefetch dict content inside loop --- lib/compress/zstd_fast.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 62c4c2cea02..42c52d2af75 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -399,6 +399,8 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart); const U32 dictHLog = dictCParams->hashLog; + const BYTE* dictPrefetchPtr = dictEnd; + const BYTE* const dictPrefetchLimit = dictPrefetchPtr - MIN(dictEnd - dictStart, 102400); /* 100KB */ /* if a dictionary is still attached, it necessarily means that * it is within window size. So we just check it. */ @@ -434,6 +436,12 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( base + repIndex; hashTable[h] = curr; /* update hash table */ + if (dictPrefetchPtr >= dictPrefetchLimit) { + PREFETCH_L2(dictPrefetchPtr); + PREFETCH_L2(dictPrefetchPtr - 64); + dictPrefetchPtr -= 64; + } + if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; From 744f7c19b2155aee5538b071ac7b1e3e7fac6caa Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Mon, 28 Feb 2022 18:08:03 -0500 Subject: [PATCH 02/10] ip0/ip1 pipeline --- lib/compress/zstd_fast.c | 119 +++++++++++++++++++++++++-------------- 1 file changed, 78 insertions(+), 41 deletions(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 42c52d2af75..1b70c4850f5 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -380,7 +380,6 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( U32 const stepSize = cParams->targetLength + !(cParams->targetLength); const BYTE* const base = ms->window.base; const BYTE* const istart = (const BYTE*)src; - const BYTE* ip = istart; const BYTE* anchor = istart; const U32 prefixStartIndex = ms->window.dictLimit; const BYTE* const prefixStart = base + prefixStartIndex; @@ -397,15 +396,24 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( const BYTE* const dictStart = dictBase + dictStartIndex; const BYTE* const dictEnd = dms->window.nextSrc; const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); - const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart); + const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); const U32 dictHLog = dictCParams->hashLog; const BYTE* dictPrefetchPtr = dictEnd; const BYTE* const dictPrefetchLimit = dictPrefetchPtr - MIN(dictEnd - dictStart, 102400); /* 100KB */ + /* pipeline variables */ + const BYTE* ip0 = istart; + const BYTE* ip1; + size_t hash0; + size_t hash1; + size_t dictHash; /* inside the do{} loop, this is the hash for ip1 */ + U32 matchIndex; + U32 dictMatchIndex; + /* if a dictionary is still attached, it necessarily means that * it is within window size. So we just check it. 
*/ const U32 maxDistance = 1U << cParams->windowLog; - const U32 endIndex = (U32)((size_t)(ip - base) + srcSize); + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); assert(endIndex - prefixStartIndex <= maxDistance); (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ @@ -417,24 +425,38 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( /* init */ DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); - ip += (dictAndPrefixLength == 0); + ip0 += (dictAndPrefixLength == 0); /* dictMatchState repCode checks don't currently handle repCode == 0 * disabling. */ assert(offset_1 <= dictAndPrefixLength); assert(offset_2 <= dictAndPrefixLength); +_start: /* Requires: ip0 */ + assert(ip0 == anchor); + ip1 = ip0 + 1; + + if (ip1 >= ilimit) { + goto _cleanup; + } + + hash0 = ZSTD_hashPtr(ip0, hlog, mls); + dictHash = ZSTD_hashPtr(ip0, dictHLog, mls); + dictMatchIndex = dictHashTable[dictHash]; + matchIndex = hashTable[hash0]; + hash1 = ZSTD_hashPtr(ip1, hlog, mls); + dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); + PREFETCH_L1(dictHashTable + dictHash); + /* Main Search Loop */ - while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ + do { size_t mLength; - size_t const h = ZSTD_hashPtr(ip, hlog, mls); - U32 const curr = (U32)(ip-base); - U32 const matchIndex = hashTable[h]; + U32 const curr = (U32)(ip0-base); const BYTE* match = base + matchIndex; const U32 repIndex = curr + 1 - offset_1; const BYTE* repMatch = (repIndex < prefixStartIndex) ? dictBase + (repIndex - dictIndexDelta) : base + repIndex; - hashTable[h] = curr; /* update hash table */ + hashTable[hash0] = curr; /* update hash table */ if (dictPrefetchPtr >= dictPrefetchLimit) { PREFETCH_L2(dictPrefetchPtr); @@ -443,81 +465,96 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( } if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ - && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + && (MEM_read32(repMatch) == MEM_read32(ip0+1)) ) { const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; - mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; - ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + mLength = ZSTD_count_2segments(ip0+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip0++; + ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); } else if ( (matchIndex <= prefixStartIndex) ) { - size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); - U32 const dictMatchIndex = dictHashTable[dictHash]; const BYTE* dictMatch = dictBase + dictMatchIndex; if (dictMatchIndex <= dictStartIndex || - MEM_read32(dictMatch) != MEM_read32(ip)) { + MEM_read32(dictMatch) != MEM_read32(ip0)) { assert(stepSize >= 1); - ip += ((ip-anchor) >> kSearchStrength) + stepSize; + dictMatchIndex = dictHashTable[dictHash]; + matchIndex = hashTable[hash1]; + ip0 = ip1; + ip1 = ip1 + ((ip1-anchor) >> kSearchStrength) + stepSize; + hash0 = hash1; + hash1 = ZSTD_hashPtr(ip1, hlog, mls); + dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); + PREFETCH_L1(dictHashTable + dictHash); continue; } else { /* found a dict match */ U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); - mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; - while (((ip>anchor) & (dictMatch>dictStart)) - && (ip[-1] == dictMatch[-1])) { - ip--; dictMatch--; mLength++; + mLength = ZSTD_count_2segments(ip0+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; + while (((ip0>anchor) & (dictMatch>dictStart)) + && (ip0[-1] == dictMatch[-1])) { + ip0--; dictMatch--; mLength++; } /* catch up */ offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); } - } else if (MEM_read32(match) != MEM_read32(ip)) { + } else if (MEM_read32(match) != MEM_read32(ip0)) { /* it's not a match, and we're not going to check the dictionary */ assert(stepSize >= 1); - ip += ((ip-anchor) >> kSearchStrength) + stepSize; + dictMatchIndex = dictHashTable[dictHash]; + matchIndex = hashTable[hash1]; + ip0 = ip1; + ip1 = ip1 + ((ip1-anchor) >> kSearchStrength) + stepSize; + hash0 = hash1; + hash1 = ZSTD_hashPtr(ip1, hlog, mls); + dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); + PREFETCH_L1(dictHashTable + dictHash); continue; } else { /* found a regular match */ - U32 const offset = (U32)(ip-match); - mLength = ZSTD_count(ip+4, match+4, iend) + 4; - while (((ip>anchor) & (match>prefixStart)) - && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + U32 const offset = (U32)(ip0-match); + mLength = ZSTD_count(ip0+4, match+4, iend) + 4; + while (((ip0>anchor) & (match>prefixStart)) + && (ip0[-1] == match[-1])) { ip0--; match--; mLength++; } /* catch up */ offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); } /* match found */ - ip += mLength; - anchor = ip; + ip0 += mLength; + anchor = ip0; - if (ip <= ilimit) { + if (ip0 <= ilimit) { /* Fill Table */ assert(base+curr+2 > istart); /* check base overflow */ hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ - hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); + hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); 
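/* [illustration, not part of the patch] A minimal sketch of the ip0/ip1
 * pipeline this commit introduces, reduced to a toy single-table matcher.
 * The point: compute the hash for the next position (ip1) before the match
 * at the current position (ip0) is resolved, so the table-load latency for
 * ip1 overlaps the compare/branch work for ip0. Everything below (toy_hash,
 * TOY_TABLE_LOG, find_matches) is hypothetical, not zstd API. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define TOY_TABLE_LOG 12

static size_t toy_hash(const uint8_t* p) {
    uint32_t v; memcpy(&v, p, sizeof v);                /* read 4 bytes */
    return (size_t)((v * 2654435761U) >> (32 - TOY_TABLE_LOG));
}

static size_t find_matches(const uint8_t* src, size_t size) {
    uint32_t table[1 << TOY_TABLE_LOG];
    size_t nbMatches = 0;
    if (size < 5) return 0;
    memset(table, 0, sizeof table);
    {   const uint8_t* const end = src + size - 4;
        const uint8_t* ip0 = src;
        const uint8_t* ip1 = ip0 + 1;
        size_t hash0 = toy_hash(ip0);
        while (ip1 < end) {
            uint32_t const matchIndex = table[hash0];   /* load for ip0 */
            size_t const hash1 = toy_hash(ip1);         /* overlaps with the */
                                                        /* ip0 load/compare  */
            table[hash0] = (uint32_t)(ip0 - src);
            if (matchIndex && !memcmp(src + matchIndex, ip0, 4))
                nbMatches++;    /* a real matcher would emit a sequence here;
                                 * index 0 doubles as "empty", a toy
                                 * simplification the real code avoids */
            ip0 = ip1; ip1++; hash0 = hash1;            /* rotate the pipeline */
        }
    }
    return nbMatches;
}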
/* check immediate repcode */ - while (ip <= ilimit) { - U32 const current2 = (U32)(ip-base); + while (ip0 <= ilimit) { + U32 const current2 = (U32)(ip0-base); U32 const repIndex2 = current2 - offset_2; const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase - dictIndexDelta + repIndex2 : base + repIndex2; if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) - && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; - size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); - hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; - ip += repLength2; - anchor = ip; + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; + ip0 += repLength2; + anchor = ip0; continue; } break; } } - } + goto _start; /* found match, reset pipeline */ + } while (ip1 < ilimit); + +_cleanup: /* save reps for next block */ rep[0] = offset_1 ? offset_1 : offsetSaved; rep[1] = offset_2 ? offset_2 : offsetSaved; From ef1949f2fe6069127ef69e726120b46a24618fe9 Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Tue, 1 Mar 2022 14:14:26 -0500 Subject: [PATCH 03/10] add L2_4 prefetch to dms pipeline --- lib/compress/zstd_fast.c | 137 +++++++++++++++++++++------------------ 1 file changed, 73 insertions(+), 64 deletions(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 1b70c4850f5..985c802de2c 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -401,7 +401,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( const BYTE* dictPrefetchPtr = dictEnd; const BYTE* const dictPrefetchLimit = dictPrefetchPtr - MIN(dictEnd - dictStart, 102400); /* 100KB */ - /* pipeline variables */ + /* loop variables */ const BYTE* ip0 = istart; const BYTE* ip1; size_t hash0; @@ -409,6 +409,8 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( size_t dictHash; /* inside the do{} loop, this is the hash for ip1 */ U32 matchIndex; U32 dictMatchIndex; + size_t mLength=0; /* initialize to avoid compiler warning, assert != 0 later on */ + U32 curr; /* if a dictionary is still attached, it necessarily means that * it is within window size. So we just check it. */ @@ -433,11 +435,10 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( _start: /* Requires: ip0 */ assert(ip0 == anchor); - ip1 = ip0 + 1; + assert(stepSize >= 1); - if (ip1 >= ilimit) { - goto _cleanup; - } + ip1 = ip0 + stepSize; + if (ip1 >= ilimit) goto _cleanup; hash0 = ZSTD_hashPtr(ip0, hlog, mls); dictHash = ZSTD_hashPtr(ip0, dictHLog, mls); @@ -445,19 +446,26 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( matchIndex = hashTable[hash0]; hash1 = ZSTD_hashPtr(ip1, hlog, mls); dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); + + PREFETCH_L1(hashTable + hash1); PREFETCH_L1(dictHashTable + dictHash); /* Main Search Loop */ - do { - size_t mLength; - U32 const curr = (U32)(ip0-base); + while (1) { const BYTE* match = base + matchIndex; - const U32 repIndex = curr + 1 - offset_1; + const U32 repIndex = (U32)(ip0-base) + 1 - offset_1; const BYTE* repMatch = (repIndex < prefixStartIndex) ? 
dictBase + (repIndex - dictIndexDelta) : base + repIndex; + const size_t posIncr = ((ip1 - anchor) >> kSearchStrength) + stepSize; + const BYTE* dictIndexPrefetchPtr = ip1 + (posIncr << 2); + curr = (U32)(ip0-base); hashTable[hash0] = curr; /* update hash table */ + /* Cold dict optimization (may need to gate this) */ + if (dictIndexPrefetchPtr < ilimit) { + PREFETCH_L2(dictHashTable + ZSTD_hashPtr(ip1 + (posIncr << 2), dictHLog, mls)); + } if (dictPrefetchPtr >= dictPrefetchLimit) { PREFETCH_L2(dictPrefetchPtr); PREFETCH_L2(dictPrefetchPtr - 64); @@ -470,21 +478,12 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( mLength = ZSTD_count_2segments(ip0+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; ip0++; ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match; } else if ( (matchIndex <= prefixStartIndex) ) { + /* We only look for a dict match if the normal matchIndex is invalid */ const BYTE* dictMatch = dictBase + dictMatchIndex; - if (dictMatchIndex <= dictStartIndex || - MEM_read32(dictMatch) != MEM_read32(ip0)) { - assert(stepSize >= 1); - dictMatchIndex = dictHashTable[dictHash]; - matchIndex = hashTable[hash1]; - ip0 = ip1; - ip1 = ip1 + ((ip1-anchor) >> kSearchStrength) + stepSize; - hash0 = hash1; - hash1 = ZSTD_hashPtr(ip1, hlog, mls); - dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); - PREFETCH_L1(dictHashTable + dictHash); - continue; - } else { + if (dictMatchIndex > dictStartIndex && + MEM_read32(dictMatch) == MEM_read32(ip0)) { /* found a dict match */ U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); mLength = ZSTD_count_2segments(ip0+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; @@ -495,20 +494,9 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( offset_2 = offset_1; offset_1 = offset; ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + goto _match; } - } else if (MEM_read32(match) != MEM_read32(ip0)) { - /* it's not a match, and we're not going to check the dictionary */ - assert(stepSize >= 1); - dictMatchIndex = dictHashTable[dictHash]; - matchIndex = hashTable[hash1]; - ip0 = ip1; - ip1 = ip1 + ((ip1-anchor) >> kSearchStrength) + stepSize; - hash0 = hash1; - hash1 = ZSTD_hashPtr(ip1, hlog, mls); - dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); - PREFETCH_L1(dictHashTable + dictHash); - continue; - } else { + } else if (MEM_read32(match) == MEM_read32(ip0)) { /* found a regular match */ U32 const offset = (U32)(ip0-match); mLength = ZSTD_count(ip0+4, match+4, iend) + 4; @@ -517,42 +505,63 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( offset_2 = offset_1; offset_1 = offset; ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + goto _match; } - /* match found */ - ip0 += mLength; - anchor = ip0; + /* Prepare for next iteration */ + dictMatchIndex = dictHashTable[dictHash]; + matchIndex = hashTable[hash1]; - if (ip0 <= ilimit) { - /* Fill Table */ - assert(base+curr+2 > istart); /* check base overflow */ - hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ - hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + ip0 = ip1; + ip1 = ip1 + posIncr; + if (ip1 >= ilimit) goto _cleanup; - /* check immediate repcode */ - while (ip0 <= ilimit) { - U32 const current2 = (U32)(ip0-base); - U32 const repIndex2 = current2 - offset_2; - const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? 
- dictBase - dictIndexDelta + repIndex2 : - base + repIndex2; - if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) - && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { - const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; - size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; - U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); - hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; - ip0 += repLength2; - anchor = ip0; - continue; - } - break; + hash0 = hash1; + hash1 = ZSTD_hashPtr(ip1, hlog, mls); + dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); + + PREFETCH_L1(hashTable + hash1); + PREFETCH_L1(dictHashTable + dictHash); + } + +_match: + /* match found */ + assert(mLength); + ip0 += mLength; + anchor = ip0; + + if (ip0 <= ilimit) { + /* Fill Table */ + assert(base + curr + 2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base + curr + 2, hlog, mls)] = curr + 2; /* here because curr+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip0 - 2, hlog, mls)] = (U32) (ip0 - 2 - base); + + /* check immediate repcode */ + while (ip0 <= ilimit) { + U32 const current2 = (U32) (ip0 - base); + U32 const repIndex2 = current2 - offset_2; + const BYTE *repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; + if (((U32) ((prefixStartIndex - 1) - (U32) repIndex2) >= 3 /* intentional overflow */) + && (MEM_read32(repMatch2) == MEM_read32(ip0))) { + const BYTE *const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; + size_t const repLength2 = + ZSTD_count_2segments(ip0 + 4, repMatch2 + 4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; + offset_2 = offset_1; + offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; + ip0 += repLength2; + anchor = ip0; + continue; } + break; } + } - goto _start; /* found match, reset pipeline */ - } while (ip1 < ilimit); + goto _start; /* found match, reset pipeline */ _cleanup: /* save reps for next block */ From c0a306bdd74e55ba65584250d665171e54dd6843 Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Wed, 2 Mar 2022 10:41:08 -0500 Subject: [PATCH 04/10] Remove L1 prefetch --- lib/compress/zstd_fast.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 985c802de2c..4eddd0130ac 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -447,9 +447,6 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( hash1 = ZSTD_hashPtr(ip1, hlog, mls); dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); - PREFETCH_L1(hashTable + hash1); - PREFETCH_L1(dictHashTable + dictHash); - /* Main Search Loop */ while (1) { const BYTE* match = base + matchIndex; @@ -519,9 +516,6 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( hash0 = hash1; hash1 = ZSTD_hashPtr(ip1, hlog, mls); dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); - - PREFETCH_L1(hashTable + hash1); - PREFETCH_L1(dictHashTable + dictHash); } _match: From 6f6de0605b4d464de823a9ee7f1453c9760775b4 Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Wed, 2 Mar 2022 12:46:36 -0500 Subject: [PATCH 05/10] Remove L2 prefetching --- lib/compress/zstd_fast.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) 
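Note (illustration only, not part of the patch): the prefetching removed in patches 4 and 5 followed a standard streaming-prefetch pattern: walk the attached dictionary one cache line per search iteration, and request the dict hash-table slot a few positions ahead, so both are warm by the time the matcher reaches them. A self-contained sketch of that pattern under stated assumptions, with GCC/Clang's __builtin_prefetch standing in for zstd's PREFETCH_L2 wrapper; all names here are hypothetical:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define CACHE_LINE 64

static size_t toy_hash(const uint8_t* p, unsigned tableLog) {
    uint32_t v; memcpy(&v, p, sizeof v);
    return (size_t)((v * 2654435761U) >> (32 - tableLog));
}

static void search_with_prefetch(const uint8_t* ip, const uint8_t* iend,
                                 const uint32_t* dictHashTable, unsigned dictHLog,
                                 const uint8_t* dictEnd, size_t dictSize)
{
    /* walk the dictionary content backwards, one cache line per iteration */
    const uint8_t* dictPrefetchPtr = dictEnd;
    const uint8_t* const dictPrefetchLimit = dictEnd - dictSize;

    for (; ip < iend; ip++) {
        if (dictPrefetchPtr >= dictPrefetchLimit) {
            __builtin_prefetch(dictPrefetchPtr, 0, 2);  /* rw=0, ~L2 locality */
            dictPrefetchPtr -= CACHE_LINE;              /* prefetches never fault */
        }
        /* request the dict hash-table slot for a position a few bytes ahead */
        if (ip + 8 <= iend)
            __builtin_prefetch(dictHashTable + toy_hash(ip + 4, dictHLog), 0, 2);

        /* ... the actual match search at ip would go here ... */
    }
}

The two removal commits suggest, though they do not state, that the extra prefetch instructions did not pay for themselves in benchmarks: prefetches issued on every iteration cost issue slots even when the target is already cached.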
diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 4eddd0130ac..5f2178e3f67 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -398,8 +398,6 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); const U32 dictHLog = dictCParams->hashLog; - const BYTE* dictPrefetchPtr = dictEnd; - const BYTE* const dictPrefetchLimit = dictPrefetchPtr - MIN(dictEnd - dictStart, 102400); /* 100KB */ /* loop variables */ const BYTE* ip0 = istart; @@ -454,21 +452,9 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( const BYTE* repMatch = (repIndex < prefixStartIndex) ? dictBase + (repIndex - dictIndexDelta) : base + repIndex; - const size_t posIncr = ((ip1 - anchor) >> kSearchStrength) + stepSize; - const BYTE* dictIndexPrefetchPtr = ip1 + (posIncr << 2); curr = (U32)(ip0-base); hashTable[hash0] = curr; /* update hash table */ - /* Cold dict optimization (may need to gate this) */ - if (dictIndexPrefetchPtr < ilimit) { - PREFETCH_L2(dictHashTable + ZSTD_hashPtr(ip1 + (posIncr << 2), dictHLog, mls)); - } - if (dictPrefetchPtr >= dictPrefetchLimit) { - PREFETCH_L2(dictPrefetchPtr); - PREFETCH_L2(dictPrefetchPtr - 64); - dictPrefetchPtr -= 64; - } - if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ && (MEM_read32(repMatch) == MEM_read32(ip0+1)) ) { const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; @@ -510,7 +496,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( matchIndex = hashTable[hash1]; ip0 = ip1; - ip1 = ip1 + posIncr; + ip1 = ip1 + ((ip1 - anchor) >> kSearchStrength) + stepSize; if (ip1 >= ilimit) goto _cleanup; hash0 = hash1; From ced6f72fbf50d1973080e6ac2e83a9619c8205c2 Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Wed, 2 Mar 2022 13:05:03 -0500 Subject: [PATCH 06/10] Reduce # of gotos --- lib/compress/zstd_fast.c | 212 +++++++++++++++++++-------------------- 1 file changed, 104 insertions(+), 108 deletions(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 5f2178e3f67..a5ab068daca 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -401,14 +401,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( /* loop variables */ const BYTE* ip0 = istart; - const BYTE* ip1; - size_t hash0; - size_t hash1; - size_t dictHash; /* inside the do{} loop, this is the hash for ip1 */ - U32 matchIndex; - U32 dictMatchIndex; - size_t mLength=0; /* initialize to avoid compiler warning, assert != 0 later on */ - U32 curr; + const BYTE* ip1 = ip0 + stepSize; /* if a dictionary is still attached, it necessarily means that * it is within window size. So we just check it. 
*/ @@ -431,117 +424,120 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( assert(offset_1 <= dictAndPrefixLength); assert(offset_2 <= dictAndPrefixLength); -_start: /* Requires: ip0 */ - assert(ip0 == anchor); - assert(stepSize >= 1); - - ip1 = ip0 + stepSize; - if (ip1 >= ilimit) goto _cleanup; - - hash0 = ZSTD_hashPtr(ip0, hlog, mls); - dictHash = ZSTD_hashPtr(ip0, dictHLog, mls); - dictMatchIndex = dictHashTable[dictHash]; - matchIndex = hashTable[hash0]; - hash1 = ZSTD_hashPtr(ip1, hlog, mls); - dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); - - /* Main Search Loop */ - while (1) { - const BYTE* match = base + matchIndex; - const U32 repIndex = (U32)(ip0-base) + 1 - offset_1; - const BYTE* repMatch = (repIndex < prefixStartIndex) ? - dictBase + (repIndex - dictIndexDelta) : - base + repIndex; - curr = (U32)(ip0-base); - hashTable[hash0] = curr; /* update hash table */ - - if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ - && (MEM_read32(repMatch) == MEM_read32(ip0+1)) ) { - const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; - mLength = ZSTD_count_2segments(ip0+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; - ip0++; - ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); - goto _match; - } else if ( (matchIndex <= prefixStartIndex) ) { - /* We only look for a dict match if the normal matchIndex is invalid */ - const BYTE* dictMatch = dictBase + dictMatchIndex; - if (dictMatchIndex > dictStartIndex && - MEM_read32(dictMatch) == MEM_read32(ip0)) { - /* found a dict match */ - U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); - mLength = ZSTD_count_2segments(ip0+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; - while (((ip0>anchor) & (dictMatch>dictStart)) - && (ip0[-1] == dictMatch[-1])) { - ip0--; dictMatch--; mLength++; + /* Outer search loop */ + while (ip1 < ilimit) { + size_t mLength; + size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); + size_t dictHash = ZSTD_hashPtr(ip0, dictHLog, mls); + U32 dictMatchIndex = dictHashTable[dictHash]; + U32 matchIndex = hashTable[hash0]; + U32 curr = (U32)(ip0 - base); + + /* Inner search loop */ + while (1) { + const BYTE *match = base + matchIndex; + const U32 repIndex = curr + 1 - offset_1; + const BYTE *repMatch = (repIndex < prefixStartIndex) ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; + size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); + dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); + hashTable[hash0] = curr; /* update hash table */ + + if (((U32) ((prefixStartIndex - 1) - repIndex) >= + 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ + && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { + const BYTE *const repMatchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; + mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; + ip0++; + ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + break; + } else if ((matchIndex <= prefixStartIndex)) { + /* We only look for a dict match if the normal matchIndex is invalid */ + const BYTE *dictMatch = dictBase + dictMatchIndex; + if (dictMatchIndex > dictStartIndex && + MEM_read32(dictMatch) == MEM_read32(ip0)) { + /* found a dict match */ + U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta); + mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4; + while (((ip0 > anchor) & (dictMatch > dictStart)) + && (ip0[-1] == dictMatch[-1])) { + ip0--; + dictMatch--; + mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + break; + } + } else if (MEM_read32(match) == MEM_read32(ip0)) { + /* found a regular match */ + U32 const offset = (U32) (ip0 - match); + mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4; + while (((ip0 > anchor) & (match > prefixStart)) + && (ip0[-1] == match[-1])) { + ip0--; + match--; + mLength++; } /* catch up */ offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); - goto _match; + ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + break; } - } else if (MEM_read32(match) == MEM_read32(ip0)) { - /* found a regular match */ - U32 const offset = (U32)(ip0-match); - mLength = ZSTD_count(ip0+4, match+4, iend) + 4; - while (((ip0>anchor) & (match>prefixStart)) - && (ip0[-1] == match[-1])) { ip0--; match--; mLength++; } /* catch up */ - offset_2 = offset_1; - offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); - goto _match; - } - /* Prepare for next iteration */ - dictMatchIndex = dictHashTable[dictHash]; - matchIndex = hashTable[hash1]; + /* Prepare for next iteration */ + dictMatchIndex = dictHashTable[dictHash]; + matchIndex = hashTable[hash1]; + ip0 = ip1; + ip1 = ip1 + ((ip1 - anchor) >> kSearchStrength) + stepSize; + if (ip1 >= ilimit) goto _cleanup; + curr = (U32)(ip0 - base); + hash0 = hash1; + } - ip0 = ip1; - ip1 = ip1 + ((ip1 - anchor) >> kSearchStrength) + stepSize; - if (ip1 >= ilimit) goto _cleanup; + /* match found */ + assert(mLength); + ip0 += mLength; + anchor = ip0; - hash0 = hash1; - hash1 = ZSTD_hashPtr(ip1, hlog, mls); - dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); - } - -_match: - /* match found */ - assert(mLength); - ip0 += mLength; - anchor = ip0; + if (ip0 <= ilimit) { + /* Fill Table */ + assert(base + curr + 2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base + curr + 2, hlog, mls)] = curr + 2; /* here because curr+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip0 - 2, hlog, mls)] = (U32) (ip0 - 2 - base); - if (ip0 <= ilimit) { - /* Fill Table */ - assert(base + curr + 2 > istart); /* check base overflow */ - hashTable[ZSTD_hashPtr(base + curr + 2, hlog, mls)] = curr + 2; /* here because curr+2 could be > iend-8 */ - hashTable[ZSTD_hashPtr(ip0 - 2, hlog, mls)] = (U32) (ip0 - 2 - base); - - /* check immediate repcode */ - while (ip0 <= ilimit) { - U32 const current2 = (U32) (ip0 - base); - U32 const repIndex2 = current2 - offset_2; - const BYTE *repMatch2 = repIndex2 < prefixStartIndex ? 
- dictBase - dictIndexDelta + repIndex2 : - base + repIndex2; - if (((U32) ((prefixStartIndex - 1) - (U32) repIndex2) >= 3 /* intentional overflow */) - && (MEM_read32(repMatch2) == MEM_read32(ip0))) { - const BYTE *const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; - size_t const repLength2 = - ZSTD_count_2segments(ip0 + 4, repMatch2 + 4, iend, repEnd2, prefixStart) + 4; - U32 tmpOffset = offset_2; - offset_2 = offset_1; - offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); - hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; - ip0 += repLength2; - anchor = ip0; - continue; + /* check immediate repcode */ + while (ip0 <= ilimit) { + U32 const current2 = (U32) (ip0 - base); + U32 const repIndex2 = current2 - offset_2; + const BYTE *repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; + if (((U32) ((prefixStartIndex - 1) - (U32) repIndex2) >= 3 /* intentional overflow */) + && (MEM_read32(repMatch2) == MEM_read32(ip0))) { + const BYTE *const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; + size_t const repLength2 = + ZSTD_count_2segments(ip0 + 4, repMatch2 + 4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; + offset_2 = offset_1; + offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; + ip0 += repLength2; + anchor = ip0; + continue; + } + break; } - break; } - } - goto _start; /* found match, reset pipeline */ + /* Prepare for next iteration */ + assert(ip0 == anchor); + assert(stepSize >= 1); + ip1 = ip0 + stepSize; + } _cleanup: /* save reps for next block */ From af680806cc8899f59be8e7c2e4f213deec258344 Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Fri, 4 Mar 2022 14:42:02 -0500 Subject: [PATCH 07/10] Cosmetic fixes --- lib/compress/zstd_fast.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index a5ab068daca..81fadba6f60 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -380,6 +380,8 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( U32 const stepSize = cParams->targetLength + !(cParams->targetLength); const BYTE* const base = ms->window.base; const BYTE* const istart = (const BYTE*)src; + const BYTE* ip0 = istart; + const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */ const BYTE* anchor = istart; const U32 prefixStartIndex = ms->window.dictLimit; const BYTE* const prefixStart = base + prefixStartIndex; @@ -399,10 +401,6 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); const U32 dictHLog = dictCParams->hashLog; - /* loop variables */ - const BYTE* ip0 = istart; - const BYTE* ip1 = ip0 + stepSize; - /* if a dictionary is still attached, it necessarily means that * it is within window size. So we just check it. 
*/ const U32 maxDistance = 1U << cParams->windowLog; @@ -425,7 +423,8 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( assert(offset_2 <= dictAndPrefixLength); /* Outer search loop */ - while (ip1 < ilimit) { + assert(stepSize >= 1); + while (ip1 < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ size_t mLength; size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); size_t dictHash = ZSTD_hashPtr(ip0, dictHLog, mls); @@ -452,7 +451,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( ip0++; ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); break; - } else if ((matchIndex <= prefixStartIndex)) { + } else if (matchIndex <= prefixStartIndex) { /* We only look for a dict match if the normal matchIndex is invalid */ const BYTE *dictMatch = dictBase + dictMatchIndex; if (dictMatchIndex > dictStartIndex && @@ -504,25 +503,22 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( if (ip0 <= ilimit) { /* Fill Table */ - assert(base + curr + 2 > istart); /* check base overflow */ - hashTable[ZSTD_hashPtr(base + curr + 2, hlog, mls)] = curr + 2; /* here because curr+2 could be > iend-8 */ - hashTable[ZSTD_hashPtr(ip0 - 2, hlog, mls)] = (U32) (ip0 - 2 - base); + assert(base+curr+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); /* check immediate repcode */ while (ip0 <= ilimit) { - U32 const current2 = (U32) (ip0 - base); + U32 const current2 = (U32)(ip0-base); U32 const repIndex2 = current2 - offset_2; - const BYTE *repMatch2 = repIndex2 < prefixStartIndex ? - dictBase - dictIndexDelta + repIndex2 : - base + repIndex2; - if (((U32) ((prefixStartIndex - 1) - (U32) repIndex2) >= 3 /* intentional overflow */) - && (MEM_read32(repMatch2) == MEM_read32(ip0))) { - const BYTE *const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; - size_t const repLength2 = - ZSTD_count_2segments(ip0 + 4, repMatch2 + 4, iend, repEnd2, prefixStart) + 4; - U32 tmpOffset = offset_2; - offset_2 = offset_1; - offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; + if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) + && (MEM_read32(repMatch2) == MEM_read32(ip0))) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; ip0 += repLength2; @@ -535,7 +531,6 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( /* Prepare for next iteration */ assert(ip0 == anchor); - assert(stepSize >= 1); ip1 = ip0 + stepSize; } From 6cba817c8f2eff8a387b17033a02ec88eab4648e Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Wed, 9 Mar 2022 11:30:40 -0500 Subject: [PATCH 08/10] Check final position sometimes --- lib/compress/zstd_fast.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 81fadba6f60..9c55f24bc13 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -424,7 +424,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( /* Outer search loop */ assert(stepSize >= 1); - while (ip1 < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ + while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ size_t mLength; size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); size_t dictHash = ZSTD_hashPtr(ip0, dictHLog, mls); @@ -491,7 +491,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( matchIndex = hashTable[hash1]; ip0 = ip1; ip1 = ip1 + ((ip1 - anchor) >> kSearchStrength) + stepSize; - if (ip1 >= ilimit) goto _cleanup; + if (ip1 > ilimit) goto _cleanup; curr = (U32)(ip0 - base); hash0 = hash1; } From b4f9a3fac089566170450ffeff524e5c481a8432 Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Thu, 10 Mar 2022 10:15:54 -0500 Subject: [PATCH 09/10] Track step size as in bc768bc --- lib/compress/zstd_fast.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 9c55f24bc13..2409db6d39d 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -431,6 +431,9 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( U32 dictMatchIndex = dictHashTable[dictHash]; U32 matchIndex = hashTable[hash0]; U32 curr = (U32)(ip0 - base); + size_t step = stepSize; + const size_t kStepIncr = 1 << kSearchStrength; + const BYTE* nextStep = ip0 + kStepIncr; /* Inner search loop */ while (1) { @@ -489,9 +492,15 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( /* Prepare for next iteration */ dictMatchIndex = dictHashTable[dictHash]; matchIndex = hashTable[hash1]; + + if (ip1 >= nextStep) { + step++; + nextStep += kStepIncr; + } ip0 = ip1; - ip1 = ip1 + ((ip1 - anchor) >> kSearchStrength) + stepSize; + ip1 = ip1 + step; if (ip1 > ilimit) goto _cleanup; + curr = (U32)(ip0 - base); hash0 = hash1; } From 201f0e25785e57ef9ded9fb7ba3da44bb9ec45e4 Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Thu, 10 Mar 2022 16:39:44 -0500 Subject: [PATCH 10/10] Fix nits --- lib/compress/zstd_fast.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 2409db6d39d..5da108c622a 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -424,11 +424,11 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( /* Outer search loop */ assert(stepSize >= 1); - while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ + while (ip1 <= ilimit) { 
/* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ size_t mLength; size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); - size_t dictHash = ZSTD_hashPtr(ip0, dictHLog, mls); - U32 dictMatchIndex = dictHashTable[dictHash]; + const size_t dictHash0 = ZSTD_hashPtr(ip0, dictHLog, mls); + U32 dictMatchIndex = dictHashTable[dictHash0]; U32 matchIndex = hashTable[hash0]; U32 curr = (U32)(ip0 - base); size_t step = stepSize; @@ -437,26 +437,26 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( /* Inner search loop */ while (1) { - const BYTE *match = base + matchIndex; + const BYTE* match = base + matchIndex; const U32 repIndex = curr + 1 - offset_1; - const BYTE *repMatch = (repIndex < prefixStartIndex) ? + const BYTE* repMatch = (repIndex < prefixStartIndex) ? dictBase + (repIndex - dictIndexDelta) : base + repIndex; - size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); - dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); + const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); + const size_t dictHash1 = ZSTD_hashPtr(ip1, dictHLog, mls); hashTable[hash0] = curr; /* update hash table */ if (((U32) ((prefixStartIndex - 1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { - const BYTE *const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; ip0++; ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); break; } else if (matchIndex <= prefixStartIndex) { /* We only look for a dict match if the normal matchIndex is invalid */ - const BYTE *dictMatch = dictBase + dictMatchIndex; + const BYTE* dictMatch = dictBase + dictMatchIndex; if (dictMatchIndex > dictStartIndex && MEM_read32(dictMatch) == MEM_read32(ip0)) { /* found a dict match */ @@ -490,7 +490,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( } /* Prepare for next iteration */ - dictMatchIndex = dictHashTable[dictHash]; + dictMatchIndex = dictHashTable[dictHash1]; matchIndex = hashTable[hash1]; if (ip1 >= nextStep) { @@ -503,7 +503,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( curr = (U32)(ip0 - base); hash0 = hash1; - } + } /* end inner search loop */ /* match found */ assert(mLength);
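/* [illustration, not part of the patch] The step tracking adopted in
 * "Track step size as in bc768bc", reduced to a toy scanner. Instead of
 * recomputing ((ip - anchor) >> kSearchStrength) + stepSize at every
 * position, keep a running `step` and bump it whenever the cursor crosses
 * a kStepIncr boundary: the same acceleration over matchless regions, with
 * cheaper per-iteration work. Names below are local to this sketch
 * (kSearchStrength is assumed to be 8, as in zstd). */
#include <stddef.h>

#define kSearchStrength 8
#define kStepIncr (1 << kSearchStrength)

static size_t scan(const char* istart, const char* iend, size_t stepSize)
{
    const char* ip = istart;
    size_t step = stepSize;                     /* current skip distance */
    const char* nextStep = istart + kStepIncr;  /* where the skip grows next */
    size_t positionsVisited = 0;

    while (ip < iend) {
        positionsVisited++;                 /* stand-in for one match attempt */
        if (ip >= nextStep) {               /* every kStepIncr bytes without */
            step++;                         /* a match, skip a little faster */
            nextStep += kStepIncr;
        }
        ip += step;
    }
    return positionsVisited;    /* fewer visits on incompressible data */
}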