From bd4b2c0f19fc4518010ecb592047d833a4c1a741 Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Wed, 23 Feb 2022 18:01:35 -0500 Subject: [PATCH 01/10] prefetch dict content inside loop --- lib/compress/zstd_fast.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 62c4c2cea02..42c52d2af75 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -399,6 +399,8 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart); const U32 dictHLog = dictCParams->hashLog; + const BYTE* dictPrefetchPtr = dictEnd; + const BYTE* const dictPrefetchLimit = dictPrefetchPtr - MIN(dictEnd - dictStart, 102400); /* 100KB */ /* if a dictionary is still attached, it necessarily means that * it is within window size. So we just check it. */ @@ -434,6 +436,12 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( base + repIndex; hashTable[h] = curr; /* update hash table */ + if (dictPrefetchPtr >= dictPrefetchLimit) { + PREFETCH_L2(dictPrefetchPtr); + PREFETCH_L2(dictPrefetchPtr - 64); + dictPrefetchPtr -= 64; + } + if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; From 744f7c19b2155aee5538b071ac7b1e3e7fac6caa Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Mon, 28 Feb 2022 18:08:03 -0500 Subject: [PATCH 02/10] ip0/ip1 pipeline --- lib/compress/zstd_fast.c | 119 +++++++++++++++++++++++++-------------- 1 file changed, 78 insertions(+), 41 deletions(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 42c52d2af75..1b70c4850f5 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -380,7 +380,6 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( U32 const stepSize = cParams->targetLength + !(cParams->targetLength); const BYTE* const base = ms->window.base; const BYTE* const istart = (const BYTE*)src; - const BYTE* ip = istart; const BYTE* anchor = istart; const U32 prefixStartIndex = ms->window.dictLimit; const BYTE* const prefixStart = base + prefixStartIndex; @@ -397,15 +396,24 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( const BYTE* const dictStart = dictBase + dictStartIndex; const BYTE* const dictEnd = dms->window.nextSrc; const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); - const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart); + const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); const U32 dictHLog = dictCParams->hashLog; const BYTE* dictPrefetchPtr = dictEnd; const BYTE* const dictPrefetchLimit = dictPrefetchPtr - MIN(dictEnd - dictStart, 102400); /* 100KB */ + /* pipeline variables */ + const BYTE* ip0 = istart; + const BYTE* ip1; + size_t hash0; + size_t hash1; + size_t dictHash; /* inside the do{} loop, this is the hash for ip1 */ + U32 matchIndex; + U32 dictMatchIndex; + /* if a dictionary is still attached, it necessarily means that * it is within window size. So we just check it. 
*/ const U32 maxDistance = 1U << cParams->windowLog; - const U32 endIndex = (U32)((size_t)(ip - base) + srcSize); + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); assert(endIndex - prefixStartIndex <= maxDistance); (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ @@ -417,24 +425,38 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( /* init */ DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); - ip += (dictAndPrefixLength == 0); + ip0 += (dictAndPrefixLength == 0); /* dictMatchState repCode checks don't currently handle repCode == 0 * disabling. */ assert(offset_1 <= dictAndPrefixLength); assert(offset_2 <= dictAndPrefixLength); +_start: /* Requires: ip0 */ + assert(ip0 == anchor); + ip1 = ip0 + 1; + + if (ip1 >= ilimit) { + goto _cleanup; + } + + hash0 = ZSTD_hashPtr(ip0, hlog, mls); + dictHash = ZSTD_hashPtr(ip0, dictHLog, mls); + dictMatchIndex = dictHashTable[dictHash]; + matchIndex = hashTable[hash0]; + hash1 = ZSTD_hashPtr(ip1, hlog, mls); + dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); + PREFETCH_L1(dictHashTable + dictHash); + /* Main Search Loop */ - while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ + do { size_t mLength; - size_t const h = ZSTD_hashPtr(ip, hlog, mls); - U32 const curr = (U32)(ip-base); - U32 const matchIndex = hashTable[h]; + U32 const curr = (U32)(ip0-base); const BYTE* match = base + matchIndex; const U32 repIndex = curr + 1 - offset_1; const BYTE* repMatch = (repIndex < prefixStartIndex) ? dictBase + (repIndex - dictIndexDelta) : base + repIndex; - hashTable[h] = curr; /* update hash table */ + hashTable[hash0] = curr; /* update hash table */ if (dictPrefetchPtr >= dictPrefetchLimit) { PREFETCH_L2(dictPrefetchPtr); @@ -443,81 +465,96 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( } if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ - && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + && (MEM_read32(repMatch) == MEM_read32(ip0+1)) ) { const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; - mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; - ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + mLength = ZSTD_count_2segments(ip0+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip0++; + ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); } else if ( (matchIndex <= prefixStartIndex) ) { - size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); - U32 const dictMatchIndex = dictHashTable[dictHash]; const BYTE* dictMatch = dictBase + dictMatchIndex; if (dictMatchIndex <= dictStartIndex || - MEM_read32(dictMatch) != MEM_read32(ip)) { + MEM_read32(dictMatch) != MEM_read32(ip0)) { assert(stepSize >= 1); - ip += ((ip-anchor) >> kSearchStrength) + stepSize; + dictMatchIndex = dictHashTable[dictHash]; + matchIndex = hashTable[hash1]; + ip0 = ip1; + ip1 = ip1 + ((ip1-anchor) >> kSearchStrength) + stepSize; + hash0 = hash1; + hash1 = ZSTD_hashPtr(ip1, hlog, mls); + dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); + PREFETCH_L1(dictHashTable + dictHash); continue; } else { /* found a dict match */ U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); - mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; - while (((ip>anchor) & (dictMatch>dictStart)) - && (ip[-1] == dictMatch[-1])) { - ip--; dictMatch--; mLength++; + mLength = ZSTD_count_2segments(ip0+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; + while (((ip0>anchor) & (dictMatch>dictStart)) + && (ip0[-1] == dictMatch[-1])) { + ip0--; dictMatch--; mLength++; } /* catch up */ offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); } - } else if (MEM_read32(match) != MEM_read32(ip)) { + } else if (MEM_read32(match) != MEM_read32(ip0)) { /* it's not a match, and we're not going to check the dictionary */ assert(stepSize >= 1); - ip += ((ip-anchor) >> kSearchStrength) + stepSize; + dictMatchIndex = dictHashTable[dictHash]; + matchIndex = hashTable[hash1]; + ip0 = ip1; + ip1 = ip1 + ((ip1-anchor) >> kSearchStrength) + stepSize; + hash0 = hash1; + hash1 = ZSTD_hashPtr(ip1, hlog, mls); + dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); + PREFETCH_L1(dictHashTable + dictHash); continue; } else { /* found a regular match */ - U32 const offset = (U32)(ip-match); - mLength = ZSTD_count(ip+4, match+4, iend) + 4; - while (((ip>anchor) & (match>prefixStart)) - && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + U32 const offset = (U32)(ip0-match); + mLength = ZSTD_count(ip0+4, match+4, iend) + 4; + while (((ip0>anchor) & (match>prefixStart)) + && (ip0[-1] == match[-1])) { ip0--; match--; mLength++; } /* catch up */ offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); } /* match found */ - ip += mLength; - anchor = ip; + ip0 += mLength; + anchor = ip0; - if (ip <= ilimit) { + if (ip0 <= ilimit) { /* Fill Table */ assert(base+curr+2 > istart); /* check base overflow */ hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ - hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); + hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); 
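/* [illustration, not part of the patch] A minimal sketch of the ip0/ip1
 * pipeline this commit introduces, reduced to a toy single-table matcher.
 * The point: compute the hash for the next position (ip1) before the match
 * at the current position (ip0) is resolved, so the table-load latency for
 * ip1 overlaps the compare/branch work for ip0. Everything below (toy_hash,
 * TOY_TABLE_LOG, find_matches) is hypothetical, not zstd API. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define TOY_TABLE_LOG 12

static size_t toy_hash(const uint8_t* p) {
    uint32_t v; memcpy(&v, p, sizeof v);                /* read 4 bytes */
    return (size_t)((v * 2654435761U) >> (32 - TOY_TABLE_LOG));
}

static size_t find_matches(const uint8_t* src, size_t size) {
    uint32_t table[1 << TOY_TABLE_LOG];
    size_t nbMatches = 0;
    if (size < 5) return 0;
    memset(table, 0, sizeof table);
    {   const uint8_t* const end = src + size - 4;
        const uint8_t* ip0 = src;
        const uint8_t* ip1 = ip0 + 1;
        size_t hash0 = toy_hash(ip0);
        while (ip1 < end) {
            uint32_t const matchIndex = table[hash0];   /* load for ip0 */
            size_t const hash1 = toy_hash(ip1);         /* overlaps with the */
                                                        /* ip0 load/compare  */
            table[hash0] = (uint32_t)(ip0 - src);
            if (matchIndex && !memcmp(src + matchIndex, ip0, 4))
                nbMatches++;    /* a real matcher would emit a sequence here;
                                 * index 0 doubles as "empty", a toy
                                 * simplification the real code avoids */
            ip0 = ip1; ip1++; hash0 = hash1;            /* rotate the pipeline */
        }
    }
    return nbMatches;
}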
/* check immediate repcode */ - while (ip <= ilimit) { - U32 const current2 = (U32)(ip-base); + while (ip0 <= ilimit) { + U32 const current2 = (U32)(ip0-base); U32 const repIndex2 = current2 - offset_2; const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase - dictIndexDelta + repIndex2 : base + repIndex2; if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) - && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; - size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); - hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; - ip += repLength2; - anchor = ip; + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; + ip0 += repLength2; + anchor = ip0; continue; } break; } } - } + goto _start; /* found match, reset pipeline */ + } while (ip1 < ilimit); + +_cleanup: /* save reps for next block */ rep[0] = offset_1 ? offset_1 : offsetSaved; rep[1] = offset_2 ? offset_2 : offsetSaved; From ef1949f2fe6069127ef69e726120b46a24618fe9 Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Tue, 1 Mar 2022 14:14:26 -0500 Subject: [PATCH 03/10] add L2_4 prefetch to dms pipeline --- lib/compress/zstd_fast.c | 137 +++++++++++++++++++++------------------ 1 file changed, 73 insertions(+), 64 deletions(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 1b70c4850f5..985c802de2c 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -401,7 +401,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( const BYTE* dictPrefetchPtr = dictEnd; const BYTE* const dictPrefetchLimit = dictPrefetchPtr - MIN(dictEnd - dictStart, 102400); /* 100KB */ - /* pipeline variables */ + /* loop variables */ const BYTE* ip0 = istart; const BYTE* ip1; size_t hash0; @@ -409,6 +409,8 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( size_t dictHash; /* inside the do{} loop, this is the hash for ip1 */ U32 matchIndex; U32 dictMatchIndex; + size_t mLength=0; /* initialize to avoid compiler warning, assert != 0 later on */ + U32 curr; /* if a dictionary is still attached, it necessarily means that * it is within window size. So we just check it. */ @@ -433,11 +435,10 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( _start: /* Requires: ip0 */ assert(ip0 == anchor); - ip1 = ip0 + 1; + assert(stepSize >= 1); - if (ip1 >= ilimit) { - goto _cleanup; - } + ip1 = ip0 + stepSize; + if (ip1 >= ilimit) goto _cleanup; hash0 = ZSTD_hashPtr(ip0, hlog, mls); dictHash = ZSTD_hashPtr(ip0, dictHLog, mls); @@ -445,19 +446,26 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( matchIndex = hashTable[hash0]; hash1 = ZSTD_hashPtr(ip1, hlog, mls); dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); + + PREFETCH_L1(hashTable + hash1); PREFETCH_L1(dictHashTable + dictHash); /* Main Search Loop */ - do { - size_t mLength; - U32 const curr = (U32)(ip0-base); + while (1) { const BYTE* match = base + matchIndex; - const U32 repIndex = curr + 1 - offset_1; + const U32 repIndex = (U32)(ip0-base) + 1 - offset_1; const BYTE* repMatch = (repIndex < prefixStartIndex) ? 
dictBase + (repIndex - dictIndexDelta) : base + repIndex; + const size_t posIncr = ((ip1 - anchor) >> kSearchStrength) + stepSize; + const BYTE* dictIndexPrefetchPtr = ip1 + (posIncr << 2); + curr = (U32)(ip0-base); hashTable[hash0] = curr; /* update hash table */ + /* Cold dict optimization (may need to gate this) */ + if (dictIndexPrefetchPtr < ilimit) { + PREFETCH_L2(dictHashTable + ZSTD_hashPtr(ip1 + (posIncr << 2), dictHLog, mls)); + } if (dictPrefetchPtr >= dictPrefetchLimit) { PREFETCH_L2(dictPrefetchPtr); PREFETCH_L2(dictPrefetchPtr - 64); @@ -470,21 +478,12 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( mLength = ZSTD_count_2segments(ip0+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; ip0++; ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match; } else if ( (matchIndex <= prefixStartIndex) ) { + /* We only look for a dict match if the normal matchIndex is invalid */ const BYTE* dictMatch = dictBase + dictMatchIndex; - if (dictMatchIndex <= dictStartIndex || - MEM_read32(dictMatch) != MEM_read32(ip0)) { - assert(stepSize >= 1); - dictMatchIndex = dictHashTable[dictHash]; - matchIndex = hashTable[hash1]; - ip0 = ip1; - ip1 = ip1 + ((ip1-anchor) >> kSearchStrength) + stepSize; - hash0 = hash1; - hash1 = ZSTD_hashPtr(ip1, hlog, mls); - dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); - PREFETCH_L1(dictHashTable + dictHash); - continue; - } else { + if (dictMatchIndex > dictStartIndex && + MEM_read32(dictMatch) == MEM_read32(ip0)) { /* found a dict match */ U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); mLength = ZSTD_count_2segments(ip0+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; @@ -495,20 +494,9 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( offset_2 = offset_1; offset_1 = offset; ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + goto _match; } - } else if (MEM_read32(match) != MEM_read32(ip0)) { - /* it's not a match, and we're not going to check the dictionary */ - assert(stepSize >= 1); - dictMatchIndex = dictHashTable[dictHash]; - matchIndex = hashTable[hash1]; - ip0 = ip1; - ip1 = ip1 + ((ip1-anchor) >> kSearchStrength) + stepSize; - hash0 = hash1; - hash1 = ZSTD_hashPtr(ip1, hlog, mls); - dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); - PREFETCH_L1(dictHashTable + dictHash); - continue; - } else { + } else if (MEM_read32(match) == MEM_read32(ip0)) { /* found a regular match */ U32 const offset = (U32)(ip0-match); mLength = ZSTD_count(ip0+4, match+4, iend) + 4; @@ -517,42 +505,63 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( offset_2 = offset_1; offset_1 = offset; ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + goto _match; } - /* match found */ - ip0 += mLength; - anchor = ip0; + /* Prepare for next iteration */ + dictMatchIndex = dictHashTable[dictHash]; + matchIndex = hashTable[hash1]; - if (ip0 <= ilimit) { - /* Fill Table */ - assert(base+curr+2 > istart); /* check base overflow */ - hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ - hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + ip0 = ip1; + ip1 = ip1 + posIncr; + if (ip1 >= ilimit) goto _cleanup; - /* check immediate repcode */ - while (ip0 <= ilimit) { - U32 const current2 = (U32)(ip0-base); - U32 const repIndex2 = current2 - offset_2; - const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? 
- dictBase - dictIndexDelta + repIndex2 : - base + repIndex2; - if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) - && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { - const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; - size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; - U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); - hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; - ip0 += repLength2; - anchor = ip0; - continue; - } - break; + hash0 = hash1; + hash1 = ZSTD_hashPtr(ip1, hlog, mls); + dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); + + PREFETCH_L1(hashTable + hash1); + PREFETCH_L1(dictHashTable + dictHash); + } + +_match: + /* match found */ + assert(mLength); + ip0 += mLength; + anchor = ip0; + + if (ip0 <= ilimit) { + /* Fill Table */ + assert(base + curr + 2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base + curr + 2, hlog, mls)] = curr + 2; /* here because curr+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip0 - 2, hlog, mls)] = (U32) (ip0 - 2 - base); + + /* check immediate repcode */ + while (ip0 <= ilimit) { + U32 const current2 = (U32) (ip0 - base); + U32 const repIndex2 = current2 - offset_2; + const BYTE *repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; + if (((U32) ((prefixStartIndex - 1) - (U32) repIndex2) >= 3 /* intentional overflow */) + && (MEM_read32(repMatch2) == MEM_read32(ip0))) { + const BYTE *const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; + size_t const repLength2 = + ZSTD_count_2segments(ip0 + 4, repMatch2 + 4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; + offset_2 = offset_1; + offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; + ip0 += repLength2; + anchor = ip0; + continue; } + break; } + } - goto _start; /* found match, reset pipeline */ - } while (ip1 < ilimit); + goto _start; /* found match, reset pipeline */ _cleanup: /* save reps for next block */ From c0a306bdd74e55ba65584250d665171e54dd6843 Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Wed, 2 Mar 2022 10:41:08 -0500 Subject: [PATCH 04/10] Remove L1 prefetch --- lib/compress/zstd_fast.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 985c802de2c..4eddd0130ac 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -447,9 +447,6 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( hash1 = ZSTD_hashPtr(ip1, hlog, mls); dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); - PREFETCH_L1(hashTable + hash1); - PREFETCH_L1(dictHashTable + dictHash); - /* Main Search Loop */ while (1) { const BYTE* match = base + matchIndex; @@ -519,9 +516,6 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( hash0 = hash1; hash1 = ZSTD_hashPtr(ip1, hlog, mls); dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); - - PREFETCH_L1(hashTable + hash1); - PREFETCH_L1(dictHashTable + dictHash); } _match: From 6f6de0605b4d464de823a9ee7f1453c9760775b4 Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Wed, 2 Mar 2022 12:46:36 -0500 Subject: [PATCH 05/10] Remove L2 prefetching --- lib/compress/zstd_fast.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) 
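Note (illustration only, not part of the patch): the prefetching removed in patches 4 and 5 followed a standard streaming-prefetch pattern: walk the attached dictionary one cache line per search iteration, and request the dict hash-table slot a few positions ahead, so both are warm by the time the matcher reaches them. A self-contained sketch of that pattern under stated assumptions, with GCC/Clang's __builtin_prefetch standing in for zstd's PREFETCH_L2 wrapper; all names here are hypothetical:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define CACHE_LINE 64

static size_t toy_hash(const uint8_t* p, unsigned tableLog) {
    uint32_t v; memcpy(&v, p, sizeof v);
    return (size_t)((v * 2654435761U) >> (32 - tableLog));
}

static void search_with_prefetch(const uint8_t* ip, const uint8_t* iend,
                                 const uint32_t* dictHashTable, unsigned dictHLog,
                                 const uint8_t* dictEnd, size_t dictSize)
{
    /* walk the dictionary content backwards, one cache line per iteration */
    const uint8_t* dictPrefetchPtr = dictEnd;
    const uint8_t* const dictPrefetchLimit = dictEnd - dictSize;

    for (; ip < iend; ip++) {
        if (dictPrefetchPtr >= dictPrefetchLimit) {
            __builtin_prefetch(dictPrefetchPtr, 0, 2);  /* rw=0, ~L2 locality */
            dictPrefetchPtr -= CACHE_LINE;              /* prefetches never fault */
        }
        /* request the dict hash-table slot for a position a few bytes ahead */
        if (ip + 8 <= iend)
            __builtin_prefetch(dictHashTable + toy_hash(ip + 4, dictHLog), 0, 2);

        /* ... the actual match search at ip would go here ... */
    }
}

The two removal commits suggest, though they do not state, that the extra prefetch instructions did not pay for themselves in benchmarks: prefetches issued on every iteration cost issue slots even when the target is already cached.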
diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 4eddd0130ac..5f2178e3f67 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -398,8 +398,6 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); const U32 dictHLog = dictCParams->hashLog; - const BYTE* dictPrefetchPtr = dictEnd; - const BYTE* const dictPrefetchLimit = dictPrefetchPtr - MIN(dictEnd - dictStart, 102400); /* 100KB */ /* loop variables */ const BYTE* ip0 = istart; @@ -454,21 +452,9 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( const BYTE* repMatch = (repIndex < prefixStartIndex) ? dictBase + (repIndex - dictIndexDelta) : base + repIndex; - const size_t posIncr = ((ip1 - anchor) >> kSearchStrength) + stepSize; - const BYTE* dictIndexPrefetchPtr = ip1 + (posIncr << 2); curr = (U32)(ip0-base); hashTable[hash0] = curr; /* update hash table */ - /* Cold dict optimization (may need to gate this) */ - if (dictIndexPrefetchPtr < ilimit) { - PREFETCH_L2(dictHashTable + ZSTD_hashPtr(ip1 + (posIncr << 2), dictHLog, mls)); - } - if (dictPrefetchPtr >= dictPrefetchLimit) { - PREFETCH_L2(dictPrefetchPtr); - PREFETCH_L2(dictPrefetchPtr - 64); - dictPrefetchPtr -= 64; - } - if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ && (MEM_read32(repMatch) == MEM_read32(ip0+1)) ) { const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; @@ -510,7 +496,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( matchIndex = hashTable[hash1]; ip0 = ip1; - ip1 = ip1 + posIncr; + ip1 = ip1 + ((ip1 - anchor) >> kSearchStrength) + stepSize; if (ip1 >= ilimit) goto _cleanup; hash0 = hash1; From ced6f72fbf50d1973080e6ac2e83a9619c8205c2 Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Wed, 2 Mar 2022 13:05:03 -0500 Subject: [PATCH 06/10] Reduce # of gotos --- lib/compress/zstd_fast.c | 212 +++++++++++++++++++-------------------- 1 file changed, 104 insertions(+), 108 deletions(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 5f2178e3f67..a5ab068daca 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -401,14 +401,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( /* loop variables */ const BYTE* ip0 = istart; - const BYTE* ip1; - size_t hash0; - size_t hash1; - size_t dictHash; /* inside the do{} loop, this is the hash for ip1 */ - U32 matchIndex; - U32 dictMatchIndex; - size_t mLength=0; /* initialize to avoid compiler warning, assert != 0 later on */ - U32 curr; + const BYTE* ip1 = ip0 + stepSize; /* if a dictionary is still attached, it necessarily means that * it is within window size. So we just check it. 
*/ @@ -431,117 +424,120 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( assert(offset_1 <= dictAndPrefixLength); assert(offset_2 <= dictAndPrefixLength); -_start: /* Requires: ip0 */ - assert(ip0 == anchor); - assert(stepSize >= 1); - - ip1 = ip0 + stepSize; - if (ip1 >= ilimit) goto _cleanup; - - hash0 = ZSTD_hashPtr(ip0, hlog, mls); - dictHash = ZSTD_hashPtr(ip0, dictHLog, mls); - dictMatchIndex = dictHashTable[dictHash]; - matchIndex = hashTable[hash0]; - hash1 = ZSTD_hashPtr(ip1, hlog, mls); - dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); - - /* Main Search Loop */ - while (1) { - const BYTE* match = base + matchIndex; - const U32 repIndex = (U32)(ip0-base) + 1 - offset_1; - const BYTE* repMatch = (repIndex < prefixStartIndex) ? - dictBase + (repIndex - dictIndexDelta) : - base + repIndex; - curr = (U32)(ip0-base); - hashTable[hash0] = curr; /* update hash table */ - - if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ - && (MEM_read32(repMatch) == MEM_read32(ip0+1)) ) { - const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; - mLength = ZSTD_count_2segments(ip0+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; - ip0++; - ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); - goto _match; - } else if ( (matchIndex <= prefixStartIndex) ) { - /* We only look for a dict match if the normal matchIndex is invalid */ - const BYTE* dictMatch = dictBase + dictMatchIndex; - if (dictMatchIndex > dictStartIndex && - MEM_read32(dictMatch) == MEM_read32(ip0)) { - /* found a dict match */ - U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); - mLength = ZSTD_count_2segments(ip0+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; - while (((ip0>anchor) & (dictMatch>dictStart)) - && (ip0[-1] == dictMatch[-1])) { - ip0--; dictMatch--; mLength++; + /* Outer search loop */ + while (ip1 < ilimit) { + size_t mLength; + size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); + size_t dictHash = ZSTD_hashPtr(ip0, dictHLog, mls); + U32 dictMatchIndex = dictHashTable[dictHash]; + U32 matchIndex = hashTable[hash0]; + U32 curr = (U32)(ip0 - base); + + /* Inner search loop */ + while (1) { + const BYTE *match = base + matchIndex; + const U32 repIndex = curr + 1 - offset_1; + const BYTE *repMatch = (repIndex < prefixStartIndex) ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; + size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); + dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); + hashTable[hash0] = curr; /* update hash table */ + + if (((U32) ((prefixStartIndex - 1) - repIndex) >= + 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ + && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { + const BYTE *const repMatchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; + mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; + ip0++; + ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + break; + } else if ((matchIndex <= prefixStartIndex)) { + /* We only look for a dict match if the normal matchIndex is invalid */ + const BYTE *dictMatch = dictBase + dictMatchIndex; + if (dictMatchIndex > dictStartIndex && + MEM_read32(dictMatch) == MEM_read32(ip0)) { + /* found a dict match */ + U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta); + mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4; + while (((ip0 > anchor) & (dictMatch > dictStart)) + && (ip0[-1] == dictMatch[-1])) { + ip0--; + dictMatch--; + mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + break; + } + } else if (MEM_read32(match) == MEM_read32(ip0)) { + /* found a regular match */ + U32 const offset = (U32) (ip0 - match); + mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4; + while (((ip0 > anchor) & (match > prefixStart)) + && (ip0[-1] == match[-1])) { + ip0--; + match--; + mLength++; } /* catch up */ offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); - goto _match; + ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + break; } - } else if (MEM_read32(match) == MEM_read32(ip0)) { - /* found a regular match */ - U32 const offset = (U32)(ip0-match); - mLength = ZSTD_count(ip0+4, match+4, iend) + 4; - while (((ip0>anchor) & (match>prefixStart)) - && (ip0[-1] == match[-1])) { ip0--; match--; mLength++; } /* catch up */ - offset_2 = offset_1; - offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); - goto _match; - } - /* Prepare for next iteration */ - dictMatchIndex = dictHashTable[dictHash]; - matchIndex = hashTable[hash1]; + /* Prepare for next iteration */ + dictMatchIndex = dictHashTable[dictHash]; + matchIndex = hashTable[hash1]; + ip0 = ip1; + ip1 = ip1 + ((ip1 - anchor) >> kSearchStrength) + stepSize; + if (ip1 >= ilimit) goto _cleanup; + curr = (U32)(ip0 - base); + hash0 = hash1; + } - ip0 = ip1; - ip1 = ip1 + ((ip1 - anchor) >> kSearchStrength) + stepSize; - if (ip1 >= ilimit) goto _cleanup; + /* match found */ + assert(mLength); + ip0 += mLength; + anchor = ip0; - hash0 = hash1; - hash1 = ZSTD_hashPtr(ip1, hlog, mls); - dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); - } - -_match: - /* match found */ - assert(mLength); - ip0 += mLength; - anchor = ip0; + if (ip0 <= ilimit) { + /* Fill Table */ + assert(base + curr + 2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base + curr + 2, hlog, mls)] = curr + 2; /* here because curr+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip0 - 2, hlog, mls)] = (U32) (ip0 - 2 - base); - if (ip0 <= ilimit) { - /* Fill Table */ - assert(base + curr + 2 > istart); /* check base overflow */ - hashTable[ZSTD_hashPtr(base + curr + 2, hlog, mls)] = curr + 2; /* here because curr+2 could be > iend-8 */ - hashTable[ZSTD_hashPtr(ip0 - 2, hlog, mls)] = (U32) (ip0 - 2 - base); - - /* check immediate repcode */ - while (ip0 <= ilimit) { - U32 const current2 = (U32) (ip0 - base); - U32 const repIndex2 = current2 - offset_2; - const BYTE *repMatch2 = repIndex2 < prefixStartIndex ? 
- dictBase - dictIndexDelta + repIndex2 : - base + repIndex2; - if (((U32) ((prefixStartIndex - 1) - (U32) repIndex2) >= 3 /* intentional overflow */) - && (MEM_read32(repMatch2) == MEM_read32(ip0))) { - const BYTE *const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; - size_t const repLength2 = - ZSTD_count_2segments(ip0 + 4, repMatch2 + 4, iend, repEnd2, prefixStart) + 4; - U32 tmpOffset = offset_2; - offset_2 = offset_1; - offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); - hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; - ip0 += repLength2; - anchor = ip0; - continue; + /* check immediate repcode */ + while (ip0 <= ilimit) { + U32 const current2 = (U32) (ip0 - base); + U32 const repIndex2 = current2 - offset_2; + const BYTE *repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; + if (((U32) ((prefixStartIndex - 1) - (U32) repIndex2) >= 3 /* intentional overflow */) + && (MEM_read32(repMatch2) == MEM_read32(ip0))) { + const BYTE *const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; + size_t const repLength2 = + ZSTD_count_2segments(ip0 + 4, repMatch2 + 4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; + offset_2 = offset_1; + offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; + ip0 += repLength2; + anchor = ip0; + continue; + } + break; } - break; } - } - goto _start; /* found match, reset pipeline */ + /* Prepare for next iteration */ + assert(ip0 == anchor); + assert(stepSize >= 1); + ip1 = ip0 + stepSize; + } _cleanup: /* save reps for next block */ From af680806cc8899f59be8e7c2e4f213deec258344 Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Fri, 4 Mar 2022 14:42:02 -0500 Subject: [PATCH 07/10] Cosmetic fixes --- lib/compress/zstd_fast.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index a5ab068daca..81fadba6f60 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -380,6 +380,8 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( U32 const stepSize = cParams->targetLength + !(cParams->targetLength); const BYTE* const base = ms->window.base; const BYTE* const istart = (const BYTE*)src; + const BYTE* ip0 = istart; + const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */ const BYTE* anchor = istart; const U32 prefixStartIndex = ms->window.dictLimit; const BYTE* const prefixStart = base + prefixStartIndex; @@ -399,10 +401,6 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); const U32 dictHLog = dictCParams->hashLog; - /* loop variables */ - const BYTE* ip0 = istart; - const BYTE* ip1 = ip0 + stepSize; - /* if a dictionary is still attached, it necessarily means that * it is within window size. So we just check it. 
*/ const U32 maxDistance = 1U << cParams->windowLog; @@ -425,7 +423,8 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( assert(offset_2 <= dictAndPrefixLength); /* Outer search loop */ - while (ip1 < ilimit) { + assert(stepSize >= 1); + while (ip1 < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ size_t mLength; size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); size_t dictHash = ZSTD_hashPtr(ip0, dictHLog, mls); @@ -452,7 +451,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( ip0++; ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); break; - } else if ((matchIndex <= prefixStartIndex)) { + } else if (matchIndex <= prefixStartIndex) { /* We only look for a dict match if the normal matchIndex is invalid */ const BYTE *dictMatch = dictBase + dictMatchIndex; if (dictMatchIndex > dictStartIndex && @@ -504,25 +503,22 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( if (ip0 <= ilimit) { /* Fill Table */ - assert(base + curr + 2 > istart); /* check base overflow */ - hashTable[ZSTD_hashPtr(base + curr + 2, hlog, mls)] = curr + 2; /* here because curr+2 could be > iend-8 */ - hashTable[ZSTD_hashPtr(ip0 - 2, hlog, mls)] = (U32) (ip0 - 2 - base); + assert(base+curr+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); /* check immediate repcode */ while (ip0 <= ilimit) { - U32 const current2 = (U32) (ip0 - base); + U32 const current2 = (U32)(ip0-base); U32 const repIndex2 = current2 - offset_2; - const BYTE *repMatch2 = repIndex2 < prefixStartIndex ? - dictBase - dictIndexDelta + repIndex2 : - base + repIndex2; - if (((U32) ((prefixStartIndex - 1) - (U32) repIndex2) >= 3 /* intentional overflow */) - && (MEM_read32(repMatch2) == MEM_read32(ip0))) { - const BYTE *const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; - size_t const repLength2 = - ZSTD_count_2segments(ip0 + 4, repMatch2 + 4, iend, repEnd2, prefixStart) + 4; - U32 tmpOffset = offset_2; - offset_2 = offset_1; - offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; + if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) + && (MEM_read32(repMatch2) == MEM_read32(ip0))) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; ip0 += repLength2; @@ -535,7 +531,6 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( /* Prepare for next iteration */ assert(ip0 == anchor); - assert(stepSize >= 1); ip1 = ip0 + stepSize; } From 6cba817c8f2eff8a387b17033a02ec88eab4648e Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Wed, 9 Mar 2022 11:30:40 -0500 Subject: [PATCH 08/10] Check final position sometimes --- lib/compress/zstd_fast.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 81fadba6f60..9c55f24bc13 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -424,7 +424,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( /* Outer search loop */ assert(stepSize >= 1); - while (ip1 < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ + while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ size_t mLength; size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); size_t dictHash = ZSTD_hashPtr(ip0, dictHLog, mls); @@ -491,7 +491,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( matchIndex = hashTable[hash1]; ip0 = ip1; ip1 = ip1 + ((ip1 - anchor) >> kSearchStrength) + stepSize; - if (ip1 >= ilimit) goto _cleanup; + if (ip1 > ilimit) goto _cleanup; curr = (U32)(ip0 - base); hash0 = hash1; } From b4f9a3fac089566170450ffeff524e5c481a8432 Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Thu, 10 Mar 2022 10:15:54 -0500 Subject: [PATCH 09/10] Track step size as in bc768bc --- lib/compress/zstd_fast.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 9c55f24bc13..2409db6d39d 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -431,6 +431,9 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( U32 dictMatchIndex = dictHashTable[dictHash]; U32 matchIndex = hashTable[hash0]; U32 curr = (U32)(ip0 - base); + size_t step = stepSize; + const size_t kStepIncr = 1 << kSearchStrength; + const BYTE* nextStep = ip0 + kStepIncr; /* Inner search loop */ while (1) { @@ -489,9 +492,15 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( /* Prepare for next iteration */ dictMatchIndex = dictHashTable[dictHash]; matchIndex = hashTable[hash1]; + + if (ip1 >= nextStep) { + step++; + nextStep += kStepIncr; + } ip0 = ip1; - ip1 = ip1 + ((ip1 - anchor) >> kSearchStrength) + stepSize; + ip1 = ip1 + step; if (ip1 > ilimit) goto _cleanup; + curr = (U32)(ip0 - base); hash0 = hash1; } From 201f0e25785e57ef9ded9fb7ba3da44bb9ec45e4 Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Thu, 10 Mar 2022 16:39:44 -0500 Subject: [PATCH 10/10] Fix nits --- lib/compress/zstd_fast.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 2409db6d39d..5da108c622a 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -424,11 +424,11 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( /* Outer search loop */ assert(stepSize >= 1); - while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ + while (ip1 <= ilimit) { 
/* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ size_t mLength; size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); - size_t dictHash = ZSTD_hashPtr(ip0, dictHLog, mls); - U32 dictMatchIndex = dictHashTable[dictHash]; + const size_t dictHash0 = ZSTD_hashPtr(ip0, dictHLog, mls); + U32 dictMatchIndex = dictHashTable[dictHash0]; U32 matchIndex = hashTable[hash0]; U32 curr = (U32)(ip0 - base); size_t step = stepSize; @@ -437,26 +437,26 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( /* Inner search loop */ while (1) { - const BYTE *match = base + matchIndex; + const BYTE* match = base + matchIndex; const U32 repIndex = curr + 1 - offset_1; - const BYTE *repMatch = (repIndex < prefixStartIndex) ? + const BYTE* repMatch = (repIndex < prefixStartIndex) ? dictBase + (repIndex - dictIndexDelta) : base + repIndex; - size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); - dictHash = ZSTD_hashPtr(ip1, dictHLog, mls); + const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); + const size_t dictHash1 = ZSTD_hashPtr(ip1, dictHLog, mls); hashTable[hash0] = curr; /* update hash table */ if (((U32) ((prefixStartIndex - 1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { - const BYTE *const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; ip0++; ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); break; } else if (matchIndex <= prefixStartIndex) { /* We only look for a dict match if the normal matchIndex is invalid */ - const BYTE *dictMatch = dictBase + dictMatchIndex; + const BYTE* dictMatch = dictBase + dictMatchIndex; if (dictMatchIndex > dictStartIndex && MEM_read32(dictMatch) == MEM_read32(ip0)) { /* found a dict match */ @@ -490,7 +490,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( } /* Prepare for next iteration */ - dictMatchIndex = dictHashTable[dictHash]; + dictMatchIndex = dictHashTable[dictHash1]; matchIndex = hashTable[hash1]; if (ip1 >= nextStep) { @@ -503,7 +503,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( curr = (U32)(ip0 - base); hash0 = hash1; - } + } /* end inner search loop */ /* match found */ assert(mLength);
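/* [illustration, not part of the patch] The step tracking adopted in
 * "Track step size as in bc768bc", reduced to a toy scanner. Instead of
 * recomputing ((ip - anchor) >> kSearchStrength) + stepSize at every
 * position, keep a running `step` and bump it whenever the cursor crosses
 * a kStepIncr boundary: the same acceleration over matchless regions, with
 * cheaper per-iteration work. Names below are local to this sketch
 * (kSearchStrength is assumed to be 8, as in zstd). */
#include <stddef.h>

#define kSearchStrength 8
#define kStepIncr (1 << kSearchStrength)

static size_t scan(const char* istart, const char* iend, size_t stepSize)
{
    const char* ip = istart;
    size_t step = stepSize;                     /* current skip distance */
    const char* nextStep = istart + kStepIncr;  /* where the skip grows next */
    size_t positionsVisited = 0;

    while (ip < iend) {
        positionsVisited++;                 /* stand-in for one match attempt */
        if (ip >= nextStep) {               /* every kStepIncr bytes without */
            step++;                         /* a match, skip a little faster */
            nextStep += kStepIncr;
        }
        ip += step;
    }
    return positionsVisited;    /* fewer visits on incompressible data */
}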