compress: check more bytes to reduce ZSTD_count calls

JunHe77 · JunHe77 · commit 78e60fc52fe0 · 2022-07-19T15:57:26.000+08:00
Compare 4 bytes instead of 1 byte in ZSTD_noDict mode before
calling ZSTD_count. This filters out candidates that match at
match[ml] but mismatch somewhere in match[ml-3]..match[ml-1],
so the number of ZSTD_count calls is reduced.

Signed-off-by: Jun He <jun.he@arm.com>
Change-Id: I3449ea423d5c8e8344f75341f19a2d1643c703f6
diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c
@@ -678,6 +678,11 @@ size_t ZSTD_HcFindBestMatch(
                         ? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
 
     U32 matchIndex;
+    U32 ref = 0, lookup = 0, pr = 0;
+    if (dictMode == ZSTD_noDict) {
+        /* read initial reference */
+        ref = MEM_read32(ip);
+    }
 
     if (dictMode == ZSTD_dedicatedDictSearch) {
         const U32* entry = &dms->hashTable[ddsIdx];
@@ -692,8 +697,14 @@ size_t ZSTD_HcFindBestMatch(
         if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
             const BYTE* const match = base + matchIndex;
             assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
-            if (match[ml] == ip[ml])   /* potentially better */
-                currentMl = ZSTD_count(ip, match, iLimit);
+            if (dictMode == ZSTD_noDict) {
+                lookup = MEM_read32(match + pr); /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
+                if (ref == lookup)   /* potentially better */
+                    currentMl = ZSTD_count(ip, match, iLimit);
+            } else {
+                if (match[ml] == ip[ml])   /* potentially better */
+                    currentMl = ZSTD_count(ip, match, iLimit);
+            }
         } else {
             const BYTE* const match = dictBase + matchIndex;
             assert(match+4 <= dictEnd);
@@ -706,6 +717,11 @@ size_t ZSTD_HcFindBestMatch(
             ml = currentMl;
             *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
             if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+            if (dictMode == ZSTD_noDict) {
+                /* have a new longer ml, now advance one more byte and read new reference */
+                pr = (U32)ml - sizeof(ref) + 1;
+                ref = MEM_read32(ip + pr);
+            }
         }
 
         if (matchIndex <= minChain) break;
@@ -1210,6 +1226,11 @@ size_t ZSTD_RowFindBestMatch(
         size_t numMatches = 0;
         size_t currMatch = 0;
         ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries);
+        U32 ref = 0, lookup = 0, pr = 0;
+        if (dictMode == ZSTD_noDict) {
+            /* read initial ref */
+            ref = MEM_read32(ip);
+        }
 
         /* Cycle through the matches and prefetch */
         for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
@@ -1244,8 +1265,14 @@ size_t ZSTD_RowFindBestMatch(
             if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
                 const BYTE* const match = base + matchIndex;
                 assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
-                if (match[ml] == ip[ml])   /* potentially better */
-                    currentMl = ZSTD_count(ip, match, iLimit);
+                if (dictMode == ZSTD_noDict) {
+                    lookup = MEM_read32(match + pr); /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
+                    if (ref == lookup)   /* potentially better */
+                        currentMl = ZSTD_count(ip, match, iLimit);
+                } else {
+                    if (match[ml] == ip[ml])   /* potentially better */
+                        currentMl = ZSTD_count(ip, match, iLimit);
+                }
             } else {
                 const BYTE* const match = dictBase + matchIndex;
                 assert(match+4 <= dictEnd);
@@ -1258,6 +1285,11 @@ size_t ZSTD_RowFindBestMatch(
                 ml = currentMl;
                 *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
                 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+                if (dictMode == ZSTD_noDict) {
+                    /* have a new longer ml, now advance one more byte and read new reference */
+                    pr = (U32)ml - sizeof(ref) + 1;
+                    ref = MEM_read32(ip + pr);
+                }
             }
         }
     }