From 2a402626dd046bf17e6172fe8d829ed5a443fbe3 Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Wed, 28 Dec 2022 16:45:14 -0500 Subject: [PATCH] External matchfinder API (#3333) * First building commit with sample matchfinder * Set up ZSTD_externalMatchCtx struct * move seqBuffer to ZSTD_Sequence* * support non-contiguous dictionary * clean up parens * add clearExternalMatchfinder, handle allocation errors * Add useExternalMatchfinder cParam * validate useExternalMatchfinder cParam * Disable LDM + external matchfinder * Check for static CCtx * Validate mState and mStateDestructor * Improve LDM check to cover both branches * Error API with optional fallback * handle RLE properly for external matchfinder * nit * Move to a CDict-like model for resource ownership * Add hidden useExternalMatchfinder bool to CCtx_params_s * Eliminate malloc, move to cwksp allocation * Handle CCtx reset properly * Ensure seqStore has enough space for external sequences * fix capitalization * Add DEBUGLOG statements * Add compressionLevel param to matchfinder API * fix c99 issues and add a param combination error code * nits * Test external matchfinder API * C90 compat for simpleExternalMatchFinder * Fix some @nocommits and an ASAN bug * nit * nit * nits * forward declare copySequencesToSeqStore functions in zstd_compress_internal.h * nit * nit * nits * Update copyright headers * Fix CMake zstreamtest build * Fix copyright headers (again) * typo * Add externalMatchfinder demo program to make contrib * Reduce memory consumption for small blockSize * ZSTD_postProcessExternalMatchFinderResult nits * test sum(matchlen) + sum(litlen) == srcSize in debug builds * refExternalMatchFinder -> registerExternalMatchFinder * C90 nit * zstreamtest nits * contrib nits * contrib nits * allow block splitter + external matchfinder, refactor * add windowSize param * add contrib/externalMatchfinder/README.md * docs * go back to old RLE heuristic because of the first block issue * fix initializer element 
is not a constant expression * ref contrib from zstd.h * extremely pedantic compiler warning fix, meson fix, typo fix * Additional docs on API limitations * minor nits * Refactor maxNbSeq calculation into a helper function * Fix copyright --- Makefile | 2 + build/cmake/tests/CMakeLists.txt | 2 +- build/meson/tests/meson.build | 6 +- contrib/externalMatchfinder/.gitignore | 2 + contrib/externalMatchfinder/Makefile | 40 ++++ contrib/externalMatchfinder/README.md | 14 ++ contrib/externalMatchfinder/main.c | 107 ++++++++++ contrib/externalMatchfinder/matchfinder.c | 80 ++++++++ contrib/externalMatchfinder/matchfinder.h | 26 +++ lib/common/error_private.c | 2 + lib/compress/zstd_compress.c | 230 ++++++++++++++++++---- lib/compress/zstd_compress_internal.h | 53 +++++ lib/zstd.h | 156 ++++++++++++++- lib/zstd_errors.h | 2 + tests/Makefile | 2 +- tests/external_matchfinder.c | 117 +++++++++++ tests/external_matchfinder.h | 35 ++++ tests/zstreamtest.c | 93 ++++++++- 18 files changed, 929 insertions(+), 40 deletions(-) create mode 100644 contrib/externalMatchfinder/.gitignore create mode 100644 contrib/externalMatchfinder/Makefile create mode 100644 contrib/externalMatchfinder/README.md create mode 100644 contrib/externalMatchfinder/main.c create mode 100644 contrib/externalMatchfinder/matchfinder.c create mode 100644 contrib/externalMatchfinder/matchfinder.h create mode 100644 tests/external_matchfinder.c create mode 100644 tests/external_matchfinder.h diff --git a/Makefile b/Makefile index 10fbe47f72d..d87fc76eb52 100644 --- a/Makefile +++ b/Makefile @@ -123,6 +123,7 @@ contrib: lib $(MAKE) -C contrib/seekable_format/examples all $(MAKE) -C contrib/seekable_format/tests test $(MAKE) -C contrib/largeNbDicts all + $(MAKE) -C contrib/externalMatchfinder all cd build/single_file_libs/ ; ./build_decoder_test.sh cd build/single_file_libs/ ; ./build_library_test.sh @@ -142,6 +143,7 @@ clean: $(Q)$(MAKE) -C contrib/seekable_format/examples $@ > $(VOID) $(Q)$(MAKE) -C 
contrib/seekable_format/tests $@ > $(VOID) $(Q)$(MAKE) -C contrib/largeNbDicts $@ > $(VOID) + $(Q)$(MAKE) -C contrib/externalMatchfinder $@ > $(VOID) $(Q)$(RM) zstd$(EXT) zstdmt$(EXT) tmp* $(Q)$(RM) -r lz4 @echo Cleaning completed diff --git a/build/cmake/tests/CMakeLists.txt b/build/cmake/tests/CMakeLists.txt index 53e0e7b173e..250f0508f37 100644 --- a/build/cmake/tests/CMakeLists.txt +++ b/build/cmake/tests/CMakeLists.txt @@ -81,7 +81,7 @@ add_test(NAME fuzzer COMMAND fuzzer ${ZSTD_FUZZER_FLAGS}) # # zstreamtest # -add_executable(zstreamtest ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c ${TESTS_DIR}/seqgen.c ${TESTS_DIR}/zstreamtest.c) +add_executable(zstreamtest ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c ${TESTS_DIR}/seqgen.c ${TESTS_DIR}/zstreamtest.c ${TESTS_DIR}/external_matchfinder.c) if (NOT MSVC) target_compile_options(zstreamtest PRIVATE "-Wno-deprecated-declarations") endif() diff --git a/build/meson/tests/meson.build b/build/meson/tests/meson.build index f7ba5310188..e70b73432c5 100644 --- a/build/meson/tests/meson.build +++ b/build/meson/tests/meson.build @@ -65,8 +65,10 @@ fuzzer = executable('fuzzer', dependencies: [ testcommon_dep, thread_dep ], install: false) -zstreamtest_sources = [join_paths(zstd_rootdir, 'tests/seqgen.c'), - join_paths(zstd_rootdir, 'tests/zstreamtest.c')] +zstreamtest_sources = [ + join_paths(zstd_rootdir, 'tests/seqgen.c'), + join_paths(zstd_rootdir, 'tests/zstreamtest.c'), + join_paths(zstd_rootdir, 'tests/external_matchfinder.c')] zstreamtest = executable('zstreamtest', zstreamtest_sources, include_directories: test_includes, diff --git a/contrib/externalMatchfinder/.gitignore b/contrib/externalMatchfinder/.gitignore new file mode 100644 index 00000000000..46357ef5800 --- /dev/null +++ b/contrib/externalMatchfinder/.gitignore @@ -0,0 +1,2 @@ +# build artifacts +externalMatchfinder diff --git a/contrib/externalMatchfinder/Makefile 
b/contrib/externalMatchfinder/Makefile new file mode 100644 index 00000000000..2baa558cb56 --- /dev/null +++ b/contrib/externalMatchfinder/Makefile @@ -0,0 +1,40 @@ +# ################################################################ +# Copyright (c) Yann Collet, Meta Platforms, Inc. +# All rights reserved. +# +# This source code is licensed under both the BSD-style license (found in the +# LICENSE file in the root directory of this source tree) and the GPLv2 (found +# in the COPYING file in the root directory of this source tree). +# ################################################################ + +PROGDIR = ../../programs +LIBDIR = ../../lib + +LIBZSTD = $(LIBDIR)/libzstd.a + +CPPFLAGS+= -I$(LIBDIR) -I$(LIBDIR)/compress -I$(LIBDIR)/common + +CFLAGS ?= -O3 +CFLAGS += -std=gnu99 +DEBUGFLAGS= -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \ + -Wstrict-aliasing=1 -Wswitch-enum \ + -Wstrict-prototypes -Wundef -Wpointer-arith \ + -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \ + -Wredundant-decls +CFLAGS += $(DEBUGFLAGS) $(MOREFLAGS) + +default: externalMatchfinder + +all: externalMatchfinder + +externalMatchfinder: matchfinder.c main.c $(LIBZSTD) + $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ + +.PHONY: $(LIBZSTD) +$(LIBZSTD): + $(MAKE) -C $(LIBDIR) libzstd.a CFLAGS="$(CFLAGS)" + +clean: + $(RM) *.o + $(MAKE) -C $(LIBDIR) clean > /dev/null + $(RM) externalMatchfinder diff --git a/contrib/externalMatchfinder/README.md b/contrib/externalMatchfinder/README.md new file mode 100644 index 00000000000..cb7d49d97a8 --- /dev/null +++ b/contrib/externalMatchfinder/README.md @@ -0,0 +1,14 @@ +externalMatchfinder +===================== + +`externalMatchfinder` is a test tool for the external matchfinder API. +It demonstrates how to use the API to perform a simple round-trip test. + +A sample matchfinder is provided in matchfinder.c, but the user can swap +this out with a different one if desired. 
The sample matchfinder implements +LZ compression with a 1024-entry hashtable. Dictionary compression is not currently supported. + +Command line : +``` +externalMatchfinder filename +``` diff --git a/contrib/externalMatchfinder/main.c b/contrib/externalMatchfinder/main.c new file mode 100644 index 00000000000..6971a46c7e2 --- /dev/null +++ b/contrib/externalMatchfinder/main.c @@ -0,0 +1,107 @@ +/* + * Copyright (c) Yann Collet, Meta Platforms, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +#define ZSTD_STATIC_LINKING_ONLY +#include "zstd.h" +#include "zstd_errors.h" +#include "matchfinder.h" // simpleExternalMatchFinder + +#define CHECK(res) \ +do { \ + if (ZSTD_isError(res)) { \ + printf("ERROR: %s\n", ZSTD_getErrorName(res)); \ + return 1; \ + } \ +} while (0) \ + +int main(int argc, char *argv[]) { + if (argc != 2) { + printf("Usage: exampleMatchfinder <file>\n"); + return 1; + } + + ZSTD_CCtx* const zc = ZSTD_createCCtx(); + + int simpleExternalMatchState = 0xdeadbeef; + + // Here is the crucial bit of code!
+ ZSTD_registerExternalMatchFinder( + zc, + &simpleExternalMatchState, + simpleExternalMatchFinder + ); + + { + size_t const res = ZSTD_CCtx_setParameter(zc, ZSTD_c_enableMatchFinderFallback, 1); + CHECK(res); + } + + FILE *f = fopen(argv[1], "rb"); + assert(f); + { + int const ret = fseek(f, 0, SEEK_END); + assert(ret == 0); + } + size_t const srcSize = ftell(f); + { + int const ret = fseek(f, 0, SEEK_SET); + assert(ret == 0); + } + + char* const src = malloc(srcSize + 1); + assert(src); + { + size_t const ret = fread(src, srcSize, 1, f); + assert(ret == 1); + int const ret2 = fclose(f); + assert(ret2 == 0); + } + + size_t const dstSize = ZSTD_compressBound(srcSize); + char* const dst = malloc(dstSize); + assert(dst); + + size_t const cSize = ZSTD_compress2(zc, dst, dstSize, src, srcSize); + CHECK(cSize); + + char* const val = malloc(srcSize); + assert(val); + + { + size_t const res = ZSTD_decompress(val, srcSize, dst, cSize); + CHECK(res); + } + + if (memcmp(src, val, srcSize) == 0) { + printf("Compression and decompression were successful!\n"); + printf("Original size: %lu\n", srcSize); + printf("Compressed size: %lu\n", cSize); + } else { + printf("ERROR: input and validation buffers don't match!\n"); + for (size_t i = 0; i < srcSize; i++) { + if (src[i] != val[i]) { + printf("First bad index: %zu\n", i); + break; + } + } + return 1; + } + + ZSTD_freeCCtx(zc); + free(src); + free(dst); + free(val); + return 0; +} diff --git a/contrib/externalMatchfinder/matchfinder.c b/contrib/externalMatchfinder/matchfinder.c new file mode 100644 index 00000000000..f119193ef1d --- /dev/null +++ b/contrib/externalMatchfinder/matchfinder.c @@ -0,0 +1,80 @@ +/* + * Copyright (c) Yann Collet, Meta Platforms, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). 
+ * You may select, at your option, one of the above-listed licenses. + */ + +#include "zstd_compress_internal.h" +#include "matchfinder.h" + +#define HSIZE 1024 +static U32 const HLOG = 10; +static U32 const MLS = 4; +static U32 const BADIDX = 0xffffffff; + +size_t simpleExternalMatchFinder( + void* externalMatchState, + ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, + int compressionLevel, + size_t windowSize +) { + const BYTE* const istart = (const BYTE*)src; + const BYTE* const iend = istart + srcSize; + const BYTE* ip = istart; + const BYTE* anchor = istart; + size_t seqCount = 0; + U32 hashTable[HSIZE]; + + (void)externalMatchState; + (void)dict; + (void)dictSize; + (void)outSeqsCapacity; + (void)compressionLevel; + + { int i; + for (i=0; i < HSIZE; i++) { + hashTable[i] = BADIDX; + } } + + while (ip + MLS < iend) { + size_t const hash = ZSTD_hashPtr(ip, HLOG, MLS); + U32 const matchIndex = hashTable[hash]; + hashTable[hash] = (U32)(ip - istart); + + if (matchIndex != BADIDX) { + const BYTE* const match = istart + matchIndex; + U32 const matchLen = (U32)ZSTD_count(ip, match, iend); + if (matchLen >= ZSTD_MINMATCH_MIN) { + U32 const litLen = (U32)(ip - anchor); + U32 const offset = (U32)(ip - match); + ZSTD_Sequence const seq = { + offset, litLen, matchLen, 0 + }; + + /* Note: it's crucial to stay within the window size! */ + if (offset <= windowSize) { + outSeqs[seqCount++] = seq; + ip += matchLen; + anchor = ip; + continue; + } + } + } + + ip++; + } + + { ZSTD_Sequence const finalSeq = { + 0, (U32)(iend - anchor), 0, 0 + }; + outSeqs[seqCount++] = finalSeq; + } + + return seqCount; +} diff --git a/contrib/externalMatchfinder/matchfinder.h b/contrib/externalMatchfinder/matchfinder.h new file mode 100644 index 00000000000..f8ba1c96531 --- /dev/null +++ b/contrib/externalMatchfinder/matchfinder.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) Yann Collet, Meta Platforms, Inc. 
+ * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef MATCHFINDER_H +#define MATCHFINDER_H + +#define ZSTD_STATIC_LINKING_ONLY +#include "zstd.h" + +size_t simpleExternalMatchFinder( + void* externalMatchState, + ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, + int compressionLevel, + size_t windowSize +); + +#endif diff --git a/lib/common/error_private.c b/lib/common/error_private.c index fe73c5edd4b..fb4d7059621 100644 --- a/lib/common/error_private.c +++ b/lib/common/error_private.c @@ -31,6 +31,7 @@ const char* ERR_getErrorString(ERR_enum code) case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification"; case PREFIX(parameter_unsupported): return "Unsupported parameter"; + case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters"; case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; case PREFIX(init_missing): return "Context should be init first"; case PREFIX(memory_allocation): return "Allocation error : not enough memory"; @@ -51,6 +52,7 @@ const char* ERR_getErrorString(ERR_enum code) case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; case PREFIX(srcBuffer_wrong): return "Source buffer is wrong"; + case PREFIX(externalMatchFinder_failed): return "External matchfinder returned an error code"; case PREFIX(maxCode): default: return notErrorCode; } diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 
4cf2c09456e..9ded251295f 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -278,6 +278,16 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable; } +/* Enables validation for external sequences in debug builds. */ +static int ZSTD_resolveExternalSequenceValidation(int mode) { +#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=2) + (void)mode; + return 1; +#else + return mode; +#endif +} + /* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged. * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */ static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) { @@ -301,6 +311,7 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( } cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); + cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences); assert(!ZSTD_checkCParams(cParams)); return cctxParams; } @@ -362,6 +373,7 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, &params->cParams); cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, &params->cParams); cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, &params->cParams); + cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences); DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm); } @@ -584,6 +596,11 @@ ZSTD_bounds
ZSTD_cParam_getBounds(ZSTD_cParameter param) bounds.upperBound = (int)ZSTD_ps_disable; return bounds; + case ZSTD_c_enableMatchFinderFallback: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + default: bounds.error = ERROR(parameter_unsupported); return bounds; @@ -649,6 +666,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) case ZSTD_c_useRowMatchFinder: case ZSTD_c_deterministicRefPrefix: case ZSTD_c_prefetchCDictTables: + case ZSTD_c_enableMatchFinderFallback: default: return 0; } @@ -705,6 +723,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) case ZSTD_c_useRowMatchFinder: case ZSTD_c_deterministicRefPrefix: case ZSTD_c_prefetchCDictTables: + case ZSTD_c_enableMatchFinderFallback: break; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); @@ -937,6 +956,11 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value; return CCtxParams->prefetchCDictTables; + case ZSTD_c_enableMatchFinderFallback: + BOUNDCHECK(ZSTD_c_enableMatchFinderFallback, value); + CCtxParams->enableMatchFinderFallback = value; + return CCtxParams->enableMatchFinderFallback; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } } @@ -1072,6 +1096,9 @@ size_t ZSTD_CCtxParams_getParameter( case ZSTD_c_prefetchCDictTables: *value = (int)CCtxParams->prefetchCDictTables; break; + case ZSTD_c_enableMatchFinderFallback: + *value = CCtxParams->enableMatchFinderFallback; + break; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } return 0; @@ -1243,6 +1270,7 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, "Can't reset parameters only when not in init stage."); ZSTD_clearAllDicts(cctx); + ZSTD_memset(&cctx->externalMatchCtx, 0, sizeof(cctx->externalMatchCtx)); return ZSTD_CCtxParams_reset(&cctx->requestedParams); } return 0; @@ -1485,6 
+1513,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; } +/* Helper function for calculating memory requirements. + * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */ +static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useExternalMatchFinder) { + U32 const divider = (minMatch==3 || useExternalMatchFinder) ? 3 : 4; + return blockSize / divider; +} + static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( const ZSTD_compressionParameters* cParams, const ldmParams_t* ldmParams, @@ -1492,12 +1527,12 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( const ZSTD_paramSwitch_e useRowMatchFinder, const size_t buffInSize, const size_t buffOutSize, - const U64 pledgedSrcSize) + const U64 pledgedSrcSize, + int useExternalMatchFinder) { size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); - U32 const divider = (cParams->minMatch==3) ? 3 : 4; - size_t const maxNbSeq = blockSize / divider; + size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useExternalMatchFinder); size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); @@ -1516,6 +1551,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); + size_t const externalSeqSpace = useExternalMatchFinder + ? 
ZSTD_cwksp_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) + : 0; + size_t const neededSpace = cctxSpace + entropySpace + @@ -1524,7 +1564,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( ldmSeqSpace + matchStateSize + tokenSpace + - bufferSpace; + bufferSpace + + externalSeqSpace; DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); return neededSpace; @@ -1542,7 +1583,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) * be needed. However, we still allocate two 0-sized buffers, which can * take space under ASAN. */ return ZSTD_estimateCCtxSize_usingCCtxParams_internal( - &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); + &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, params->useExternalMatchFinder); } size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) @@ -1603,7 +1644,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) return ZSTD_estimateCCtxSize_usingCCtxParams_internal( &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, - ZSTD_CONTENTSIZE_UNKNOWN); + ZSTD_CONTENTSIZE_UNKNOWN, params->useExternalMatchFinder); } } @@ -1886,8 +1927,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); - U32 const divider = (params->cParams.minMatch==3) ? 3 : 4; - size_t const maxNbSeq = blockSize / divider; + size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, params->useExternalMatchFinder); size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) ?
ZSTD_compressBound(blockSize) + 1 : 0; @@ -1904,7 +1944,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, size_t const neededSpace = ZSTD_estimateCCtxSize_usingCCtxParams_internal( &params->cParams, &params->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, - buffInSize, buffOutSize, pledgedSrcSize); + buffInSize, buffOutSize, pledgedSrcSize, params->useExternalMatchFinder); int resizeWorkspace; FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); @@ -2017,6 +2057,14 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, zc->ldmState.loadedDictEnd = 0; } + /* reserve space for block-level external sequences */ + if (params->useExternalMatchFinder) { + size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); + zc->externalMatchCtx.seqBufferCapacity = maxNbExternalSeq; + zc->externalMatchCtx.seqBuffer = + (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); + } + DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); @@ -2868,6 +2916,55 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr) ssPtr->longLengthType = ZSTD_llt_none; } +/* ZSTD_postProcessExternalMatchFinderResult() : + * Validates and post-processes sequences obtained through the external matchfinder API: + * - Checks whether nbExternalSeqs represents an error condition. + * - Appends a block delimiter to outSeqs if one is not already present. + * See zstd.h for context regarding block delimiters. + * Returns the number of sequences after post-processing, or an error code.
*/ +static size_t ZSTD_postProcessExternalMatchFinderResult( + ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize +) { + RETURN_ERROR_IF( + nbExternalSeqs > outSeqsCapacity, + externalMatchFinder_failed, + "External matchfinder returned error code %lu", + (unsigned long)nbExternalSeqs + ); + + RETURN_ERROR_IF( + nbExternalSeqs == 0 && srcSize > 0, + externalMatchFinder_failed, + "External matchfinder produced zero sequences for a non-empty src buffer!" + ); + + if (srcSize == 0) { + ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence)); + return 1; + } + + { + ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1]; + + /* We can return early if lastSeq is already a block delimiter. */ + if (lastSeq.offset == 0 && lastSeq.matchLength == 0) { + return nbExternalSeqs; + } + + /* This error condition is only possible if the external matchfinder + * produced an invalid parse, by definition of ZSTD_sequenceBound(). */ + RETURN_ERROR_IF( + nbExternalSeqs == outSeqsCapacity, + externalMatchFinder_failed, + "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!" + ); + + /* lastSeq is not a block delimiter, so we need to append one. */ + ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence)); + return nbExternalSeqs + 1; + } +} + typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) @@ -2915,6 +3012,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) } if (zc->externSeqStore.pos < zc->externSeqStore.size) { assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); + + /* External matchfinder + LDM is technically possible, just not implemented yet. + * We need to revisit soon and implement it. 
*/ + RETURN_ERROR_IF( + zc->appliedParams.useExternalMatchFinder, + parameter_combination_unsupported, + "Long-distance matching with external matchfinder enabled is not currently supported." + ); + /* Updates ldmSeqStore.pos */ lastLLSize = ZSTD_ldm_blockCompress(&zc->externSeqStore, @@ -2926,6 +3032,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { rawSeqStore_t ldmSeqStore = kNullRawSeqStore; + /* External matchfinder + LDM is technically possible, just not implemented yet. + * We need to revisit soon and implement it. */ + RETURN_ERROR_IF( + zc->appliedParams.useExternalMatchFinder, + parameter_combination_unsupported, + "Long-distance matching with external matchfinder enabled is not currently supported." + ); + ldmSeqStore.seq = zc->ldmSequences; ldmSeqStore.capacity = zc->maxNbLdmSequences; /* Updates ldmSeqStore.size */ @@ -2940,10 +3054,64 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) zc->appliedParams.useRowMatchFinder, src, srcSize); assert(ldmSeqStore.pos == ldmSeqStore.size); - } else { /* not long range mode */ + } else if (zc->appliedParams.useExternalMatchFinder) { + assert( + zc->externalMatchCtx.seqBufferCapacity >= ZSTD_sequenceBound(srcSize) + ); + assert(zc->externalMatchCtx.mFinder != NULL); + + { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog; + + size_t const nbExternalSeqs = (zc->externalMatchCtx.mFinder)( + zc->externalMatchCtx.mState, + zc->externalMatchCtx.seqBuffer, + zc->externalMatchCtx.seqBufferCapacity, + src, srcSize, + NULL, 0, /* dict and dictSize, currently not supported */ + zc->appliedParams.compressionLevel, + windowSize + ); + + size_t const nbPostProcessedSeqs = ZSTD_postProcessExternalMatchFinderResult( + zc->externalMatchCtx.seqBuffer, + nbExternalSeqs, + zc->externalMatchCtx.seqBufferCapacity, + srcSize + ); + + /* Return early if there is no error, since 
we don't need to worry about last literals */ + if (!ZSTD_isError(nbPostProcessedSeqs)) { + ZSTD_sequencePosition seqPos = {0,0,0}; + ZSTD_copySequencesToSeqStoreExplicitBlockDelim( + zc, &seqPos, zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs, src, srcSize + ); + ms->ldmSeqStore = NULL; + DEBUGLOG(5, "Copied %lu sequences from external matchfinder to internal seqStore.", (unsigned long)nbExternalSeqs); + return ZSTDbss_compress; + } + + /* Propagate the error if fallback is disabled */ + if (!zc->appliedParams.enableMatchFinderFallback) { + return nbPostProcessedSeqs; + } + + /* Fallback to software matchfinder */ + { ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, + zc->appliedParams.useRowMatchFinder, + dictMode); + ms->ldmSeqStore = NULL; + DEBUGLOG( + 5, + "External matchfinder returned error code %lu. Falling back to internal matchfinder.", + (unsigned long)nbExternalSeqs + ); + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); + } } + } else { /* not long range mode and no external matchfinder */ ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, zc->appliedParams.useRowMatchFinder, dictMode); + assert(zc->externalMatchCtx.mFinder == NULL); ms->ldmSeqStore = NULL; lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); } @@ -5726,6 +5894,7 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, &params.cParams); params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, &params.cParams); params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, &params.cParams); + params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences); #ifdef ZSTD_MULTITHREAD if ((cctx->pledgedSrcSizePlusOne-1) <=
ZSTDMT_JOBSIZE_MIN) { @@ -5927,12 +6096,6 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, } } -typedef struct { - U32 idx; /* Index in array of ZSTD_Sequence */ - U32 posInSequence; /* Position within sequence at idx */ - size_t posInSrc; /* Number of bytes given by sequences provided so far */ -} ZSTD_sequencePosition; - /* ZSTD_validateSequence() : * @offCode : is presumed to follow format required by ZSTD_storeSeq() * @returns a ZSTD error code if sequence is not valid @@ -5970,10 +6133,7 @@ static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 return offBase; } -/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of - * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. - */ -static size_t +size_t ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, @@ -6027,19 +6187,7 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, return 0; } -/* Returns the number of bytes to move the current read position back by. - * Only non-zero if we ended up splitting a sequence. - * Otherwise, it may return a ZSTD error if something went wrong. - * - * This function will attempt to scan through blockSize bytes - * represented by the sequences in @inSeqs, - * storing any (partial) sequences. - * - * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to - * avoid splitting a match, or to avoid splitting a match such that it would produce a match - * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. 
- */ -static size_t +size_t ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, const void* src, size_t blockSize) @@ -6572,3 +6720,19 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); } + +void ZSTD_registerExternalMatchFinder( + ZSTD_CCtx* zc, void* mState, + ZSTD_externalMatchFinder_F* mFinder +) { + ZSTD_externalMatchCtx emctx = { + mState, + mFinder, + + /* seqBuffer is allocated later (from the cwksp) */ + NULL, /* seqBuffer */ + 0 /* seqBufferCapacity */ + }; + zc->externalMatchCtx = emctx; + zc->requestedParams.useExternalMatchFinder = 1; +} diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index e4bb2f5372c..f755a1f79f7 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -150,6 +150,12 @@ typedef struct { size_t capacity; /* The capacity starting from `seq` pointer */ } rawSeqStore_t; +typedef struct { + U32 idx; /* Index in array of ZSTD_Sequence */ + U32 posInSequence; /* Position within sequence at idx */ + size_t posInSrc; /* Number of bytes given by sequences provided so far */ +} ZSTD_sequencePosition; + UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; typedef struct { @@ -340,6 +346,15 @@ struct ZSTD_CCtx_params_s { /* Controls prefetching in some dictMatchState matchfinders */ ZSTD_paramSwitch_e prefetchCDictTables; + + /* Controls whether zstd will fall back to an internal matchfinder + * if the external matchfinder returns an error code. */ + int enableMatchFinderFallback; + + /* Indicates whether an external matchfinder has been referenced. + * Users can't set this externally. + * It is set internally in ZSTD_registerExternalMatchFinder().
*/ + int useExternalMatchFinder; }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) @@ -371,6 +386,14 @@ typedef struct { ZSTD_entropyCTablesMetadata_t entropyMetadata; } ZSTD_blockSplitCtx; +/* Context for block-level external matchfinder API */ +typedef struct { + void* mState; + ZSTD_externalMatchFinder_F* mFinder; + ZSTD_Sequence* seqBuffer; + size_t seqBufferCapacity; +} ZSTD_externalMatchCtx; + struct ZSTD_CCtx_s { ZSTD_compressionStage_e stage; int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */ @@ -440,6 +463,9 @@ struct ZSTD_CCtx_s { /* Workspace for block splitter */ ZSTD_blockSplitCtx blockSplitCtx; + + /* Workspace for external matchfinder */ + ZSTD_externalMatchCtx externalMatchCtx; }; typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; @@ -1411,4 +1437,31 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); */ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); +/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of + * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. + * Note that the block delimiter must include the last literals of the block. + */ +size_t +ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, + const void* src, size_t blockSize); + +/* Returns the number of bytes to move the current read position back by. + * Only non-zero if we ended up splitting a sequence. + * Otherwise, it may return a ZSTD error if something went wrong. + * + * This function will attempt to scan through blockSize bytes + * represented by the sequences in @inSeqs, + * storing any (partial) sequences. 
+ * + * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to + * avoid splitting a match, or to avoid splitting a match such that it would produce a match + * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. + */ +size_t +ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, + const void* src, size_t blockSize); + #endif /* ZSTD_COMPRESS_H */ diff --git a/lib/zstd.h b/lib/zstd.h index dd72e17ed76..14a7d23066d 100644 --- a/lib/zstd.h +++ b/lib/zstd.h @@ -478,6 +478,7 @@ typedef enum { * ZSTD_c_useBlockSplitter * ZSTD_c_useRowMatchFinder * ZSTD_c_prefetchCDictTables + * ZSTD_c_enableMatchFinderFallback * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. * note : never ever use experimentalParam? names directly; * also, the enums values themselves are unstable and can still change. @@ -497,7 +498,8 @@ typedef enum { ZSTD_c_experimentalParam13=1010, ZSTD_c_experimentalParam14=1011, ZSTD_c_experimentalParam15=1012, - ZSTD_c_experimentalParam16=1013 + ZSTD_c_experimentalParam16=1013, + ZSTD_c_experimentalParam17=1014 } ZSTD_cParameter; typedef struct { @@ -560,7 +562,7 @@ typedef enum { * They will be used to compress next frame. * Resetting session never fails. * - The parameters : changes all parameters back to "default". - * This removes any reference to any dictionary too. + * This also removes any reference to any dictionary or external matchfinder. * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) * - Both : similar to resetting the session, followed by resetting parameters. 
@@ -2044,6 +2046,20 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo */ #define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16 +/* ZSTD_c_enableMatchFinderFallback + * Allowed values are 0 (disable) and 1 (enable). The default setting is 0. + * + * Controls whether zstd will fall back to an internal matchfinder if an + * external matchfinder is registered and returns an error code. This fallback is + * block-by-block: the internal matchfinder will only be called for blocks where + * the external matchfinder returns an error code. Fallback compression will + * follow any other cParam settings, such as compression level, the same as in a + * normal (fully-internal) compression operation. + * + * The user is strongly encouraged to read the full external matchfinder API + * documentation (below) before setting this parameter. */ +#define ZSTD_c_enableMatchFinderFallback ZSTD_c_experimentalParam17 + /*! ZSTD_CCtx_getParameter() : * Get the requested compression parameter value, selected by enum ZSTD_cParameter, * and store it into int* value. @@ -2676,6 +2692,142 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_ ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /**< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ +/* ********************** EXTERNAL MATCHFINDER API ********************** + * + * *** OVERVIEW *** + * This API allows users to replace the zstd internal block-level matchfinder + * with an external matchfinder function. Potential applications of the API + * include hardware-accelerated matchfinders and matchfinders specialized to + * particular types of data. + * + * See contrib/externalMatchfinder for an example program employing the + * external matchfinder API. + * + * *** USAGE *** + * The user is responsible for implementing a function of type + * ZSTD_externalMatchFinder_F. 
For each block, zstd will pass the following + * arguments to the user-provided function: + * + * - externalMatchState: a pointer to a user-managed state for the external + * matchfinder. + * + * - outSeqs, outSeqsCapacity: an output buffer for sequences produced by the + * external matchfinder. outSeqsCapacity is guaranteed >= + * ZSTD_sequenceBound(srcSize). The memory backing outSeqs is managed by + * the CCtx. + * + * - src, srcSize: an input buffer which the external matchfinder must parse + * into sequences. srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX. + * + * - dict, dictSize: a history buffer, which may be empty, which the external + * matchfinder may reference as it produces sequences for the src buffer. + * Currently, zstd will always pass dictSize == 0 into external matchfinders, + * but this will change in the future. + * + * - compressionLevel: a signed integer representing the zstd compression level + * set by the user for the current operation. The external matchfinder may + * choose to use this information to change its compression strategy and + * speed/ratio tradeoff. Note: The compression level does not reflect zstd + * parameters set through the advanced API. + * + * - windowSize: a size_t representing the maximum allowed offset for external + * sequences. Note that sequence offsets are sometimes allowed to exceed the + * windowSize if a dictionary is present, see doc/zstd_compression_format.md + * for details. + * + * The user-provided function shall return a size_t representing the number of + * sequences written to outSeqs. This return value will be treated as an error + * code if it is greater than outSeqsCapacity. The return value must be non-zero + * if srcSize is non-zero. The ZSTD_EXTERNAL_MATCHFINDER_ERROR macro is provided + * for convenience, but any value greater than outSeqsCapacity will be treated as + * an error code. 
+ * + * If the user-provided function does not return an error code, the sequences + * written to outSeqs must be a valid parse of the src buffer. Data corruption may + * occur if the parse is not valid. A parse is defined to be valid if the + * following conditions hold: + * - The sum of matchLengths and literalLengths is equal to srcSize. + * - All sequences in the parse have matchLength != 0, except for the final + * sequence. matchLength is not constrained for the final sequence. + * - All offsets respect the windowSize parameter as specified in + * doc/zstd_compression_format.md. + * + * zstd will only validate these conditions (and fail compression if they do not + * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence + * validation has a performance cost. + * + * If the user-provided function returns an error, zstd will either fall back + * to an internal matchfinder or fail the compression operation. The user can + * choose between the two behaviors by setting the + * ZSTD_c_enableMatchFinderFallback cParam. Fallback compression will follow any + * other cParam settings, such as compression level, the same as in a normal + * compression operation. + * + * The user shall instruct zstd to use a particular ZSTD_externalMatchFinder_F + * function by calling ZSTD_registerExternalMatchFinder(cctx, externalMatchState, + * externalMatchFinder). This setting will persist until the next parameter reset + * of the CCtx. + * + * The externalMatchState must be initialized by the user before calling + * ZSTD_registerExternalMatchFinder. The user is responsible for destroying the + * externalMatchState. + * + * *** LIMITATIONS *** + * External matchfinders are compatible with all zstd compression APIs. There are + * only two limitations. + * + * First, the ZSTD_c_enableLongDistanceMatching cParam is not supported. + * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with an + * external matchfinder. 
+ * - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in + * some cases (see its documentation for details). Users must explicitly set + * ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an + * external matchfinder is registered. + * - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default + * whenever the window size set by ZSTD_c_windowLog is < 128MB, but that's subject to change. Users should + * check the docs on ZSTD_c_enableLongDistanceMatching whenever the external + * matchfinder API is used in conjunction with advanced settings (like windowLog). + * + * Second, history buffers are not supported. Concretely, zstd will always pass + * dictSize == 0 to the external matchfinder (for now). This has two implications: + * - Dictionaries are not supported. Compression will *not* fail if the user + * references a dictionary, but the dictionary won't have any effect. + * - Stream history is not supported. All compression APIs, including streaming + * APIs, work with the external matchfinder, but the external matchfinder won't + * receive any history from the previous block. Each block is an independent chunk. + * + * Long-term, we plan to overcome both limitations. There is no technical blocker to + * overcoming them. It is purely a question of engineering effort. + */ + +#define ZSTD_EXTERNAL_MATCHFINDER_ERROR ((size_t)(-1)) + +typedef size_t ZSTD_externalMatchFinder_F ( + void* externalMatchState, + ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, + int compressionLevel, + size_t windowSize +); + +/*! ZSTD_registerExternalMatchFinder() : + * Instruct zstd to use an external matchfinder function. + * + * The externalMatchState must be initialized by the caller, and the caller is + * responsible for managing its lifetime. This parameter is sticky across + * compressions. It will remain set until the user explicitly resets compression + * parameters. 
+ * + * The user is strongly encouraged to read the full API documentation (above) + * before calling this function. */ +ZSTDLIB_STATIC_API void +ZSTD_registerExternalMatchFinder( + ZSTD_CCtx* cctx, + void* externalMatchState, + ZSTD_externalMatchFinder_F* externalMatchFinder +); + #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ #if defined (__cplusplus) diff --git a/lib/zstd_errors.h b/lib/zstd_errors.h index 880e8e4f6da..bd6dbee5ff8 100644 --- a/lib/zstd_errors.h +++ b/lib/zstd_errors.h @@ -75,6 +75,7 @@ typedef enum { ZSTD_error_dictionary_wrong = 32, ZSTD_error_dictionaryCreation_failed = 34, ZSTD_error_parameter_unsupported = 40, + ZSTD_error_parameter_combination_unsupported = 41, ZSTD_error_parameter_outOfBound = 42, ZSTD_error_tableLog_tooLarge = 44, ZSTD_error_maxSymbolValue_tooLarge = 46, @@ -92,6 +93,7 @@ typedef enum { ZSTD_error_seekableIO = 102, ZSTD_error_dstBuffer_wrong = 104, ZSTD_error_srcBuffer_wrong = 105, + ZSTD_error_externalMatchFinder_failed = 106, ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! 
Use ZSTD_isError() instead */ } ZSTD_ErrorCode; diff --git a/tests/Makefile b/tests/Makefile index baf616acb34..8226176cc86 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -169,7 +169,7 @@ fuzzer-dll : $(ZSTDDIR)/common/xxhash.c $(PRGDIR)/util.c $(PRGDIR)/timefn.c $(PR $(CC) $(CPPFLAGS) $(CFLAGS) $(filter %.c,$^) $(LDFLAGS) -o $@$(EXT) CLEAN += zstreamtest zstreamtest32 -ZSTREAM_LOCAL_FILES := $(PRGDIR)/datagen.c $(PRGDIR)/util.c $(PRGDIR)/timefn.c seqgen.c zstreamtest.c +ZSTREAM_LOCAL_FILES := $(PRGDIR)/datagen.c $(PRGDIR)/util.c $(PRGDIR)/timefn.c seqgen.c zstreamtest.c external_matchfinder.c ZSTREAM_PROPER_FILES := $(ZDICT_FILES) $(ZSTREAM_LOCAL_FILES) ZSTREAMFILES := $(ZSTD_FILES) $(ZSTREAM_PROPER_FILES) zstreamtest32 : CFLAGS += -m32 diff --git a/tests/external_matchfinder.c b/tests/external_matchfinder.c new file mode 100644 index 00000000000..8ae76d519ef --- /dev/null +++ b/tests/external_matchfinder.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) Yann Collet, Meta Platforms, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#include "external_matchfinder.h" +#include <string.h> /* memset */ +#include "zstd_compress_internal.h" + +#define HSIZE 1024 +static U32 const HLOG = 10; +static U32 const MLS = 4; +static U32 const BADIDX = 0xffffffff; + +static size_t simpleExternalMatchFinder( + void* externalMatchState, + ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, + int compressionLevel, + size_t windowSize +) { + const BYTE* const istart = (const BYTE*)src; + const BYTE* const iend = istart + srcSize; + const BYTE* ip = istart; + const BYTE* anchor = istart; + size_t seqCount = 0; + U32 hashTable[HSIZE]; + + (void)externalMatchState; + (void)dict; + (void)dictSize; + (void)outSeqsCapacity; + (void)compressionLevel; + + { int i; + for (i=0; i < HSIZE; i++) { + hashTable[i] = BADIDX; + } } + + while (ip + MLS < iend) { + size_t const hash = ZSTD_hashPtr(ip, HLOG, MLS); + U32 const matchIndex = hashTable[hash]; + hashTable[hash] = (U32)(ip - istart); + + if (matchIndex != BADIDX) { + const BYTE* const match = istart + matchIndex; + U32 const matchLen = (U32)ZSTD_count(ip, match, iend); + if (matchLen >= ZSTD_MINMATCH_MIN) { + U32 const litLen = (U32)(ip - anchor); + U32 const offset = (U32)(ip - match); + ZSTD_Sequence const seq = { + offset, litLen, matchLen, 0 + }; + + /* Note: it's crucial to stay within the window size! 
*/ + if (offset <= windowSize) { + outSeqs[seqCount++] = seq; + ip += matchLen; + anchor = ip; + continue; + } + } + } + + ip++; + } + + { ZSTD_Sequence const finalSeq = { + 0, (U32)(iend - anchor), 0, 0 + }; + outSeqs[seqCount++] = finalSeq; + } + + return seqCount; +} + +size_t zstreamExternalMatchFinder( + void* externalMatchState, + ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, + int compressionLevel, + size_t windowSize +) { + EMF_testCase const testCase = *((EMF_testCase*)externalMatchState); + memset(outSeqs, 0, outSeqsCapacity * sizeof(ZSTD_Sequence)); /* capacity counts sequences, not bytes */ + + switch (testCase) { + case EMF_ZERO_SEQS: + return 0; + case EMF_ONE_BIG_SEQ: + outSeqs[0].offset = 0; + outSeqs[0].matchLength = 0; + outSeqs[0].litLength = (U32)(srcSize); + return 1; + case EMF_LOTS_OF_SEQS: + return simpleExternalMatchFinder( + externalMatchState, + outSeqs, outSeqsCapacity, + src, srcSize, + dict, dictSize, + compressionLevel, + windowSize + ); + case EMF_SMALL_ERROR: + return outSeqsCapacity + 1; + case EMF_BIG_ERROR: + default: + return ZSTD_EXTERNAL_MATCHFINDER_ERROR; + } +} diff --git a/tests/external_matchfinder.h b/tests/external_matchfinder.h new file mode 100644 index 00000000000..041f73e4d2a --- /dev/null +++ b/tests/external_matchfinder.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) Yann Collet, Meta Platforms, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#ifndef EXTERNAL_MATCHFINDER +#define EXTERNAL_MATCHFINDER + +#define ZSTD_STATIC_LINKING_ONLY +#include "zstd.h" + +/* See external_matchfinder.c for details on each test case */ +typedef enum { + EMF_ZERO_SEQS = 0, + EMF_ONE_BIG_SEQ = 1, + EMF_LOTS_OF_SEQS = 2, + EMF_BIG_ERROR = 3, + EMF_SMALL_ERROR = 4 +} EMF_testCase; + +size_t zstreamExternalMatchFinder( + void* externalMatchState, + ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, + int compressionLevel, + size_t windowSize +); + +#endif /* EXTERNAL_MATCHFINDER */ diff --git a/tests/zstreamtest.c b/tests/zstreamtest.c index 30c5215ae65..664ff632ac1 100644 --- a/tests/zstreamtest.c +++ b/tests/zstreamtest.c @@ -39,7 +39,7 @@ #include "seqgen.h" #include "util.h" #include "timefn.h" /* UTIL_time_t, UTIL_clockSpanMicro, UTIL_getTime */ - +#include "external_matchfinder.h" /* zstreamExternalMatchFinder, EMF_testCase */ /*-************************************ * Constants @@ -1834,6 +1834,97 @@ static int basicUnitTests(U32 seed, double compressibility, int bigTests) } DISPLAYLEVEL(3, "OK \n"); + DISPLAYLEVEL(3, "test%3i : External matchfinder API: ", testNb++); + { + size_t const dstBufSize = ZSTD_compressBound(CNBufferSize); + BYTE* const dstBuf = (BYTE*)malloc(dstBufSize); /* dstBufSize is already the compress bound */ + size_t const checkBufSize = CNBufferSize; + BYTE* const checkBuf = (BYTE*)malloc(checkBufSize); + int enableFallback; + EMF_testCase externalMatchState; + + CHECK(dstBuf == NULL || checkBuf == NULL, "allocation failed"); + + ZSTD_CCtx_reset(zc, ZSTD_reset_session_and_parameters); + + /* Reference external matchfinder outside the test loop to + * check that the reference is preserved across compressions */ + ZSTD_registerExternalMatchFinder( + zc, + &externalMatchState, + zstreamExternalMatchFinder + ); + + for (enableFallback = 0; enableFallback <= 1; enableFallback++) { /* cover fallback disabled (0) and enabled (1) */ + size_t testCaseId; + + EMF_testCase const EMF_successCases[] = { + 
EMF_ONE_BIG_SEQ, + EMF_LOTS_OF_SEQS, + }; + size_t const EMF_numSuccessCases = 2; + + EMF_testCase const EMF_failureCases[] = { + EMF_ZERO_SEQS, + EMF_BIG_ERROR, + EMF_SMALL_ERROR, + }; + size_t const EMF_numFailureCases = 3; + + /* Test external matchfinder success scenarios */ + for (testCaseId = 0; testCaseId < EMF_numSuccessCases; testCaseId++) { + size_t res; + externalMatchState = EMF_successCases[testCaseId]; + ZSTD_CCtx_reset(zc, ZSTD_reset_session_only); + CHECK_Z(ZSTD_CCtx_setParameter(zc, ZSTD_c_enableMatchFinderFallback, enableFallback)); + res = ZSTD_compress2(zc, dstBuf, dstBufSize, CNBuffer, CNBufferSize); + CHECK(ZSTD_isError(res), "EMF: Compression error: %s", ZSTD_getErrorName(res)); + CHECK_Z(ZSTD_decompress(checkBuf, checkBufSize, dstBuf, res)); + CHECK(memcmp(CNBuffer, checkBuf, CNBufferSize) != 0, "EMF: Corruption!"); + } + + /* Test external matchfinder failure scenarios */ + for (testCaseId = 0; testCaseId < EMF_numFailureCases; testCaseId++) { + size_t res; + externalMatchState = EMF_failureCases[testCaseId]; + ZSTD_CCtx_reset(zc, ZSTD_reset_session_only); + CHECK_Z(ZSTD_CCtx_setParameter(zc, ZSTD_c_enableMatchFinderFallback, enableFallback)); + res = ZSTD_compress2(zc, dstBuf, dstBufSize, CNBuffer, CNBufferSize); + if (enableFallback) { + CHECK_Z(ZSTD_decompress(checkBuf, checkBufSize, dstBuf, res)); + CHECK(memcmp(CNBuffer, checkBuf, CNBufferSize) != 0, "EMF: Corruption!"); + } else { + CHECK(!ZSTD_isError(res), "EMF: Should have raised an error!"); + CHECK( + ZSTD_getErrorCode(res) != ZSTD_error_externalMatchFinder_failed, + "EMF: Wrong error code: %s", ZSTD_getErrorName(res) + ); + } + } + + /* Test compression with external matchfinder + empty src buffer */ + { + size_t res; + externalMatchState = EMF_ZERO_SEQS; + ZSTD_CCtx_reset(zc, ZSTD_reset_session_only); + CHECK_Z(ZSTD_CCtx_setParameter(zc, ZSTD_c_enableMatchFinderFallback, enableFallback)); + res = ZSTD_compress2(zc, dstBuf, dstBufSize, CNBuffer, 0); + CHECK(ZSTD_isError(res), 
"EMF: Compression error: %s", ZSTD_getErrorName(res)); + CHECK(ZSTD_decompress(checkBuf, checkBufSize, dstBuf, res) != 0, "EMF: Empty src round trip failed!"); + } + } + + /* Test that reset clears the external matchfinder */ + ZSTD_CCtx_reset(zc, ZSTD_reset_session_and_parameters); + externalMatchState = EMF_BIG_ERROR; /* ensure zstd will fail if the matchfinder wasn't cleared */ + CHECK_Z(ZSTD_CCtx_setParameter(zc, ZSTD_c_enableMatchFinderFallback, 0)); + CHECK_Z(ZSTD_compress2(zc, dstBuf, dstBufSize, CNBuffer, CNBufferSize)); + + free(dstBuf); + free(checkBuf); + } + DISPLAYLEVEL(3, "OK \n"); + _end: FUZ_freeDictionary(dictionary); ZSTD_freeCStream(zc);