diff --git a/contrib/externalMatchfinder/Makefile b/contrib/externalMatchfinder/Makefile new file mode 100644 index 00000000000..46a5fc2dc08 --- /dev/null +++ b/contrib/externalMatchfinder/Makefile @@ -0,0 +1,40 @@ +# ################################################################ +# Copyright (c) 2018-present, Yann Collet, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under both the BSD-style license (found in the +# LICENSE file in the root directory of this source tree) and the GPLv2 (found +# in the COPYING file in the root directory of this source tree). +# ################################################################ + +PROGDIR = ../../programs +LIBDIR = ../../lib + +LIBZSTD = $(LIBDIR)/libzstd.a + +CPPFLAGS+= -I$(LIBDIR) -I$(LIBDIR)/compress -I$(LIBDIR)/common + +CFLAGS ?= -O3 +CFLAGS += -std=gnu99 +DEBUGFLAGS= -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \ + -Wstrict-aliasing=1 -Wswitch-enum \ + -Wstrict-prototypes -Wundef -Wpointer-arith \ + -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \ + -Wredundant-decls +CFLAGS += $(DEBUGFLAGS) $(MOREFLAGS) + +default: externalMatchfinder + +all: externalMatchfinder + +externalMatchfinder: matchfinder.c main.c $(LIBZSTD) + $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ + +.PHONY: $(LIBZSTD) +$(LIBZSTD): + $(MAKE) -C $(LIBDIR) libzstd.a CFLAGS="$(CFLAGS)" + +clean: + $(RM) *.o + $(MAKE) -C $(LIBDIR) clean > /dev/null + $(RM) externalMatchfinder diff --git a/contrib/externalMatchfinder/main.c b/contrib/externalMatchfinder/main.c new file mode 100644 index 00000000000..003b4b55783 --- /dev/null +++ b/contrib/externalMatchfinder/main.c @@ -0,0 +1,80 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#define ZSTD_STATIC_LINKING_ONLY +#include "zstd.h" +#include "zstd_errors.h" +#include "matchfinder.h" // simpleExternalMatchFinder + +int main(int argc, char *argv[]) { + size_t res; + + if (argc != 2) { + printf("Usage: exampleMatchfinder <file>\n"); + return 1; + } + + ZSTD_CCtx* zc = 
ZSTD_createCCtx(); + + int simpleExternalMatchState = 0xdeadbeef; + + // Here is the crucial bit of code! + ZSTD_refExternalMatchFinder( + zc, + &simpleExternalMatchState, + simpleExternalMatchFinder + ); + + res = ZSTD_CCtx_setParameter(zc, ZSTD_c_enableMatchFinderFallback, 1); + + if (ZSTD_isError(res)) { + printf("ERROR: %s\n", ZSTD_getErrorName(res)); + return 1; + } + + FILE *f = fopen(argv[1], "rb"); + fseek(f, 0, SEEK_END); + long const srcSize = ftell(f); + fseek(f, 0, SEEK_SET); + + char *src = malloc(srcSize + 1); + fread(src, srcSize, 1, f); + fclose(f); + + size_t const dstSize = ZSTD_compressBound(srcSize); + char *dst = malloc(dstSize); + + size_t const cSize = ZSTD_compress2(zc, dst, dstSize, src, srcSize); + + if (ZSTD_isError(cSize)) { + printf("ERROR: %s\n", ZSTD_getErrorName(cSize)); + return 1; + } + + char *val = malloc(srcSize); + res = ZSTD_decompress(val, srcSize, dst, cSize); + + ZSTD_freeCCtx(zc); + + if (ZSTD_isError(res)) { + printf("ERROR: %s\n", ZSTD_getErrorName(res)); + return 1; + } + + if (memcmp(src, val, srcSize) == 0) { + printf("Compression and decompression were successful!\n"); + printf("Original size: %lu\n", srcSize); + printf("Compressed size: %lu\n", cSize); + return 0; + } else { + printf("ERROR: input and validation buffers don't match!\n"); + for (int i = 0; i < srcSize; i++) { + if (src[i] != val[i]) { + printf("First bad index: %d\n", i); + break; + } + } + return 1; + } +} diff --git a/contrib/externalMatchfinder/matchfinder.c b/contrib/externalMatchfinder/matchfinder.c new file mode 100644 index 00000000000..7fabc1a4a1b --- /dev/null +++ b/contrib/externalMatchfinder/matchfinder.c @@ -0,0 +1,65 @@ +#include "zstd_compress_internal.h" +#include "matchfinder.h" + +#define HSIZE 1024 +static U32 const HLOG = 10; +static U32 const MLS = 4; +static U32 const BADIDX = (1 << 31); + +size_t simpleExternalMatchFinder( + void* externalMatchState, + ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, + const void* src, size_t 
srcSize, + const void* dict, size_t dictSize, + int compressionLevel +) { + const BYTE* const istart = (const BYTE*)src; + const BYTE* const iend = istart + srcSize; + const BYTE* ip = istart; + const BYTE* anchor = istart; + size_t seqCount = 0; + U32 hashTable[HSIZE]; + + (void)externalMatchState; + (void)dict; + (void)dictSize; + (void)outSeqsCapacity; + (void)compressionLevel; + + { int i; + for (i=0; i < HSIZE; i++) { + hashTable[i] = BADIDX; + } } + + while (ip + 4 < iend) { + size_t const hash = ZSTD_hashPtr(ip, HLOG, MLS); + U32 const matchIndex = hashTable[hash]; + hashTable[hash] = (U32)(ip - istart); + + if (matchIndex != BADIDX) { + const BYTE* const match = istart + matchIndex; + U32 const matchLen = (U32)ZSTD_count(ip, match, iend); + if (matchLen >= ZSTD_MINMATCH_MIN) { + U32 const litLen = (U32)(ip - anchor); + U32 const offset = (U32)(ip - match); + ZSTD_Sequence const seq = { + offset, litLen, matchLen, 0 + }; + outSeqs[seqCount++] = seq; + ip += matchLen; + anchor = ip; + continue; + } + } + + ip++; + } + + { ZSTD_Sequence const finalSeq = { + 0, (U32)(iend - anchor), 0, 0 + }; + outSeqs[seqCount++] = finalSeq; + } + + return seqCount; +} diff --git a/contrib/externalMatchfinder/matchfinder.h b/contrib/externalMatchfinder/matchfinder.h new file mode 100644 index 00000000000..b89a1a4a4e0 --- /dev/null +++ b/contrib/externalMatchfinder/matchfinder.h @@ -0,0 +1,15 @@ +#ifndef MATCHFINDER_H +#define MATCHFINDER_H + +#define ZSTD_STATIC_LINKING_ONLY +#include "zstd.h" + +size_t simpleExternalMatchFinder( + void* externalMatchState, + ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, + int compressionLevel +); + +#endif diff --git a/lib/common/error_private.c b/lib/common/error_private.c index 1b67500f3bb..e94646a1224 100644 --- a/lib/common/error_private.c +++ b/lib/common/error_private.c @@ -30,6 +30,7 @@ const char* ERR_getErrorString(ERR_enum code) case 
PREFIX(corruption_detected): return "Data corruption detected"; case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; case PREFIX(parameter_unsupported): return "Unsupported parameter"; + case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters"; case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; case PREFIX(init_missing): return "Context should be init first"; case PREFIX(memory_allocation): return "Allocation error : not enough memory"; @@ -50,6 +51,7 @@ const char* ERR_getErrorString(ERR_enum code) case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; case PREFIX(srcBuffer_wrong): return "Source buffer is wrong"; + case PREFIX(externalMatchFinder_failed): return "External matchfinder returned an error code"; case PREFIX(maxCode): default: return notErrorCode; } diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index adf1f6e7afc..29819add542 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -581,6 +581,11 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) bounds.upperBound = (int)ZSTD_ps_disable; return bounds; + case ZSTD_c_enableMatchFinderFallback: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + default: bounds.error = ERROR(parameter_unsupported); return bounds; @@ -646,6 +651,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) case ZSTD_c_useRowMatchFinder: case ZSTD_c_deterministicRefPrefix: case ZSTD_c_prefetchCDictTables: + case ZSTD_c_enableMatchFinderFallback: default: return 0; } @@ -702,6 +708,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) case ZSTD_c_useRowMatchFinder: case ZSTD_c_deterministicRefPrefix: case ZSTD_c_prefetchCDictTables: + case ZSTD_c_enableMatchFinderFallback: break; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); @@ 
-933,6 +940,11 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value; return CCtxParams->prefetchCDictTables; + case ZSTD_c_enableMatchFinderFallback: + BOUNDCHECK(ZSTD_c_enableMatchFinderFallback, value); + CCtxParams->enableMatchFinderFallback = value; + return CCtxParams->enableMatchFinderFallback; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } } @@ -1068,6 +1080,9 @@ size_t ZSTD_CCtxParams_getParameter( case ZSTD_c_prefetchCDictTables: *value = (int)CCtxParams->prefetchCDictTables; break; + case ZSTD_c_enableMatchFinderFallback: + *value = CCtxParams->enableMatchFinderFallback; + break; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } return 0; @@ -1239,6 +1254,7 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, "Can't reset parameters only when not in init stage."); ZSTD_clearAllDicts(cctx); + ZSTD_memset(&cctx->externalMatchCtx, 0, sizeof(cctx->externalMatchCtx)); return ZSTD_CCtxParams_reset(&cctx->requestedParams); } return 0; @@ -1488,11 +1504,12 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( const ZSTD_paramSwitch_e useRowMatchFinder, const size_t buffInSize, const size_t buffOutSize, - const U64 pledgedSrcSize) + const U64 pledgedSrcSize, + int useExternalMatchFinder) { size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); - U32 const divider = (cParams->minMatch==3) ? 3 : 4; + U32 const divider = (cParams->minMatch==3 || useExternalMatchFinder) ? 
3 : 4; size_t const maxNbSeq = blockSize / divider; size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) @@ -1512,6 +1529,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + size_t const maxNbExternalSeq = ZSTD_sequenceBound(ZSTD_BLOCKSIZE_MAX); + size_t const externalSeqSpace = useExternalMatchFinder + ? ZSTD_cwksp_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) + : 0; + size_t const neededSpace = cctxSpace + entropySpace + @@ -1520,7 +1542,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( ldmSeqSpace + matchStateSize + tokenSpace + - bufferSpace; + bufferSpace + + externalSeqSpace; DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); return neededSpace; @@ -1538,7 +1561,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) * be needed. However, we still allocate two 0-sized buffers, which can * take space under ASAN. 
*/ return ZSTD_estimateCCtxSize_usingCCtxParams_internal( - &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); + &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, params->useExternalMatchFinder); } size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) @@ -1599,7 +1622,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) return ZSTD_estimateCCtxSize_usingCCtxParams_internal( &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, - ZSTD_CONTENTSIZE_UNKNOWN); + ZSTD_CONTENTSIZE_UNKNOWN, params->useExternalMatchFinder); } } @@ -1882,7 +1905,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); - U32 const divider = (params->cParams.minMatch==3) ? 3 : 4; + U32 const divider = (params->cParams.minMatch==3 || params->useExternalMatchFinder) ? 3 : 4; size_t const maxNbSeq = blockSize / divider; size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) ? 
ZSTD_compressBound(blockSize) + 1 @@ -1900,7 +1923,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, size_t const neededSpace = ZSTD_estimateCCtxSize_usingCCtxParams_internal( &params->cParams, &params->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, - buffInSize, buffOutSize, pledgedSrcSize); + buffInSize, buffOutSize, pledgedSrcSize, params->useExternalMatchFinder); int resizeWorkspace; FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); @@ -2013,6 +2036,14 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, zc->ldmState.loadedDictEnd = 0; } + /* reserve space for block-level external sequences */ + if (params->useExternalMatchFinder) { + size_t const maxNbExternalSeq = ZSTD_sequenceBound(ZSTD_BLOCKSIZE_MAX); + zc->externalMatchCtx.seqBufferCapacity = maxNbExternalSeq; + zc->externalMatchCtx.seqBuffer = + (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); + } + DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); @@ -2856,6 +2887,49 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr) ssPtr->longLengthType = ZSTD_llt_none; } +/* ZSTD_postProcessExternalMatchFinderResult() : + * Validates and post-processes sequences obtained through the external matchfinder API. + * Returns the number of sequences after post-processing, or an error code. */ +static size_t ZSTD_postProcessExternalMatchFinderResult( + ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, int emptySrc +) { + RETURN_ERROR_IF( + nbExternalSeqs > outSeqsCapacity, + externalMatchFinder_failed, + "External matchfinder returned error code %lu", + (unsigned long)nbExternalSeqs + ); + + RETURN_ERROR_IF( + nbExternalSeqs == 0 && !emptySrc, + externalMatchFinder_failed, + "External matchfinder produced zero sequences for a non-empty src buffer!" 
+ ); + + if (emptySrc) { + ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence)); + return 1; + } else { + ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1]; + + /* Check if lastSeq is a block delimiter, append one if not */ + if (lastSeq.offset == 0 && lastSeq.matchLength == 0) { + return nbExternalSeqs; + } else { + /* This error condition is only possible if the external matchfinder + * produced an invalid parse, by definition of ZSTD_sequenceBound(). */ + RETURN_ERROR_IF( + nbExternalSeqs == outSeqsCapacity, + externalMatchFinder_failed, + "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!" + ); + + ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence)); + return nbExternalSeqs + 1; + } + } +} + typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) @@ -2903,6 +2977,24 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) } if (zc->externSeqStore.pos < zc->externSeqStore.size) { assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); + + /* External matchfinder + LDM is technically possible, just not implemented yet. + * We need to revisit soon and implement it. */ + if (zc->appliedParams.useBlockSplitter == ZSTD_ps_enable) { + RETURN_ERROR_IF( + zc->appliedParams.useExternalMatchFinder, + parameter_combination_unsupported, + "Block splitting with external matchfinder enabled is not currently supported. " + "Note: block splitting is enabled by default at high compression levels." + ); + } else { + RETURN_ERROR_IF( + zc->appliedParams.useExternalMatchFinder, + parameter_combination_unsupported, + "Long-distance matching with external matchfinder enabled is not currently supported." 
+ ); + } + /* Updates ldmSeqStore.pos */ lastLLSize = ZSTD_ldm_blockCompress(&zc->externSeqStore, @@ -2914,6 +3006,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { rawSeqStore_t ldmSeqStore = kNullRawSeqStore; + /* External matchfinder + LDM is technically possible, just not implemented yet. + * We need to revisit soon and implement it. */ + RETURN_ERROR_IF( + zc->appliedParams.useExternalMatchFinder, + parameter_combination_unsupported, + "Long-distance matching with external matchfinder enabled is not currently supported." + ); + ldmSeqStore.seq = zc->ldmSequences; ldmSeqStore.capacity = zc->maxNbLdmSequences; /* Updates ldmSeqStore.size */ @@ -2928,10 +3028,57 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) zc->appliedParams.useRowMatchFinder, src, srcSize); assert(ldmSeqStore.pos == ldmSeqStore.size); - } else { /* not long range mode */ + } else if (zc->appliedParams.useExternalMatchFinder) { + assert( + zc->externalMatchCtx.seqBufferCapacity >= ZSTD_sequenceBound(srcSize) + ); + assert(zc->externalMatchCtx.mFinder != NULL); + + { size_t const nbExternalSeqs = (zc->externalMatchCtx.mFinder)( + zc->externalMatchCtx.mState, + zc->externalMatchCtx.seqBuffer, + zc->externalMatchCtx.seqBufferCapacity, + src, srcSize, + NULL, 0, /* dict and dictSize, currently not supported */ + zc->appliedParams.compressionLevel + ); + + size_t const nbPostProcessedSeqs = ZSTD_postProcessExternalMatchFinderResult( + zc->externalMatchCtx.seqBuffer, + nbExternalSeqs, + zc->externalMatchCtx.seqBufferCapacity, + srcSize == 0 /* emptySrc */ + ); + + if (!ZSTD_isError(nbPostProcessedSeqs)) { + ZSTD_sequencePosition seqPos = {0,0,0}; + ZSTD_copySequencesToSeqStoreExplicitBlockDelim( + zc, &seqPos, zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs, src, srcSize + ); + ms->ldmSeqStore = NULL; + lastLLSize = 0; + DEBUGLOG(5, "Copied %lu sequences 
from external matchfinder to internal seqStore.", (unsigned long)nbExternalSeqs); + } else { + if (zc->appliedParams.enableMatchFinderFallback) { + ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, + zc->appliedParams.useRowMatchFinder, + dictMode); + ms->ldmSeqStore = NULL; + DEBUGLOG( + 5, + "External matchfinder returned error code %lu. Falling back to internal matchfinder.", + (unsigned long)nbExternalSeqs + ); + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); + } else { + return nbPostProcessedSeqs; /* return an error */ + } + } } + } else { /* not long range mode and no external matchfinder */ ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, zc->appliedParams.useRowMatchFinder, dictMode); + assert(zc->externalMatchCtx.mFinder == NULL); ms->ldmSeqStore = NULL; lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); } @@ -3074,12 +3221,18 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { * This is just a heuristic based on the compressibility. * It may return both false positives and false negatives. */ -static int ZSTD_maybeRLE(seqStore_t const* seqStore) +static int ZSTD_maybeRLE(seqStore_t const* seqStore, ZSTD_CCtx_params const* appliedParams) { size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); size_t const nbLits = (size_t)(seqStore->lit - seqStore->litStart); - return nbSeqs < 4 && nbLits < 10; + if (appliedParams->useExternalMatchFinder) { + /* We shouldn't make any assumptions about how an external matchfinder + * will compress an RLE block. 
*/ + return 1; + } else { + return nbSeqs < 4 && nbLits < 10; + } } static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) @@ -3895,7 +4048,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, * This is only an issue for zstd <= v1.4.3 */ !zc->isFirstBlock && - ZSTD_maybeRLE(&zc->seqStore) && + ZSTD_maybeRLE(&zc->seqStore, &zc->appliedParams) && ZSTD_isRLE((BYTE const*)src, srcSize)) { return ZSTD_rleCompressBlock(dst, dstCapacity, *(BYTE const*)src, srcSize, lastBlock); @@ -5882,12 +6035,6 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, } } -typedef struct { - U32 idx; /* Index in array of ZSTD_Sequence */ - U32 posInSequence; /* Position within sequence at idx */ - size_t posInSrc; /* Number of bytes given by sequences provided so far */ -} ZSTD_sequencePosition; - /* ZSTD_validateSequence() : * @offCode : is presumed to follow format required by ZSTD_storeSeq() * @returns a ZSTD error code if sequence is not valid @@ -5925,10 +6072,7 @@ static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 return offBase; } -/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of - * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. - */ -static size_t +size_t ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, @@ -5982,19 +6126,7 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, return 0; } -/* Returns the number of bytes to move the current read position back by. - * Only non-zero if we ended up splitting a sequence. - * Otherwise, it may return a ZSTD error if something went wrong. - * - * This function will attempt to scan through blockSize bytes - * represented by the sequences in @inSeqs, - * storing any (partial) sequences. 
- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to - * avoid splitting a match, or to avoid splitting a match such that it would produce a match - * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. - */ -static size_t +size_t ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, const void* src, size_t blockSize) @@ -6254,7 +6386,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); if (!cctx->isFirstBlock && - ZSTD_maybeRLE(&cctx->seqStore) && + ZSTD_maybeRLE(&cctx->seqStore, &cctx->appliedParams) && ZSTD_isRLE((BYTE const*)src, srcSize)) { /* We don't want to emit our first block as a RLE even if it qualifies because * doing so will cause the decoder (cli only) to throw a "should consume all input error." @@ -6527,3 +6659,19 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); } + +void ZSTD_refExternalMatchFinder( + ZSTD_CCtx* zc, void* mState, + ZSTD_externalMatchFinder_F* mFinder +) { + ZSTD_externalMatchCtx emctx = { + mState, + mFinder, + + /* seqBuffer is allocated later (from the cwksp) */ + NULL, /* seqBuffer */ + 0 /* seqBufferCapacity */ + }; + zc->externalMatchCtx = emctx; + zc->requestedParams.useExternalMatchFinder = 1; +} diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index baa726f7dff..a5aba62cdba 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -149,6 +149,12 @@ typedef struct { size_t capacity; /* The capacity starting from `seq` pointer */ } rawSeqStore_t; +typedef struct { + U32 idx; /* Index in array of ZSTD_Sequence 
*/ + U32 posInSequence; /* Position within sequence at idx */ + size_t posInSrc; /* Number of bytes given by sequences provided so far */ +} ZSTD_sequencePosition; + UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; typedef struct { @@ -339,6 +345,15 @@ struct ZSTD_CCtx_params_s { /* Controls prefetching in some dictMatchState matchfinders */ ZSTD_paramSwitch_e prefetchCDictTables; + + /* Controls whether zstd will fall back to an internal matchfinder + * if the external matchfinder returns an error code. */ + int enableMatchFinderFallback; + + /* Indicates whether an external matchfinder has been referenced. + * Users can't set this externally. + * It is set internally in ZSTD_refExternalMatchFinder(). */ + int useExternalMatchFinder; }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) @@ -370,6 +385,14 @@ typedef struct { ZSTD_entropyCTablesMetadata_t entropyMetadata; } ZSTD_blockSplitCtx; +/* Context for block-level external matchfinder API */ +typedef struct { + void* mState; + ZSTD_externalMatchFinder_F* mFinder; + ZSTD_Sequence* seqBuffer; + size_t seqBufferCapacity; +} ZSTD_externalMatchCtx; + struct ZSTD_CCtx_s { ZSTD_compressionStage_e stage; int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */ @@ -439,6 +462,9 @@ struct ZSTD_CCtx_s { /* Workspace for block splitter */ ZSTD_blockSplitCtx blockSplitCtx; + + /* Workspace for external matchfinder */ + ZSTD_externalMatchCtx externalMatchCtx; }; typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; @@ -1410,4 +1436,30 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); */ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); +/* Returns 0 on success, and a ZSTD_error otherwise. 
This function scans through an array of + * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. + */ +size_t +ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, + const void* src, size_t blockSize); + +/* Returns the number of bytes to move the current read position back by. + * Only non-zero if we ended up splitting a sequence. + * Otherwise, it may return a ZSTD error if something went wrong. + * + * This function will attempt to scan through blockSize bytes + * represented by the sequences in @inSeqs, + * storing any (partial) sequences. + * + * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to + * avoid splitting a match, or to avoid splitting a match such that it would produce a match + * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. + */ +size_t +ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, + const void* src, size_t blockSize); + #endif /* ZSTD_COMPRESS_H */ diff --git a/lib/zstd.h b/lib/zstd.h index 1867efc93ef..bed40cbc35b 100644 --- a/lib/zstd.h +++ b/lib/zstd.h @@ -446,6 +446,7 @@ typedef enum { * ZSTD_c_useBlockSplitter * ZSTD_c_useRowMatchFinder * ZSTD_c_prefetchCDictTables + * ZSTD_c_enableMatchFinderFallback * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. * note : never ever use experimentalParam? names directly; * also, the enums values themselves are unstable and can still change. 
@@ -465,7 +466,8 @@ typedef enum { ZSTD_c_experimentalParam13=1010, ZSTD_c_experimentalParam14=1011, ZSTD_c_experimentalParam15=1012, - ZSTD_c_experimentalParam16=1013 + ZSTD_c_experimentalParam16=1013, + ZSTD_c_experimentalParam17=1014 } ZSTD_cParameter; typedef struct { @@ -528,7 +530,7 @@ typedef enum { * They will be used to compress next frame. * Resetting session never fails. * - The parameters : changes all parameters back to "default". - * This removes any reference to any dictionary too. + * This also removes any reference to any dictionary or external matchfinder. * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) * - Both : similar to resetting the session, followed by resetting parameters. @@ -1474,6 +1476,31 @@ ZSTD_compressSequences( ZSTD_CCtx* cctx, void* dst, size_t dstSize, const ZSTD_Sequence* inSeqs, size_t inSeqsSize, const void* src, size_t srcSize); +/* Block-level sequence compression API */ +/* @nocommit document */ + +#define ZSTD_EXTERNAL_MATCHFINDER_ERROR ((size_t)(-1)) + +/* @nocommit document these constraints: + * - outSeqsCapacity >= (blockSize / MINMATCH) + 1 + * - srcSize <= 128 KB + * - dictSize is not bounded */ +typedef size_t ZSTD_externalMatchFinder_F ( + void* externalMatchState, + ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, + int compressionLevel +); + +ZSTDLIB_STATIC_API void +ZSTD_refExternalMatchFinder( + ZSTD_CCtx* cctx, + void* externalMatchState, + ZSTD_externalMatchFinder_F* externalMatchFinder +); + +/****************************************/ /*! ZSTD_writeSkippableFrame() : * Generates a zstd skippable frame containing data given by src, and writes it to dst buffer. 
@@ -2010,6 +2037,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
  */
 #define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16
 
+/* Experimental. 0 : a failing external matchfinder makes compression return ZSTD_error_externalMatchFinder_failed. 1 : compression is expected to succeed anyway (NOTE(review): presumably by falling back to an internal matchfinder - confirm). */
+#define ZSTD_c_enableMatchFinderFallback ZSTD_c_experimentalParam17
+
 /*! ZSTD_CCtx_getParameter() :
  *  Get the requested compression parameter value, selected by enum ZSTD_cParameter,
  *  and store it into int* value.
diff --git a/lib/zstd_errors.h b/lib/zstd_errors.h
index 2ec0b0ab168..21f8afb286f 100644
--- a/lib/zstd_errors.h
+++ b/lib/zstd_errors.h
@@ -62,6 +62,7 @@ typedef enum {
   ZSTD_error_dictionary_wrong = 32,
   ZSTD_error_dictionaryCreation_failed = 34,
   ZSTD_error_parameter_unsupported = 40,
+  ZSTD_error_parameter_combination_unsupported = 41,
   ZSTD_error_parameter_outOfBound = 42,
   ZSTD_error_tableLog_tooLarge = 44,
   ZSTD_error_maxSymbolValue_tooLarge = 46,
@@ -79,6 +80,7 @@ typedef enum {
   ZSTD_error_seekableIO          = 102,
   ZSTD_error_dstBuffer_wrong     = 104,
   ZSTD_error_srcBuffer_wrong     = 105,
+  ZSTD_error_externalMatchFinder_failed = 106,
   ZSTD_error_maxCode = 120  /* never EVER use this value directly, it can change in future versions!
Use ZSTD_isError() instead */
 } ZSTD_ErrorCode;
 
diff --git a/tests/Makefile b/tests/Makefile
index afea6475afb..44e0f6a7043 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -169,7 +169,7 @@ fuzzer-dll : $(ZSTDDIR)/common/xxhash.c $(PRGDIR)/util.c $(PRGDIR)/timefn.c $(PR
 	$(CC) $(CPPFLAGS) $(CFLAGS) $(filter %.c,$^) $(LDFLAGS) -o $@$(EXT)
 
 CLEAN += zstreamtest zstreamtest32
-ZSTREAM_LOCAL_FILES := $(PRGDIR)/datagen.c $(PRGDIR)/util.c $(PRGDIR)/timefn.c seqgen.c zstreamtest.c
+ZSTREAM_LOCAL_FILES := $(PRGDIR)/datagen.c $(PRGDIR)/util.c $(PRGDIR)/timefn.c seqgen.c zstreamtest.c external_matchfinder.c
 ZSTREAM_PROPER_FILES := $(ZDICT_FILES) $(ZSTREAM_LOCAL_FILES)
 ZSTREAMFILES := $(ZSTD_FILES) $(ZSTREAM_PROPER_FILES)
 zstreamtest32 : CFLAGS += -m32
diff --git a/tests/external_matchfinder.c b/tests/external_matchfinder.c
new file mode 100644
index 00000000000..6655ed5c038
--- /dev/null
+++ b/tests/external_matchfinder.c
@@ -0,0 +1,116 @@
+#include "external_matchfinder.h"
+#include <string.h>   /* memset */
+#include "zstd_compress_internal.h"
+#include <stddef.h>   /* size_t */
+
+#define HSIZE 1024
+static U32 const HLOG = 10;
+static U32 const MLS = 4;
+static U32 const BADIDX = (1U << 31);   /* empty-slot marker; 1U keeps the shift out of the sign bit */
+
+/* Minimal hash-table matchfinder backing the EMF_LOTS_OF_SEQS test case.
+ * Emits one sequence per match of at least ZSTD_MINMATCH_MIN bytes, followed
+ * by a final literals-only sequence (offset 0, matchLength 0) for the tail.
+ * Never writes more than outSeqsCapacity sequences.
+ * @return : number of sequences written,
+ *           or ZSTD_EXTERNAL_MATCHFINDER_ERROR if outSeqsCapacity is 0. */
+static size_t simpleExternalMatchFinder(
+  void* externalMatchState,
+  ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
+  const void* src, size_t srcSize,
+  const void* dict, size_t dictSize,
+  int compressionLevel
+) {
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    size_t seqCount = 0;
+    U32 hashTable[HSIZE];
+
+    (void)externalMatchState;
+    (void)dict;
+    (void)dictSize;
+    (void)compressionLevel;
+
+    /* The final literals-only sequence always needs one slot */
+    if (outSeqsCapacity == 0) return ZSTD_EXTERNAL_MATCHFINDER_ERROR;
+
+    {   int i;
+        for (i=0; i < HSIZE; i++) {
+            hashTable[i] = BADIDX;
+    }   }
+
+    /* Test the remaining length instead of forming ip + 4, which is an
+     * out-of-bounds pointer when fewer than 4 bytes are left.
+     * Stop early enough to keep one slot free for the final sequence. */
+    while ((size_t)(iend - ip) > 4 && seqCount + 1 < outSeqsCapacity) {
+        size_t const hash = ZSTD_hashPtr(ip, HLOG, MLS);
+        U32 const matchIndex = hashTable[hash];
+        hashTable[hash] = (U32)(ip - istart);
+
+        if (matchIndex != BADIDX) {
+            const BYTE* const match = istart + matchIndex;
+            U32 const matchLen = (U32)ZSTD_count(ip, match, iend);
+            if (matchLen >= ZSTD_MINMATCH_MIN) {
+                U32 const litLen = (U32)(ip - anchor);
+                U32 const offset = (U32)(ip - match);
+                ZSTD_Sequence const seq = {
+                    offset, litLen, matchLen, 0
+                };
+                outSeqs[seqCount++] = seq;
+                ip += matchLen;
+                anchor = ip;
+                continue;
+            }
+        }
+
+        ip++;
+    }
+
+    {   ZSTD_Sequence const finalSeq = {
+            0, (U32)(iend - anchor), 0, 0
+        };
+        outSeqs[seqCount++] = finalSeq;
+    }
+
+    return seqCount;
+}
+
+/* Test matchfinder whose behaviour is selected by the EMF_testCase that
+ * externalMatchState points to (see the switch below). */
+size_t zstreamExternalMatchFinder(
+  void* externalMatchState,
+  ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
+  const void* src, size_t srcSize,
+  const void* dict, size_t dictSize,
+  int compressionLevel
+) {
+    EMF_testCase const testCase = *((EMF_testCase*)externalMatchState);
+    /* outSeqsCapacity counts sequences, not bytes */
+    memset(outSeqs, 0, outSeqsCapacity * sizeof(*outSeqs));
+
+    switch (testCase) {
+        case EMF_ZERO_SEQS:
+            return 0;
+        case EMF_ONE_BIG_SEQ:
+            if (outSeqsCapacity == 0) return ZSTD_EXTERNAL_MATCHFINDER_ERROR;
+            outSeqs[0].offset = 0;
+            outSeqs[0].matchLength = 0;
+            outSeqs[0].litLength = (U32)(srcSize);
+            return 1;
+        case EMF_LOTS_OF_SEQS:
+            return simpleExternalMatchFinder(
+                externalMatchState,
+                outSeqs, outSeqsCapacity,
+                src, srcSize,
+                dict, dictSize,
+                compressionLevel
+            );
+        case EMF_SMALL_ERROR:
+            return outSeqsCapacity + 1;
+        case EMF_BIG_ERROR:
+        default:
+            return ZSTD_EXTERNAL_MATCHFINDER_ERROR;
+    }
+}
diff --git a/tests/external_matchfinder.h b/tests/external_matchfinder.h
new file mode 100644
index 00000000000..279aa7ab53a
--- /dev/null
+++ b/tests/external_matchfinder.h
@@ -0,0 +1,26 @@
+#ifndef EXTERNAL_MATCHFINDER
+#define EXTERNAL_MATCHFINDER
+
+#define ZSTD_STATIC_LINKING_ONLY
+#include "zstd.h"
+
+/* Behaviours selectable through the externalMatchState pointer
+ * passed to zstreamExternalMatchFinder() */
+typedef enum {
+    EMF_ZERO_SEQS = 0,      /* write nothing, return 0 */
+    EMF_ONE_BIG_SEQ = 1,    /* one literals-only sequence covering all of src */
+    EMF_LOTS_OF_SEQS = 2,   /* run a simple hash-based matchfinder */
+    EMF_BIG_ERROR = 3,      /* return ZSTD_EXTERNAL_MATCHFINDER_ERROR */
+    EMF_SMALL_ERROR = 4     /* return outSeqsCapacity + 1 */
+} EMF_testCase;
+
+/* Test matchfinder: reads an EMF_testCase through externalMatchState
+ * and behaves as described on that enum value */
+size_t zstreamExternalMatchFinder(
+  void* externalMatchState, ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
+  const void* src, size_t srcSize,
+  const void* dict, size_t dictSize,
+  int compressionLevel
+);
+
+#endif /* EXTERNAL_MATCHFINDER */
diff --git a/tests/zstreamtest.c b/tests/zstreamtest.c
index ce9020f128c..3cf791f6ca2 100644
--- a/tests/zstreamtest.c
+++ b/tests/zstreamtest.c
@@ -39,7 +39,7 @@
 #include "seqgen.h"
 #include "util.h"
 #include "timefn.h"       /* UTIL_time_t, UTIL_clockSpanMicro, UTIL_getTime */
-
+#include "external_matchfinder.h"   /* zstreamExternalMatchFinder, EMF_testCase */
 
 /*-************************************
  *  Constants
@@ -1777,6 +1777,95 @@ static int basicUnitTests(U32 seed, double compressibility)
     }
     DISPLAYLEVEL(3, "OK \n");
 
+    DISPLAYLEVEL(3, "test%3i : External matchfinder API: ", testNb++);
+    {
+        size_t const dstBufSize = ZSTD_compressBound(CNBufferSize);
+        BYTE* dstBuf = (BYTE*)malloc(dstBufSize);
+        size_t const checkBufSize = CNBufferSize;
+        BYTE* checkBuf = (BYTE*)malloc(checkBufSize);
+        int enableFallback;
+        size_t res;
+        EMF_testCase externalMatchState;
+
+        CHECK(dstBuf == NULL || checkBuf == NULL, "EMF: allocation failure");
+
+        ZSTD_CCtx_reset(zc, ZSTD_reset_session_and_parameters);
+
+        /* Reference external matchfinder outside the test loop to
+         * check that the reference is preserved across compressions */
+        ZSTD_refExternalMatchFinder(
+            zc,
+            &externalMatchState,
+            zstreamExternalMatchFinder
+        );
+
+        /* Run every scenario with the fallback disabled (0), then enabled (1) */
+        for (enableFallback = 0; enableFallback <= 1; enableFallback++) {
+            size_t testCaseId;
+
+            EMF_testCase const EMF_successCases[] = {
+                EMF_ONE_BIG_SEQ,
+                EMF_LOTS_OF_SEQS,
+            };
+            size_t const EMF_numSuccessCases = sizeof(EMF_successCases) / sizeof(EMF_successCases[0]);
+
+            EMF_testCase const EMF_failureCases[] = {
+                EMF_ZERO_SEQS,
+                EMF_BIG_ERROR,
+                EMF_SMALL_ERROR,
+            };
+            size_t const EMF_numFailureCases = sizeof(EMF_failureCases) / sizeof(EMF_failureCases[0]);
+
+            /* Test external matchfinder success scenarios */
+            for (testCaseId = 0; testCaseId < EMF_numSuccessCases; testCaseId++) {
+                externalMatchState = EMF_successCases[testCaseId];
+                ZSTD_CCtx_reset(zc, ZSTD_reset_session_only);
+                CHECK_Z(ZSTD_CCtx_setParameter(zc, ZSTD_c_enableMatchFinderFallback, enableFallback));
+                res = ZSTD_compress2(zc, dstBuf, dstBufSize, CNBuffer, CNBufferSize);
+                CHECK(ZSTD_isError(res), "EMF: Compression error: %s", ZSTD_getErrorName(res));
+                CHECK_Z(ZSTD_decompress(checkBuf, checkBufSize, dstBuf, res));
+                CHECK(memcmp(CNBuffer, checkBuf, CNBufferSize) != 0, "EMF: Corruption!");
+            }
+
+            /* Test external matchfinder failure scenarios */
+            for (testCaseId = 0; testCaseId < EMF_numFailureCases; testCaseId++) {
+                externalMatchState = EMF_failureCases[testCaseId];
+                ZSTD_CCtx_reset(zc, ZSTD_reset_session_only);
+                CHECK_Z(ZSTD_CCtx_setParameter(zc, ZSTD_c_enableMatchFinderFallback, enableFallback));
+                res = ZSTD_compress2(zc, dstBuf, dstBufSize, CNBuffer, CNBufferSize);
+                if (enableFallback) {
+                    CHECK(ZSTD_isError(res), "EMF: Compression error: %s", ZSTD_getErrorName(res));
+                    CHECK_Z(ZSTD_decompress(checkBuf, checkBufSize, dstBuf, res));
+                    CHECK(memcmp(CNBuffer, checkBuf, CNBufferSize) != 0, "EMF: Corruption!");
+                } else {
+                    CHECK(!ZSTD_isError(res), "EMF: Should have raised an error!");
+                    CHECK(
+                        ZSTD_getErrorCode(res) != ZSTD_error_externalMatchFinder_failed,
+                        "EMF: Wrong error code: %s", ZSTD_getErrorName(res)
+                    );
+                }
+            }
+
+            /* Test compression with external matchfinder + empty src buffer */
+            externalMatchState = EMF_ZERO_SEQS;
+            ZSTD_CCtx_reset(zc, ZSTD_reset_session_only);
+            CHECK_Z(ZSTD_CCtx_setParameter(zc, ZSTD_c_enableMatchFinderFallback, enableFallback));
+            res = ZSTD_compress2(zc, dstBuf, dstBufSize, CNBuffer, 0);
+            CHECK(ZSTD_isError(res), "EMF: Compression error: %s", ZSTD_getErrorName(res));
+            CHECK(ZSTD_decompress(checkBuf, checkBufSize, dstBuf, res) != 0, "EMF: Empty src round trip failed!");
+        }
+
+        /* Test that reset clears the external matchfinder */
+        ZSTD_CCtx_reset(zc, ZSTD_reset_session_and_parameters);
+        externalMatchState = EMF_BIG_ERROR; /* ensure zstd will fail if the matchfinder wasn't cleared */
+        CHECK_Z(ZSTD_CCtx_setParameter(zc, ZSTD_c_enableMatchFinderFallback, 0));
+        CHECK_Z(ZSTD_compress2(zc, dstBuf, dstBufSize, CNBuffer, CNBufferSize));
+
+        free(dstBuf);
+        free(checkBuf);
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
 _end:
     FUZ_freeDictionary(dictionary);
     ZSTD_freeCStream(zc);