contrib/zstd/lib/compress/zstdmt_compress.c in extzstd-0.3.2 vs contrib/zstd/lib/compress/zstdmt_compress.c in extzstd-0.3.3

Legend: lines beginning with "-" are from the old file (extzstd-0.3.2); lines beginning with "+" are from the new file (extzstd-0.3.3); other lines are unchanged context.

@@ -1,7 +1,7 @@
 /*
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
  * in the COPYING file in the root directory of this source tree).
@@ -18,10 +18,11 @@
 /* ====== Constants ====== */
 #define ZSTDMT_OVERLAPLOG_DEFAULT 0
 
 
 /* ====== Dependencies ====== */
+#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */
 #include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memset, INT_MAX, UINT_MAX */
 #include "../common/mem.h"         /* MEM_STATIC */
 #include "../common/pool.h"        /* threadpool */
 #include "../common/threading.h"   /* mutex */
 #include "zstd_compress_internal.h"  /* MIN, ERROR, ZSTD_*, ZSTD_highbit32 */
@@ -100,13 +101,12 @@
     unsigned nbBuffers;
     ZSTD_customMem cMem;
     buffer_t bTable[1];   /* variable size */
 } ZSTDMT_bufferPool;
 
-static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned nbWorkers, ZSTD_customMem cMem)
+static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned maxNbBuffers, ZSTD_customMem cMem)
 {
-    unsigned const maxNbBuffers = 2*nbWorkers + 3;
     ZSTDMT_bufferPool* const bufPool = (ZSTDMT_bufferPool*)ZSTD_customCalloc(
         sizeof(ZSTDMT_bufferPool) + (maxNbBuffers-1) * sizeof(buffer_t), cMem);
     if (bufPool==NULL) return NULL;
     if (ZSTD_pthread_mutex_init(&bufPool->poolMutex, NULL)) {
         ZSTD_customFree(bufPool, cMem);
@@ -158,22 +158,21 @@
         bufPool->bufferSize = bSize;
     ZSTD_pthread_mutex_unlock(&bufPool->poolMutex);
 }
 
-static ZSTDMT_bufferPool* ZSTDMT_expandBufferPool(ZSTDMT_bufferPool* srcBufPool, U32 nbWorkers)
+static ZSTDMT_bufferPool* ZSTDMT_expandBufferPool(ZSTDMT_bufferPool* srcBufPool, unsigned maxNbBuffers)
 {
-    unsigned const maxNbBuffers = 2*nbWorkers + 3;
     if (srcBufPool==NULL) return NULL;
     if (srcBufPool->totalBuffers >= maxNbBuffers) /* good enough */
         return srcBufPool;
     /* need a larger buffer pool */
     {   ZSTD_customMem const cMem = srcBufPool->cMem;
         size_t const bSize = srcBufPool->bufferSize;   /* forward parameters */
         ZSTDMT_bufferPool* newBufPool;
         ZSTDMT_freeBufferPool(srcBufPool);
-        newBufPool = ZSTDMT_createBufferPool(nbWorkers, cMem);
+        newBufPool = ZSTDMT_createBufferPool(maxNbBuffers, cMem);
         if (newBufPool==NULL) return newBufPool;
         ZSTDMT_setBufferSize(newBufPool, bSize);
         return newBufPool;
     }
 }
@@ -261,11 +260,21 @@
         /* Reached bufferPool capacity (should not happen) */
         DEBUGLOG(5, "ZSTDMT_releaseBuffer: pool capacity reached => freeing ");
         ZSTD_customFree(buf.start, bufPool->cMem);
 }
 
+/* We need 2 output buffers per worker since each dstBuff must be flushed after it is released.
+ * The 3 additional buffers are as follows:
+ *   1 buffer for input loading
+ *   1 buffer for "next input" when submitting current one
+ *   1 buffer stuck in queue */
+#define BUF_POOL_MAX_NB_BUFFERS(nbWorkers) (2*(nbWorkers) + 3)
+/* After a worker releases its rawSeqStore, it is immediately ready for reuse.
+ * So we only need one seq buffer per worker. */
+#define SEQ_POOL_MAX_NB_BUFFERS(nbWorkers) (nbWorkers)
+
 /* ===== Seq Pool Wrapper ====== */
 
 typedef ZSTDMT_bufferPool ZSTDMT_seqPool;
 
 static size_t ZSTDMT_sizeof_seqPool(ZSTDMT_seqPool* seqPool)
@@ -314,11 +323,11 @@
     ZSTDMT_setBufferSize(seqPool, nbSeq * sizeof(rawSeq));
 }
 
 static ZSTDMT_seqPool* ZSTDMT_createSeqPool(unsigned nbWorkers, ZSTD_customMem cMem)
 {
-    ZSTDMT_seqPool* const seqPool = ZSTDMT_createBufferPool(nbWorkers, cMem);
+    ZSTDMT_seqPool* const seqPool = ZSTDMT_createBufferPool(SEQ_POOL_MAX_NB_BUFFERS(nbWorkers), cMem);
     if (seqPool == NULL) return NULL;
     ZSTDMT_setNbSeq(seqPool, 0);
     return seqPool;
 }
@@ -327,11 +336,11 @@
     ZSTDMT_freeBufferPool(seqPool);
 }
 
 static ZSTDMT_seqPool* ZSTDMT_expandSeqPool(ZSTDMT_seqPool* pool, U32 nbWorkers)
 {
-    return ZSTDMT_expandBufferPool(pool, nbWorkers);
+    return ZSTDMT_expandBufferPool(pool, SEQ_POOL_MAX_NB_BUFFERS(nbWorkers));
 }
 
 
 /* ===== CCtx Pool ===== */
 /* a single CCtx Pool can be invoked from multiple threads in parallel */
@@ -465,33 +474,31 @@
                               size_t jobSize,
                               const void* dict, size_t const dictSize,
                               ZSTD_dictContentType_e dictContentType)
 {
     /* Adjust parameters */
-    if (params.ldmParams.enableLdm) {
+    if (params.ldmParams.enableLdm == ZSTD_ps_enable) {
         DEBUGLOG(4, "LDM window size = %u KB", (1U << params.cParams.windowLog) >> 10);
         ZSTD_ldm_adjustParameters(&params.ldmParams, &params.cParams);
         assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog);
         assert(params.ldmParams.hashRateLog < 32);
-        serialState->ldmState.hashPower =
-                ZSTD_rollingHash_primePower(params.ldmParams.minMatchLength);
     } else {
         ZSTD_memset(&params.ldmParams, 0, sizeof(params.ldmParams));
     }
     serialState->nextJobID = 0;
     if (params.fParams.checksumFlag)
         XXH64_reset(&serialState->xxhState, 0);
-    if (params.ldmParams.enableLdm) {
+    if (params.ldmParams.enableLdm == ZSTD_ps_enable) {
         ZSTD_customMem cMem = params.customMem;
         unsigned const hashLog = params.ldmParams.hashLog;
         size_t const hashSize = ((size_t)1 << hashLog) * sizeof(ldmEntry_t);
         unsigned const bucketLog =
             params.ldmParams.hashLog - params.ldmParams.bucketSizeLog;
-        size_t const bucketSize = (size_t)1 << bucketLog;
         unsigned const prevBucketLog =
             serialState->params.ldmParams.hashLog -
             serialState->params.ldmParams.bucketSizeLog;
+        size_t const numBuckets = (size_t)1 << bucketLog;
         /* Size the seq pool tables */
         ZSTDMT_setNbSeq(seqPool, ZSTD_ldm_getMaxNbSeq(params.ldmParams, jobSize));
         /* Reset the window */
         ZSTD_window_init(&serialState->ldmState.window);
         /* Resize tables and output space if necessary. */
@@ -499,24 +506,24 @@
             ZSTD_customFree(serialState->ldmState.hashTable, cMem);
             serialState->ldmState.hashTable = (ldmEntry_t*)ZSTD_customMalloc(hashSize, cMem);
         }
         if (serialState->ldmState.bucketOffsets == NULL || prevBucketLog < bucketLog) {
             ZSTD_customFree(serialState->ldmState.bucketOffsets, cMem);
-            serialState->ldmState.bucketOffsets = (BYTE*)ZSTD_customMalloc(bucketSize, cMem);
+            serialState->ldmState.bucketOffsets = (BYTE*)ZSTD_customMalloc(numBuckets, cMem);
         }
         if (!serialState->ldmState.hashTable || !serialState->ldmState.bucketOffsets)
             return 1;
         /* Zero the tables */
         ZSTD_memset(serialState->ldmState.hashTable, 0, hashSize);
-        ZSTD_memset(serialState->ldmState.bucketOffsets, 0, bucketSize);
+        ZSTD_memset(serialState->ldmState.bucketOffsets, 0, numBuckets);
         /* Update window state and fill hash table with dict */
         serialState->ldmState.loadedDictEnd = 0;
         if (dictSize > 0) {
             if (dictContentType == ZSTD_dct_rawContent) {
                 BYTE const* const dictEnd = (const BYTE*)dict + dictSize;
-                ZSTD_window_update(&serialState->ldmState.window, dict, dictSize);
+                ZSTD_window_update(&serialState->ldmState.window, dict, dictSize, /* forceNonContiguous */ 0);
                 ZSTD_ldm_fillHashTable(&serialState->ldmState, (const BYTE*)dict, dictEnd, &params.ldmParams);
                 serialState->ldmState.loadedDictEnd = params.forceWindow ? 0 : (U32)(dictEnd - serialState->ldmState.window.base);
             } else {
                 /* don't even load anything */
             }
@@ -564,16 +571,16 @@
             ZSTD_pthread_cond_wait(&serialState->cond, &serialState->mutex);
     }
 
     /* A future job may error and skip our job */
     if (serialState->nextJobID == jobID) {
         /* It is now our turn, do any processing necessary */
-        if (serialState->params.ldmParams.enableLdm) {
+        if (serialState->params.ldmParams.enableLdm == ZSTD_ps_enable) {
             size_t error;
             assert(seqStore.seq != NULL && seqStore.pos == 0 &&
                    seqStore.size == 0 && seqStore.capacity > 0);
             assert(src.size <= serialState->params.jobSize);
-            ZSTD_window_update(&serialState->ldmState.window, src.start, src.size);
+            ZSTD_window_update(&serialState->ldmState.window, src.start, src.size, /* forceNonContiguous */ 0);
             error = ZSTD_ldm_generateSequences(
                 &serialState->ldmState, &seqStore,
                 &serialState->params.ldmParams, src.start, src.size);
             /* We provide a large enough buffer to never fail. */
             assert(!ZSTD_isError(error)); (void)error;
@@ -594,11 +601,11 @@
     ZSTD_pthread_mutex_unlock(&serialState->mutex);
 
     if (seqStore.size > 0) {
         size_t const err = ZSTD_referenceExternalSequences(
             jobCCtx, seqStore.seq, seqStore.size);
-        assert(serialState->params.ldmParams.enableLdm);
+        assert(serialState->params.ldmParams.enableLdm == ZSTD_ps_enable);
         assert(!ZSTD_isError(err));
         (void)err;
     }
 }
@@ -672,19 +679,21 @@
     if (dstBuff.start == NULL) {   /* streaming job : doesn't provide a dstBuffer */
         dstBuff = ZSTDMT_getBuffer(job->bufPool);
         if (dstBuff.start==NULL) JOB_ERROR(ERROR(memory_allocation));
         job->dstBuff = dstBuff;   /* this value can be read in ZSTDMT_flush, when it copies the whole job */
     }
-    if (jobParams.ldmParams.enableLdm && rawSeqStore.seq == NULL)
+    if (jobParams.ldmParams.enableLdm == ZSTD_ps_enable && rawSeqStore.seq == NULL)
         JOB_ERROR(ERROR(memory_allocation));
 
     /* Don't compute the checksum for chunks, since we compute it externally,
      * but write it in the header.
      */
     if (job->jobID != 0) jobParams.fParams.checksumFlag = 0;
     /* Don't run LDM for the chunks, since we handle it externally */
-    jobParams.ldmParams.enableLdm = 0;
+    jobParams.ldmParams.enableLdm = ZSTD_ps_disable;
+    /* Correct nbWorkers to 0. */
+    jobParams.nbWorkers = 0;
 
 
     /* init */
     if (job->cdict) {
         size_t const initError = ZSTD_compressBegin_advanced_internal(cctx, NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast, job->cdict, &jobParams, job->fullFrameSize);
@@ -693,10 +702,14 @@
     } else {  /* srcStart points at reloaded section */
         U64 const pledgedSrcSize = job->firstJob ? job->fullFrameSize : job->src.size;
         {   size_t const forceWindowError = ZSTD_CCtxParams_setParameter(&jobParams, ZSTD_c_forceMaxWindow, !job->firstJob);
             if (ZSTD_isError(forceWindowError)) JOB_ERROR(forceWindowError);
         }
+        if (!job->firstJob) {
+            size_t const err = ZSTD_CCtxParams_setParameter(&jobParams, ZSTD_c_deterministicRefPrefix, 0);
+            if (ZSTD_isError(err)) JOB_ERROR(err);
+        }
         {   size_t const initError = ZSTD_compressBegin_advanced_internal(cctx,
                         job->prefix.start, job->prefix.size, ZSTD_dct_rawContent, /* load dictionary in "content-only" mode (no header analysis) */
                         ZSTD_dtlm_fast,
                         NULL, /*cdict*/
                         &jobParams, pledgedSrcSize);
@@ -705,11 +718,11 @@
     /* Perform serial step as early as possible, but after CCtx initialization */
     ZSTDMT_serialState_update(job->serial, cctx, rawSeqStore, job->src, job->jobID);
 
     if (!job->firstJob) {  /* flush and overwrite frame header when it's not first job */
-        size_t const hSize = ZSTD_compressContinue(cctx, dstBuff.start, dstBuff.capacity, job->src.start, 0);
+        size_t const hSize = ZSTD_compressContinue_public(cctx, dstBuff.start, dstBuff.capacity, job->src.start, 0);
         if (ZSTD_isError(hSize)) JOB_ERROR(hSize);
         DEBUGLOG(5, "ZSTDMT_compressionJob: flush and overwrite %u bytes of frame header (not first job)", (U32)hSize);
         ZSTD_invalidateRepCodes(cctx);
     }
@@ -723,11 +736,11 @@
         int chunkNb;
         if (sizeof(size_t) > sizeof(int)) assert(job->src.size < ((size_t)INT_MAX) * chunkSize);   /* check overflow */
         DEBUGLOG(5, "ZSTDMT_compressionJob: compress %u bytes in %i blocks", (U32)job->src.size, nbChunks);
         assert(job->cSize == 0);
         for (chunkNb = 1; chunkNb < nbChunks; chunkNb++) {
-            size_t const cSize = ZSTD_compressContinue(cctx, op, oend-op, ip, chunkSize);
+            size_t const cSize = ZSTD_compressContinue_public(cctx, op, oend-op, ip, chunkSize);
             if (ZSTD_isError(cSize)) JOB_ERROR(cSize);
             ip += chunkSize;
             op += cSize; assert(op < oend);
             /* stats */
             ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex);
@@ -743,15 +756,22 @@
         assert((chunkSize & (chunkSize - 1)) == 0);  /* chunkSize must be power of 2 for mask==(chunkSize-1) to work */
         if ((nbChunks > 0) | job->lastJob /*must output a "last block" flag*/ ) {
             size_t const lastBlockSize1 = job->src.size & (chunkSize-1);
             size_t const lastBlockSize = ((lastBlockSize1==0) & (job->src.size>=chunkSize)) ? chunkSize : lastBlockSize1;
             size_t const cSize = (job->lastJob) ?
-                 ZSTD_compressEnd     (cctx, op, oend-op, ip, lastBlockSize) :
-                 ZSTD_compressContinue(cctx, op, oend-op, ip, lastBlockSize);
+                 ZSTD_compressEnd_public(cctx, op, oend-op, ip, lastBlockSize) :
+                 ZSTD_compressContinue_public(cctx, op, oend-op, ip, lastBlockSize);
             if (ZSTD_isError(cSize)) JOB_ERROR(cSize);
             lastCBlockSize = cSize;
     }   }
+    if (!job->firstJob) {
+        /* Double check that we don't have an ext-dict, because then our
+         * repcode invalidation doesn't work.
+         */
+        assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window));
+    }
+    ZSTD_CCtx_trace(cctx, 0);
 
 _endJob:
     ZSTDMT_serialState_ensureFinished(job->serial, job->jobID, job->cSize);
     if (job->prefix.size > 0)
         DEBUGLOG(5, "Finished with prefix: %zx", (size_t)job->prefix.start);
@@ -794,10 +814,19 @@
 } roundBuff_t;
 
 static const roundBuff_t kNullRoundBuff = {NULL, 0, 0};
 
 #define RSYNC_LENGTH 32
+/* Don't create chunks smaller than the zstd block size.
+ * This stops us from regressing compression ratio too much,
+ * and ensures our output fits in ZSTD_compressBound().
+ *
+ * If this is shrunk < ZSTD_BLOCKSIZELOG_MIN then
+ * ZSTD_COMPRESSBOUND() will need to be updated.
+ */
+#define RSYNC_MIN_BLOCK_LOG ZSTD_BLOCKSIZELOG_MAX
+#define RSYNC_MIN_BLOCK_SIZE (1<<RSYNC_MIN_BLOCK_LOG)
 
 typedef struct {
   U64 hash;
   U64 hitMask;
   U64 primePower;
@@ -914,11 +943,11 @@
         mtctx->providedFactory = 0;
     }
 
     mtctx->jobs = ZSTDMT_createJobsTable(&nbJobs, cMem);
     assert(nbJobs > 0); assert((nbJobs & (nbJobs - 1)) == 0);  /* ensure nbJobs is a power of 2 */
     mtctx->jobIDMask = nbJobs - 1;
-    mtctx->bufPool = ZSTDMT_createBufferPool(nbWorkers, cMem);
+    mtctx->bufPool = ZSTDMT_createBufferPool(BUF_POOL_MAX_NB_BUFFERS(nbWorkers), cMem);
     mtctx->cctxPool = ZSTDMT_createCCtxPool(nbWorkers, cMem);
     mtctx->seqPool = ZSTDMT_createSeqPool(nbWorkers, cMem);
     initError = ZSTDMT_serialState_init(&mtctx->serial);
     mtctx->roundBuff = kNullRoundBuff;
     if (!mtctx->factory | !mtctx->jobs | !mtctx->bufPool | !mtctx->cctxPool | !mtctx->seqPool | initError) {
@@ -1017,11 +1046,11 @@
  * @return : error code if fails, 0 on success */
 static size_t ZSTDMT_resize(ZSTDMT_CCtx* mtctx, unsigned nbWorkers)
 {
     if (POOL_resize(mtctx->factory, nbWorkers)) return ERROR(memory_allocation);
     FORWARD_IF_ERROR( ZSTDMT_expandJobsTable(mtctx, nbWorkers) , "");
-    mtctx->bufPool = ZSTDMT_expandBufferPool(mtctx->bufPool, nbWorkers);
+    mtctx->bufPool = ZSTDMT_expandBufferPool(mtctx->bufPool, BUF_POOL_MAX_NB_BUFFERS(nbWorkers));
     if (mtctx->bufPool == NULL) return ERROR(memory_allocation);
     mtctx->cctxPool = ZSTDMT_expandCCtxPool(mtctx->cctxPool, nbWorkers);
     if (mtctx->cctxPool == NULL) return ERROR(memory_allocation);
     mtctx->seqPool = ZSTDMT_expandSeqPool(mtctx->seqPool, nbWorkers);
     if (mtctx->seqPool == NULL) return ERROR(memory_allocation);
@@ -1122,11 +1151,11 @@
 /* ------------------------------------------ */
 
 static unsigned ZSTDMT_computeTargetJobLog(const ZSTD_CCtx_params* params)
 {
     unsigned jobLog;
-    if (params->ldmParams.enableLdm) {
+    if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
         /* In Long Range Mode, the windowLog is typically oversized.
          * In which case, it's preferable to determine the jobSize
          * based on cycleLog instead. */
         jobLog = MAX(21, ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy) + 3);
     } else {
@@ -1166,11 +1195,11 @@
 static size_t ZSTDMT_computeOverlapSize(const ZSTD_CCtx_params* params)
 {
     int const overlapRLog = 9 - ZSTDMT_overlapLog(params->overlapLog, params->cParams.strategy);
     int ovLog = (overlapRLog >= 8) ? 0 : (params->cParams.windowLog - overlapRLog);
     assert(0 <= overlapRLog && overlapRLog <= 8);
-    if (params->ldmParams.enableLdm) {
+    if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
         /* In Long Range Mode, the windowLog is typically oversized.
          * In which case, it's preferable to determine the jobSize
          * based on chainLog instead.
         * Then, ovLog becomes a fraction of the jobSize, rather than windowSize */
         ovLog = MIN(params->cParams.windowLog, ZSTDMT_computeTargetJobLog(params) - 2)
@@ -1237,13 +1266,15 @@
     }
     assert(mtctx->targetSectionSize <= (size_t)ZSTDMT_JOBSIZE_MAX);
 
     if (params.rsyncable) {
         /* Aim for the targetsectionSize as the average job size. */
-        U32 const jobSizeMB = (U32)(mtctx->targetSectionSize >> 20);
-        U32 const rsyncBits = ZSTD_highbit32(jobSizeMB) + 20;
-        assert(jobSizeMB >= 1);
+        U32 const jobSizeKB = (U32)(mtctx->targetSectionSize >> 10);
+        U32 const rsyncBits = (assert(jobSizeKB >= 1), ZSTD_highbit32(jobSizeKB) + 10);
+        /* We refuse to create jobs < RSYNC_MIN_BLOCK_SIZE bytes, so make sure our
+         * expected job size is at least 4x larger. */
+        assert(rsyncBits >= RSYNC_MIN_BLOCK_LOG + 2);
         DEBUGLOG(4, "rsyncLog = %u", rsyncBits);
         mtctx->rsync.hash = 0;
         mtctx->rsync.hitMask = (1ULL << rsyncBits) - 1;
         mtctx->rsync.primePower = ZSTD_rollingHash_primePower(RSYNC_LENGTH);
     }
@@ -1251,11 +1282,11 @@
     DEBUGLOG(4, "Job Size : %u KB (note : set to %u)", (U32)(mtctx->targetSectionSize>>10), (U32)params.jobSize);
     DEBUGLOG(4, "inBuff Size : %u KB", (U32)(mtctx->targetSectionSize>>10));
     ZSTDMT_setBufferSize(mtctx->bufPool, ZSTD_compressBound(mtctx->targetSectionSize));
     {
         /* If ldm is enabled we need windowSize space. */
-        size_t const windowSize = mtctx->params.ldmParams.enableLdm ? (1U << mtctx->params.cParams.windowLog) : 0;
+        size_t const windowSize = mtctx->params.ldmParams.enableLdm == ZSTD_ps_enable ? (1U << mtctx->params.cParams.windowLog) : 0;
         /* Two buffers of slack, plus extra space for the overlap
          * This is the minimum slack that LDM works with. One extra because
          * flush might waste up to targetSectionSize-1 bytes. Another extra
         * for the overlap (if > 0), then one to fill which doesn't overlap
         * with the LDM window.
@@ -1526,21 +1557,25 @@
  * Returns non-zero iff buffer and range overlap.
  */
 static int ZSTDMT_isOverlapped(buffer_t buffer, range_t range)
 {
     BYTE const* const bufferStart = (BYTE const*)buffer.start;
-    BYTE const* const bufferEnd = bufferStart + buffer.capacity;
     BYTE const* const rangeStart = (BYTE const*)range.start;
-    BYTE const* const rangeEnd = range.size != 0 ? rangeStart + range.size : rangeStart;
 
     if (rangeStart == NULL || bufferStart == NULL)
         return 0;
-    /* Empty ranges cannot overlap */
-    if (bufferStart == bufferEnd || rangeStart == rangeEnd)
-        return 0;
 
-    return bufferStart < rangeEnd && rangeStart < bufferEnd;
+    {
+        BYTE const* const bufferEnd = bufferStart + buffer.capacity;
+        BYTE const* const rangeEnd = rangeStart + range.size;
+
+        /* Empty ranges cannot overlap */
+        if (bufferStart == bufferEnd || rangeStart == rangeEnd)
+            return 0;
+
+        return bufferStart < rangeEnd && rangeStart < bufferEnd;
+    }
 }
 
 static int ZSTDMT_doesOverlapWindow(buffer_t buffer, ZSTD_window_t window)
 {
     range_t extDict;
@@ -1563,11 +1598,11 @@
         || ZSTDMT_isOverlapped(buffer, prefix);
 }
 
 static void ZSTDMT_waitForLdmComplete(ZSTDMT_CCtx* mtctx, buffer_t buffer)
 {
-    if (mtctx->params.ldmParams.enableLdm) {
+    if (mtctx->params.ldmParams.enableLdm == ZSTD_ps_enable) {
         ZSTD_pthread_mutex_t* mutex = &mtctx->serial.ldmWindowMutex;
         DEBUGLOG(5, "ZSTDMT_waitForLdmComplete");
         DEBUGLOG(5, "source [0x%zx, 0x%zx)",
                     (size_t)buffer.start,
                     (size_t)buffer.start + buffer.capacity);
@@ -1666,24 +1701,47 @@
     syncPoint.toLoad = MIN(input.size - input.pos, mtctx->targetSectionSize - mtctx->inBuff.filled);
     syncPoint.flush = 0;
     if (!mtctx->params.rsyncable)
         /* Rsync is disabled. */
         return syncPoint;
+    if (mtctx->inBuff.filled + input.size - input.pos < RSYNC_MIN_BLOCK_SIZE)
+        /* We don't emit synchronization points if it would produce too small blocks.
+         * We don't have enough input to find a synchronization point, so don't look.
+         */
+        return syncPoint;
     if (mtctx->inBuff.filled + syncPoint.toLoad < RSYNC_LENGTH)
         /* Not enough to compute the hash.
          * We will miss any synchronization points in this RSYNC_LENGTH byte
         * window. However, since it depends only in the internal buffers, if the
         * state is already synchronized, we will remain synchronized.
         * Additionally, the probability that we miss a synchronization point is
        * low: RSYNC_LENGTH / targetSectionSize.
         */
         return syncPoint;
     /* Initialize the loop variables. */
-    if (mtctx->inBuff.filled >= RSYNC_LENGTH) {
-        /* We have enough bytes buffered to initialize the hash.
+    if (mtctx->inBuff.filled < RSYNC_MIN_BLOCK_SIZE) {
+        /* We don't need to scan the first RSYNC_MIN_BLOCK_SIZE positions
+         * because they can't possibly be a sync point. So we can start
+         * part way through the input buffer.
+         */
+        pos = RSYNC_MIN_BLOCK_SIZE - mtctx->inBuff.filled;
+        if (pos >= RSYNC_LENGTH) {
+            prev = istart + pos - RSYNC_LENGTH;
+            hash = ZSTD_rollingHash_compute(prev, RSYNC_LENGTH);
+        } else {
+            assert(mtctx->inBuff.filled >= RSYNC_LENGTH);
+            prev = (BYTE const*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled - RSYNC_LENGTH;
+            hash = ZSTD_rollingHash_compute(prev + pos, (RSYNC_LENGTH - pos));
+            hash = ZSTD_rollingHash_append(hash, istart, pos);
+        }
+    } else {
+        /* We have enough bytes buffered to initialize the hash,
+         * and have processed enough bytes to find a sync point.
          * Start scanning at the beginning of the input.
          */
+        assert(mtctx->inBuff.filled >= RSYNC_MIN_BLOCK_SIZE);
+        assert(RSYNC_MIN_BLOCK_SIZE >= RSYNC_LENGTH);
         pos = 0;
         prev = (BYTE const*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled - RSYNC_LENGTH;
         hash = ZSTD_rollingHash_compute(prev, RSYNC_LENGTH);
         if ((hash & hitMask) == hitMask) {
             /* We're already at a sync point so don't load any more until
@@ -1693,38 +1751,36 @@
              */
             syncPoint.toLoad = 0;
             syncPoint.flush = 1;
             return syncPoint;
         }
-    } else {
-        /* We don't have enough bytes buffered to initialize the hash, but
-         * we know we have at least RSYNC_LENGTH bytes total.
-         * Start scanning after the first RSYNC_LENGTH bytes less the bytes
-         * already buffered.
-         */
-        pos = RSYNC_LENGTH - mtctx->inBuff.filled;
-        prev = (BYTE const*)mtctx->inBuff.buffer.start - pos;
-        hash = ZSTD_rollingHash_compute(mtctx->inBuff.buffer.start, mtctx->inBuff.filled);
-        hash = ZSTD_rollingHash_append(hash, istart, pos);
     }
     /* Starting with the hash of the previous RSYNC_LENGTH bytes, roll
      * through the input. If we hit a synchronization point, then cut the
      * job off, and tell the compressor to flush the job. Otherwise, load
     * all the bytes and continue as normal.
     * If we go too long without a synchronization point (targetSectionSize)
     * then a block will be emitted anyways, but this is okay, since if we
     * are already synchronized we will remain synchronized.
     */
+    assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash);
    for (; pos < syncPoint.toLoad; ++pos) {
        BYTE const toRemove = pos < RSYNC_LENGTH ? prev[pos] : istart[pos - RSYNC_LENGTH];
-        /* if (pos >= RSYNC_LENGTH) assert(ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash); */
+        /* This assert is very expensive, and Debian compiles with asserts enabled.
+         * So disable it for now. We can get similar coverage by checking it at the
+         * beginning & end of the loop.
+         * assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash);
+         */
        hash = ZSTD_rollingHash_rotate(hash, toRemove, istart[pos], primePower);
+        assert(mtctx->inBuff.filled + pos >= RSYNC_MIN_BLOCK_SIZE);
        if ((hash & hitMask) == hitMask) {
            syncPoint.toLoad = pos + 1;
            syncPoint.flush = 1;
+            ++pos;  /* for assert */
            break;
        }
    }
+    assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash);
    return syncPoint;
}

size_t ZSTDMT_nextInputSizeHint(const ZSTDMT_CCtx* mtctx)
{
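
The largest behavioral change above is in the rsyncable path of ZSTDMT_findSynchronizationPoint(): a synchronization point is now only honored once a chunk has reached RSYNC_MIN_BLOCK_SIZE, which is 1 << ZSTD_BLOCKSIZELOG_MAX (128 KB), so rsyncable output never produces chunks smaller than a zstd block and still fits within ZSTD_compressBound(). The C sketch below is a minimal standalone illustration of that idea, not zstd's actual implementation: next_sync_point, prime_power, WINDOW_LEN, MIN_BLOCK_LOG, and PRIME are hypothetical names invented here, and the hash is a plain polynomial rolling hash rather than zstd's ZSTD_rollingHash_* helpers.

#include <stddef.h>
#include <stdint.h>

#define WINDOW_LEN     32                     /* cf. RSYNC_LENGTH */
#define MIN_BLOCK_LOG  17                     /* cf. RSYNC_MIN_BLOCK_LOG == ZSTD_BLOCKSIZELOG_MAX */
#define MIN_BLOCK_SIZE ((size_t)1 << MIN_BLOCK_LOG)
#define PRIME          0x9E3779B185EBCA87ULL  /* illustrative odd 64-bit multiplier */

/* PRIME^n mod 2^64; with n == WINDOW_LEN-1 this is the factor needed
 * to remove the oldest byte from the rolling hash. */
static uint64_t prime_power(unsigned n)
{
    uint64_t p = 1;
    while (n--) p *= PRIME;
    return p;
}

/* Returns the length of the chunk ending at the first admissible sync point,
 * or srcSize when none is found. A cut is admissible only once at least
 * MIN_BLOCK_SIZE bytes precede it, mirroring the RSYNC_MIN_BLOCK_SIZE checks
 * added in the diff above. */
size_t next_sync_point(const uint8_t* src, size_t srcSize, uint64_t hitMask)
{
    uint64_t const pp = prime_power(WINDOW_LEN - 1);
    uint64_t hash = 0;
    size_t pos;

    if (srcSize < MIN_BLOCK_SIZE)
        return srcSize;   /* too little input: cutting here would make a tiny chunk */

    /* Seed the hash with the WINDOW_LEN bytes ending at the first legal cut point. */
    for (pos = MIN_BLOCK_SIZE - WINDOW_LEN; pos < MIN_BLOCK_SIZE; ++pos)
        hash = hash * PRIME + src[pos];
    if ((hash & hitMask) == hitMask)
        return MIN_BLOCK_SIZE;

    /* Roll forward one byte at a time: drop the oldest byte, add the newest. */
    for (pos = MIN_BLOCK_SIZE; pos < srcSize; ++pos) {
        hash = (hash - src[pos - WINDOW_LEN] * pp) * PRIME + src[pos];
        if ((hash & hitMask) == hitMask)
            return pos + 1;   /* cut just after the byte that completed the match */
    }
    return srcSize;
}

A caller would pass hitMask = (1ULL << rsyncBits) - 1; in the patched hunk at line 1237 above, rsyncBits is derived from the target job size as ZSTD_highbit32(jobSizeKB) + 10, so on random data the expected distance between sync points roughly equals the target job size, and the new assert(rsyncBits >= RSYNC_MIN_BLOCK_LOG + 2) guarantees the expected chunk is at least 4x the 128 KB minimum.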