ext/zstdruby/libzstd/decompress/zstd_decompress_block.c in zstd-ruby-1.4.5.0 vs ext/zstdruby/libzstd/decompress/zstd_decompress_block.c in zstd-ruby-1.4.9.0

- old
+ new

@@ -1,7 +1,7 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) 2016-2021, Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). @@ -12,11 +12,11 @@ * this module takes care of decompressing _compressed_ block */ /*-******************************************************* * Dependencies *********************************************************/ -#include <string.h> /* memcpy, memmove, memset */ +#include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ #include "../common/compiler.h" /* prefetch */ #include "../common/cpu.h" /* bmi2 */ #include "../common/mem.h" /* low level memory routines */ #define FSE_STATIC_LINKING_ONLY #include "../common/fse.h" @@ -42,11 +42,11 @@ /*_******************************************************* * Memory operations **********************************************************/ -static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); } +static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); } /*-************************************************************* * Block decoding ***************************************************************/ @@ -164,11 +164,11 @@ dctx->litPtr = dctx->litBuffer; dctx->litSize = litSize; dctx->litEntropy = 1; if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable; - memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH); + ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH); return litCSize + lhSize; } case set_basic: { size_t litSize, lhSize; @@ -189,14 +189,14 @@ break; } if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, ""); - memcpy(dctx->litBuffer, istart+lhSize, litSize); + ZSTD_memcpy(dctx->litBuffer, istart+lhSize, litSize); dctx->litPtr = dctx->litBuffer; dctx->litSize = litSize; - memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH); + ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH); return lhSize+litSize; } /* direct reference into compressed stream */ dctx->litPtr = istart+lhSize; dctx->litSize = litSize; @@ -221,11 +221,11 @@ litSize = MEM_readLE24(istart) >> 4; RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); break; } RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); - memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH); + ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH); dctx->litPtr = dctx->litBuffer; dctx->litSize = litSize; return lhSize+1; } default: @@ -234,11 +234,11 @@ } } /* Default FSE distribution tables. * These are pre-calculated FSE decoding tables using default distributions as defined in specification : - * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#default-distributions + * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions * They were generated programmatically with following method : * - start from default distributions, present in /lib/common/zstd_internal.h * - generate tables normally, using ZSTD_buildFSETable() * - printout the content of tables * - pretify output, report below, test with fuzzer to ensure it's correct */ @@ -362,27 +362,30 @@ /* ZSTD_buildFSETable() : * generate FSE decoding table for one symbol (ll, ml or off) * cannot fail if input is valid => * all inputs are presumed validated at this stage */ -void -ZSTD_buildFSETable(ZSTD_seqSymbol* dt, +FORCE_INLINE_TEMPLATE +void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, const U32* baseValue, const U32* nbAdditionalBits, - unsigned tableLog) + unsigned tableLog, void* wksp, size_t wkspSize) { ZSTD_seqSymbol* const tableDecode = dt+1; - U16 symbolNext[MaxSeq+1]; - U32 const maxSV1 = maxSymbolValue + 1; U32 const tableSize = 1 << tableLog; - U32 highThreshold = tableSize-1; + U16* symbolNext = (U16*)wksp; + BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1); + U32 highThreshold = tableSize - 1; + + /* Sanity Checks */ assert(maxSymbolValue <= MaxSeq); assert(tableLog <= MaxFSELog); - + assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE); + (void)wkspSize; /* Init, lay down lowprob symbols */ { ZSTD_seqSymbol_header DTableH; DTableH.tableLog = tableLog; DTableH.fastMode = 1; { S16 const largeLimit= (S16)(1 << (tableLog-1)); @@ -394,50 +397,144 @@ } else { if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0; assert(normalizedCounter[s]>=0); symbolNext[s] = (U16)normalizedCounter[s]; } } } - memcpy(dt, &DTableH, sizeof(DTableH)); + ZSTD_memcpy(dt, &DTableH, sizeof(DTableH)); } /* Spread symbols */ - { U32 const tableMask = tableSize-1; + assert(tableSize <= 512); + /* Specialized symbol spreading for the case when there are + * no low probability (-1 count) symbols. When compressing + * small blocks we avoid low probability symbols to hit this + * case, since header decoding speed matters more. + */ + if (highThreshold == tableSize - 1) { + size_t const tableMask = tableSize-1; + size_t const step = FSE_TABLESTEP(tableSize); + /* First lay down the symbols in order. + * We use a uint64_t to lay down 8 bytes at a time. This reduces branch + * misses since small blocks generally have small table logs, so nearly + * all symbols have counts <= 8. We ensure we have 8 bytes at the end of + * our buffer to handle the over-write. + */ + { + U64 const add = 0x0101010101010101ull; + size_t pos = 0; + U64 sv = 0; + U32 s; + for (s=0; s<maxSV1; ++s, sv += add) { + int i; + int const n = normalizedCounter[s]; + MEM_write64(spread + pos, sv); + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } + pos += n; + } + } + /* Now we spread those positions across the table. + * The benefit of doing it in two stages is that we avoid the the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. + * We unroll the loop twice, since that is what emperically worked best. + */ + { + size_t position = 0; + size_t s; + size_t const unroll = 2; + assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */ + for (s = 0; s < (size_t)tableSize; s += unroll) { + size_t u; + for (u = 0; u < unroll; ++u) { + size_t const uPosition = (position + (u * step)) & tableMask; + tableDecode[uPosition].baseValue = spread[s + u]; + } + position = (position + (unroll * step)) & tableMask; + } + assert(position == 0); + } + } else { + U32 const tableMask = tableSize-1; U32 const step = FSE_TABLESTEP(tableSize); U32 s, position = 0; for (s=0; s<maxSV1; s++) { int i; - for (i=0; i<normalizedCounter[s]; i++) { + int const n = normalizedCounter[s]; + for (i=0; i<n; i++) { tableDecode[position].baseValue = s; position = (position + step) & tableMask; while (position > highThreshold) position = (position + step) & tableMask; /* lowprob area */ } } assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ } /* Build Decoding table */ - { U32 u; + { + U32 u; for (u=0; u<tableSize; u++) { U32 const symbol = tableDecode[u].baseValue; U32 const nextState = symbolNext[symbol]++; tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) ); tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize); assert(nbAdditionalBits[symbol] < 255); tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol]; tableDecode[u].baseValue = baseValue[symbol]; - } } + } + } } +/* Avoids the FORCE_INLINE of the _body() function. */ +static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt, + const short* normalizedCounter, unsigned maxSymbolValue, + const U32* baseValue, const U32* nbAdditionalBits, + unsigned tableLog, void* wksp, size_t wkspSize) +{ + ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, + baseValue, nbAdditionalBits, tableLog, wksp, wkspSize); +} +#if DYNAMIC_BMI2 +TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt, + const short* normalizedCounter, unsigned maxSymbolValue, + const U32* baseValue, const U32* nbAdditionalBits, + unsigned tableLog, void* wksp, size_t wkspSize) +{ + ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, + baseValue, nbAdditionalBits, tableLog, wksp, wkspSize); +} +#endif + +void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, + const short* normalizedCounter, unsigned maxSymbolValue, + const U32* baseValue, const U32* nbAdditionalBits, + unsigned tableLog, void* wksp, size_t wkspSize, int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { + ZSTD_buildFSETable_body_bmi2(dt, normalizedCounter, maxSymbolValue, + baseValue, nbAdditionalBits, tableLog, wksp, wkspSize); + return; + } +#endif + (void)bmi2; + ZSTD_buildFSETable_body_default(dt, normalizedCounter, maxSymbolValue, + baseValue, nbAdditionalBits, tableLog, wksp, wkspSize); +} + + /*! ZSTD_buildSeqTable() : * @return : nb bytes read from src, * or an error code if it fails */ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr, symbolEncodingType_e type, unsigned max, U32 maxLog, const void* src, size_t srcSize, const U32* baseValue, const U32* nbAdditionalBits, const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable, - int ddictIsCold, int nbSeq) + int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize, + int bmi2) { switch(type) { case set_rle : RETURN_ERROR_IF(!srcSize, srcSize_wrong, ""); @@ -465,11 +562,11 @@ { unsigned tableLog; S16 norm[MaxSeq+1]; size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize); RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, ""); RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, ""); - ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog); + ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2); *DTablePtr = DTableSpace; return headerSize; } default : assert(0); @@ -478,11 +575,11 @@ } size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, const void* src, size_t srcSize) { - const BYTE* const istart = (const BYTE* const)src; + const BYTE* const istart = (const BYTE*)src; const BYTE* const iend = istart + srcSize; const BYTE* ip = istart; int nbSeq; DEBUGLOG(5, "ZSTD_decodeSeqHeaders"); @@ -497,11 +594,12 @@ return 1; } if (nbSeq > 0x7F) { if (nbSeq == 0xFF) { RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); - nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2; + nbSeq = MEM_readLE16(ip) + LONGNBSEQ; + ip+=2; } else { RETURN_ERROR_IF(ip >= iend, srcSize_wrong, ""); nbSeq = ((nbSeq-0x80)<<8) + *ip++; } } @@ -518,31 +616,37 @@ { size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr, LLtype, MaxLL, LLFSELog, ip, iend-ip, LL_base, LL_bits, LL_defaultDTable, dctx->fseEntropy, - dctx->ddictIsCold, nbSeq); + dctx->ddictIsCold, nbSeq, + dctx->workspace, sizeof(dctx->workspace), + dctx->bmi2); RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed"); ip += llhSize; } { size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr, OFtype, MaxOff, OffFSELog, ip, iend-ip, OF_base, OF_bits, OF_defaultDTable, dctx->fseEntropy, - dctx->ddictIsCold, nbSeq); + dctx->ddictIsCold, nbSeq, + dctx->workspace, sizeof(dctx->workspace), + dctx->bmi2); RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed"); ip += ofhSize; } { size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr, MLtype, MaxML, MLFSELog, ip, iend-ip, ML_base, ML_bits, ML_defaultDTable, dctx->fseEntropy, - dctx->ddictIsCold, nbSeq); + dctx->ddictIsCold, nbSeq, + dctx->workspace, sizeof(dctx->workspace), + dctx->bmi2); RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed"); ip += mlhSize; } } @@ -684,16 +788,16 @@ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { /* offset beyond prefix */ RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); match = dictEnd - (prefixStart-match); if (match + sequence.matchLength <= dictEnd) { - memmove(oLitEnd, match, sequence.matchLength); + ZSTD_memmove(oLitEnd, match, sequence.matchLength); return sequenceLength; } /* span extDict & currentPrefixSegment */ { size_t const length1 = dictEnd - match; - memmove(oLitEnd, match, length1); + ZSTD_memmove(oLitEnd, match, length1); op = oLitEnd + length1; sequence.matchLength -= length1; match = prefixStart; } } ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst); @@ -750,16 +854,16 @@ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { /* offset beyond prefix -> go into extDict */ RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, ""); match = dictEnd + (match - prefixStart); if (match + sequence.matchLength <= dictEnd) { - memmove(oLitEnd, match, sequence.matchLength); + ZSTD_memmove(oLitEnd, match, sequence.matchLength); return sequenceLength; } /* span extDict & currentPrefixSegment */ { size_t const length1 = dictEnd - match; - memmove(oLitEnd, match, length1); + ZSTD_memmove(oLitEnd, match, length1); op = oLitEnd + length1; sequence.matchLength -= length1; match = prefixStart; } } /* Match within prefix of 1 or more bytes */ @@ -946,11 +1050,11 @@ return seq; } #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION -static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) +MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) { size_t const windowSize = dctx->fParams.windowSize; /* No dictionary used. */ if (dctx->dictContentEndForFuzzing == NULL) return 0; /* Dictionary is our prefix. */ @@ -967,10 +1071,11 @@ ZSTD_DCtx const* dctx, BYTE const* op, BYTE const* oend, seq_t const seq, BYTE const* prefixStart, BYTE const* virtualStart) { +#if DEBUGLEVEL >= 1 size_t const windowSize = dctx->fParams.windowSize; size_t const sequenceSize = seq.litLength + seq.matchLength; BYTE const* const oLitEnd = op + seq.litLength; DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); @@ -984,10 +1089,13 @@ assert(seq.offset <= windowSize + dictSize); } else { /* Offset must be within our window. */ assert(seq.offset <= windowSize); } +#else + (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart; +#endif } #endif #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG FORCE_INLINE_TEMPLATE size_t @@ -998,11 +1106,11 @@ const ZSTD_longOffset_e isLongOffset, const int frame) { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; - BYTE* const ostart = (BYTE* const)dst; + BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + maxDstSize; BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; const BYTE* const litEnd = litPtr + dctx->litSize; const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); @@ -1078,18 +1186,18 @@ assert(!ZSTD_isError(oneSeqSize)); if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); #endif DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); BIT_reloadDStream(&(seqState.DStream)); + op += oneSeqSize; /* gcc and clang both don't like early returns in this loop. - * gcc doesn't like early breaks either. - * Instead save an error and report it at the end. - * When there is an error, don't increment op, so we don't - * overwrite. + * Instead break and check for an error at the end of the loop. */ - if (UNLIKELY(ZSTD_isError(oneSeqSize))) error = oneSeqSize; - else op += oneSeqSize; + if (UNLIKELY(ZSTD_isError(oneSeqSize))) { + error = oneSeqSize; + break; + } if (UNLIKELY(!--nbSeq)) break; } /* check if reached exact end */ DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq); @@ -1102,11 +1210,11 @@ /* last literal segment */ { size_t const lastLLSize = litEnd - litPtr; RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); if (op != NULL) { - memcpy(op, litPtr, lastLLSize); + ZSTD_memcpy(op, litPtr, lastLLSize); op += lastLLSize; } } return op-ostart; @@ -1132,11 +1240,11 @@ const ZSTD_longOffset_e isLongOffset, const int frame) { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; - BYTE* const ostart = (BYTE* const)dst; + BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + maxDstSize; BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; const BYTE* const litEnd = litPtr + dctx->litSize; const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); @@ -1207,11 +1315,11 @@ /* last literal segment */ { size_t const lastLLSize = litEnd - litPtr; RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); if (op != NULL) { - memcpy(op, litPtr, lastLLSize); + ZSTD_memcpy(op, litPtr, lastLLSize); op += lastLLSize; } } return op-ostart; @@ -1407,13 +1515,13 @@ #endif } } -void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst) +void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) { - if (dst != dctx->previousDstEnd) { /* not contiguous */ + if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ dctx->dictEnd = dctx->previousDstEnd; dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart)); dctx->prefixStart = dst; dctx->previousDstEnd = dst; } @@ -1423,10 +1531,10 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) { size_t dSize; - ZSTD_checkContinuity(dctx, dst); + ZSTD_checkContinuity(dctx, dst, dstCapacity); dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0); dctx->previousDstEnd = (char*)dst + dSize; return dSize; }