contrib/zstd/lib/common/fse_decompress.c in extzstd-0.3.1 vs contrib/zstd/lib/common/fse_decompress.c in extzstd-0.3.2

- old
+ new

@@ -14,17 +14,18 @@ /* ************************************************************** * Includes ****************************************************************/ -#include <stdlib.h> /* malloc, free, qsort */ -#include <string.h> /* memcpy, memset */ +#include "debug.h" /* assert */ #include "bitstream.h" #include "compiler.h" #define FSE_STATIC_LINKING_ONLY #include "fse.h" #include "error_private.h" +#define ZSTD_DEPS_NEED_MALLOC +#include "zstd_deps.h" /* ************************************************************** * Error Management ****************************************************************/ @@ -57,29 +58,31 @@ /* Function templates */ FSE_DTable* FSE_createDTable (unsigned tableLog) { if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; - return (FSE_DTable*)malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); + return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); } void FSE_freeDTable (FSE_DTable* dt) { - free(dt); + ZSTD_free(dt); } -size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) +static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) { void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr); - U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1]; + U16* symbolNext = (U16*)workSpace; + BYTE* spread = (BYTE*)(symbolNext + maxSymbolValue + 1); U32 const maxSV1 = maxSymbolValue + 1; U32 const tableSize = 1 << tableLog; U32 highThreshold = tableSize-1; /* Sanity Checks */ + if (FSE_BUILD_DTABLE_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(maxSymbolValue_tooLarge); if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge); if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); /* Init, lay down lowprob symbols */ { FSE_DTableHeader DTableH; @@ -93,15 +96,61 @@ symbolNext[s] = 1; } else { if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0; symbolNext[s] = normalizedCounter[s]; } } } - memcpy(dt, &DTableH, sizeof(DTableH)); + ZSTD_memcpy(dt, &DTableH, sizeof(DTableH)); } /* Spread symbols */ - { U32 const tableMask = tableSize-1; + if (highThreshold == tableSize - 1) { + size_t const tableMask = tableSize-1; + size_t const step = FSE_TABLESTEP(tableSize); + /* First lay down the symbols in order. + * We use a uint64_t to lay down 8 bytes at a time. This reduces branch + * misses since small blocks generally have small table logs, so nearly + * all symbols have counts <= 8. We ensure we have 8 bytes at the end of + * our buffer to handle the over-write. + */ + { + U64 const add = 0x0101010101010101ull; + size_t pos = 0; + U64 sv = 0; + U32 s; + for (s=0; s<maxSV1; ++s, sv += add) { + int i; + int const n = normalizedCounter[s]; + MEM_write64(spread + pos, sv); + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } + pos += n; + } + } + /* Now we spread those positions across the table. + * The benefit of doing it in two stages is that we avoid the the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. + * We unroll the loop twice, since that is what emperically worked best. + */ + { + size_t position = 0; + size_t s; + size_t const unroll = 2; + assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */ + for (s = 0; s < (size_t)tableSize; s += unroll) { + size_t u; + for (u = 0; u < unroll; ++u) { + size_t const uPosition = (position + (u * step)) & tableMask; + tableDecode[uPosition].symbol = spread[s + u]; + } + position = (position + (unroll * step)) & tableMask; + } + assert(position == 0); + } + } else { + U32 const tableMask = tableSize-1; U32 const step = FSE_TABLESTEP(tableSize); U32 s, position = 0; for (s=0; s<maxSV1; s++) { int i; for (i=0; i<normalizedCounter[s]; i++) { @@ -122,11 +171,16 @@ } } return 0; } +size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) +{ + return FSE_buildDTable_internal(dt, normalizedCounter, maxSymbolValue, tableLog, workSpace, wkspSize); +} + #ifndef FSE_COMMONDEFS_ONLY /*-******************************************************* * Decompression (Byte symbols) *********************************************************/ @@ -249,38 +303,91 @@ if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); } -size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog) +size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) { + return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0); +} + +FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + void* dst, size_t dstCapacity, + const void* cSrc, size_t cSrcSize, + unsigned maxLog, void* workSpace, size_t wkspSize, + int bmi2) +{ const BYTE* const istart = (const BYTE*)cSrc; const BYTE* ip = istart; short counting[FSE_MAX_SYMBOL_VALUE+1]; unsigned tableLog; unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE; + FSE_DTable* const dtable = (FSE_DTable*)workSpace; /* normal FSE decoding mode */ - size_t const NCountLength = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize); + size_t const NCountLength = FSE_readNCount_bmi2(counting, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); if (FSE_isError(NCountLength)) return NCountLength; - /* if (NCountLength >= cSrcSize) return ERROR(srcSize_wrong); */ /* too small input size; supposed to be already checked in NCountLength, only remaining case : NCountLength==cSrcSize */ if (tableLog > maxLog) return ERROR(tableLog_tooLarge); + assert(NCountLength <= cSrcSize); ip += NCountLength; cSrcSize -= NCountLength; - CHECK_F( FSE_buildDTable (workSpace, counting, maxSymbolValue, tableLog) ); + if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge); + workSpace = dtable + FSE_DTABLE_SIZE_U32(tableLog); + wkspSize -= FSE_DTABLE_SIZE(tableLog); - return FSE_decompress_usingDTable (dst, dstCapacity, ip, cSrcSize, workSpace); /* always return, even if it is an error code */ + CHECK_F( FSE_buildDTable_internal(dtable, counting, maxSymbolValue, tableLog, workSpace, wkspSize) ); + + { + const void* ptr = dtable; + const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr; + const U32 fastMode = DTableH->fastMode; + + /* select fast mode (static) */ + if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 1); + return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 0); + } } +/* Avoids the FORCE_INLINE of the _body() function. */ +static size_t FSE_decompress_wksp_body_default(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) +{ + return FSE_decompress_wksp_body(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, 0); +} +#if DYNAMIC_BMI2 +TARGET_ATTRIBUTE("bmi2") static size_t FSE_decompress_wksp_body_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) +{ + return FSE_decompress_wksp_body(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, 1); +} +#endif + +size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { + return FSE_decompress_wksp_body_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); + } +#endif + (void)bmi2; + return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); +} + + typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; +#ifndef ZSTD_NO_UNUSED_FUNCTIONS +size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) { + U32 wksp[FSE_BUILD_DTABLE_WKSP_SIZE_U32(FSE_TABLELOG_ABSOLUTE_MAX, FSE_MAX_SYMBOL_VALUE)]; + return FSE_buildDTable_wksp(dt, normalizedCounter, maxSymbolValue, tableLog, wksp, sizeof(wksp)); +} + size_t FSE_decompress(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize) { - DTable_max_t dt; /* Static analyzer seems unable to understand this table will be properly initialized later */ - return FSE_decompress_wksp(dst, dstCapacity, cSrc, cSrcSize, dt, FSE_MAX_TABLELOG); + /* Static analyzer seems unable to understand this table will be properly initialized later */ + U32 wksp[FSE_DECOMPRESS_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)]; + return FSE_decompress_wksp(dst, dstCapacity, cSrc, cSrcSize, FSE_MAX_TABLELOG, wksp, sizeof(wksp)); } - +#endif #endif /* FSE_COMMONDEFS_ONLY */