contrib/zstd/lib/dictBuilder/cover.c in extzstd-0.3.2 vs contrib/zstd/lib/dictBuilder/cover.c in extzstd-0.3.3

- old
+ new

@@ -1,7 +1,7 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). @@ -24,31 +24,40 @@ #include <stdio.h> /* fprintf */ #include <stdlib.h> /* malloc, free, qsort */ #include <string.h> /* memset */ #include <time.h> /* clock */ +#ifndef ZDICT_STATIC_LINKING_ONLY +# define ZDICT_STATIC_LINKING_ONLY +#endif + #include "../common/mem.h" /* read */ #include "../common/pool.h" #include "../common/threading.h" -#include "cover.h" #include "../common/zstd_internal.h" /* includes zstd.h */ -#ifndef ZDICT_STATIC_LINKING_ONLY -#define ZDICT_STATIC_LINKING_ONLY -#endif -#include "zdict.h" +#include "../common/bits.h" /* ZSTD_highbit32 */ +#include "../zdict.h" +#include "cover.h" /*-************************************* * Constants ***************************************/ +/** +* There are 32bit indexes used to ref samples, so limit samples size to 4GB +* on 64bit builds. +* For 32bit builds we choose 1 GB. +* Most 32bit platforms have 2GB user-mode addressable space and we allocate a large +* contiguous buffer, so 1GB is already a high limit. +*/ #define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB)) #define COVER_DEFAULT_SPLITPOINT 1.0 /*-************************************* * Console display ***************************************/ #ifndef LOCALDISPLAYLEVEL -static int g_displayLevel = 2; +static int g_displayLevel = 0; #endif #undef DISPLAY #define DISPLAY(...) \ { \ fprintf(stderr, __VA_ARGS__); \ @@ -531,11 +540,11 @@ } } /** * Prepare a context for dictionary building. - * The context is only dependent on the parameter `d` and can used multiple + * The context is only dependent on the parameter `d` and can be used multiple * times. * Returns 0 on success or error code on error. * The context must be destroyed with `COVER_ctx_destroy()`. */ static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, @@ -636,11 +645,11 @@ return 0; } void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel) { - const double ratio = (double)nbDmers / maxDictSize; + const double ratio = (double)nbDmers / (double)maxDictSize; if (ratio >= 10) { return; } LOCALDISPLAYLEVEL(displayLevel, 1, "WARNING: The maximum dictionary size %u is too large " @@ -732,11 +741,11 @@ BYTE* const dict = (BYTE*)dictBuffer; COVER_ctx_t ctx; COVER_map_t activeDmers; parameters.splitPoint = 1.0; /* Initialize global data */ - g_displayLevel = parameters.zParams.notificationLevel; + g_displayLevel = (int)parameters.zParams.notificationLevel; /* Checks */ if (!COVER_checkParameters(parameters, dictBufferCapacity)) { DISPLAYLEVEL(1, "Cover parameters incorrect\n"); return ERROR(parameter_outOfBound); } @@ -940,13 +949,21 @@ } ZSTD_pthread_mutex_unlock(&best->mutex); } } +static COVER_dictSelection_t setDictSelection(BYTE* buf, size_t s, size_t csz) +{ + COVER_dictSelection_t ds; + ds.dictContent = buf; + ds.dictSize = s; + ds.totalCompressedSize = csz; + return ds; +} + COVER_dictSelection_t COVER_dictSelectionError(size_t error) { - COVER_dictSelection_t selection = { NULL, 0, error }; - return selection; + return setDictSelection(NULL, 0, error); } unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) { return (ZSTD_isError(selection.totalCompressedSize) || !selection.dictContent); } @@ -995,13 +1012,12 @@ free(candidateDictBuffer); return COVER_dictSelectionError(totalCompressedSize); } if (params.shrinkDict == 0) { - COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize }; free(candidateDictBuffer); - return selection; + return setDictSelection(largestDictbuffer, dictContentSize, totalCompressedSize); } largestDict = dictContentSize; largestCompressed = totalCompressedSize; dictContentSize = ZDICT_DICTSIZE_MIN; @@ -1029,24 +1045,20 @@ free(largestDictbuffer); free(candidateDictBuffer); return COVER_dictSelectionError(totalCompressedSize); } - if (totalCompressedSize <= largestCompressed * regressionTolerance) { - COVER_dictSelection_t selection = { candidateDictBuffer, dictContentSize, totalCompressedSize }; + if ((double)totalCompressedSize <= (double)largestCompressed * regressionTolerance) { free(largestDictbuffer); - return selection; + return setDictSelection( candidateDictBuffer, dictContentSize, totalCompressedSize ); } dictContentSize *= 2; } dictContentSize = largestDict; totalCompressedSize = largestCompressed; - { - COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize }; - free(candidateDictBuffer); - return selection; - } + free(candidateDictBuffer); + return setDictSelection( largestDictbuffer, dictContentSize, totalCompressedSize ); } /** * Parameters for COVER_tryParameters(). */ @@ -1060,22 +1072,23 @@ /** * Tries a set of parameters and updates the COVER_best_t with the results. * This function is thread safe if zstd is compiled with multithreaded support. * It takes its parameters as an *OWNING* opaque pointer to support threading. */ -static void COVER_tryParameters(void *opaque) { +static void COVER_tryParameters(void *opaque) +{ /* Save parameters as local variables */ - COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t *)opaque; + COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t*)opaque; const COVER_ctx_t *const ctx = data->ctx; const ZDICT_cover_params_t parameters = data->parameters; size_t dictBufferCapacity = data->dictBufferCapacity; size_t totalCompressedSize = ERROR(GENERIC); /* Allocate space for hash table, dict, and freqs */ COVER_map_t activeDmers; - BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity); + BYTE* const dict = (BYTE*)malloc(dictBufferCapacity); COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC)); - U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32)); + U32* const freqs = (U32*)malloc(ctx->suffixSize * sizeof(U32)); if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) { DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n"); goto _cleanup; } if (!dict || !freqs) { @@ -1101,18 +1114,17 @@ free(dict); COVER_best_finish(data->best, parameters, selection); free(data); COVER_map_destroy(&activeDmers); COVER_dictSelectionFree(selection); - if (freqs) { - free(freqs); - } + free(freqs); } ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( - void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, - const size_t *samplesSizes, unsigned nbSamples, - ZDICT_cover_params_t *parameters) { + void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer, + const size_t* samplesSizes, unsigned nbSamples, + ZDICT_cover_params_t* parameters) +{ /* constants */ const unsigned nbThreads = parameters->nbThreads; const double splitPoint = parameters->splitPoint <= 0.0 ? COVER_DEFAULT_SPLITPOINT : parameters->splitPoint; const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;