ext/zstdruby/libzstd/dictBuilder/cover.c in zstd-ruby-1.3.8.0 vs ext/zstdruby/libzstd/dictBuilder/cover.c in zstd-ruby-1.4.0.0
- old
+ new
@@ -389,11 +389,11 @@
* Let F(d) be the frequency of dmer d.
* Let S_i be the dmer at position i of segment S which has length k.
*
* Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
*
- * Once the dmer d is in the dictionay we set F(d) = 0.
+ * Once the dmer d is in the dictionary we set F(d) = 0.
*/
static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
COVER_map_t *activeDmers, U32 begin,
U32 end,
ZDICT_cover_params_t parameters) {
@@ -433,11 +433,11 @@
if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
U32 delDmer = ctx->dmerAt[activeSegment.begin];
U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
activeSegment.begin += 1;
*delDmerOcc -= 1;
- /* If this is the last occurence of the dmer, subtract its score */
+ /* If this is the last occurrence of the dmer, subtract its score */
if (*delDmerOcc == 0) {
COVER_map_remove(activeDmers, delDmer);
activeSegment.score -= freqs[delDmer];
}
}
@@ -625,41 +625,80 @@
ctx->freqs = ctx->suffix;
ctx->suffix = NULL;
return 1;
}
+void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
+{
+ const double ratio = (double)nbDmers / maxDictSize;
+ if (ratio >= 10) {
+ return;
+ }
+ LOCALDISPLAYLEVEL(displayLevel, 1,
+ "WARNING: The maximum dictionary size %u is too large "
+ "compared to the source size %u! "
+ "size(source)/size(dictionary) = %f, but it should be >= "
+ "10! This may lead to a subpar dictionary! We recommend "
+ "training on sources at least 10x, and up to 100x the "
+ "size of the dictionary!\n", (U32)maxDictSize,
+ (U32)nbDmers, ratio);
+}
+
+COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
+ U32 nbDmers, U32 k, U32 passes)
+{
+ const U32 minEpochSize = k * 10;
+ COVER_epoch_info_t epochs;
+ epochs.num = MAX(1, maxDictSize / k / passes);
+ epochs.size = nbDmers / epochs.num;
+ if (epochs.size >= minEpochSize) {
+ assert(epochs.size * epochs.num <= nbDmers);
+ return epochs;
+ }
+ epochs.size = MIN(minEpochSize, nbDmers);
+ epochs.num = nbDmers / epochs.size;
+ assert(epochs.size * epochs.num <= nbDmers);
+ return epochs;
+}
+
/**
* Given the prepared context build the dictionary.
*/
static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
COVER_map_t *activeDmers, void *dictBuffer,
size_t dictBufferCapacity,
ZDICT_cover_params_t parameters) {
BYTE *const dict = (BYTE *)dictBuffer;
size_t tail = dictBufferCapacity;
- /* Divide the data up into epochs of equal size.
- * We will select at least one segment from each epoch.
- */
- const unsigned epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k / 4));
- const unsigned epochSize = (U32)(ctx->suffixSize / epochs);
+ /* Divide the data into epochs. We will select one segment from each epoch. */
+ const COVER_epoch_info_t epochs = COVER_computeEpochs(
+ (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
+ const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
+ size_t zeroScoreRun = 0;
size_t epoch;
DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
- epochs, epochSize);
+ (U32)epochs.num, (U32)epochs.size);
/* Loop through the epochs until there are no more segments or the dictionary
* is full.
*/
- for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
- const U32 epochBegin = (U32)(epoch * epochSize);
- const U32 epochEnd = epochBegin + epochSize;
+ for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
+ const U32 epochBegin = (U32)(epoch * epochs.size);
+ const U32 epochEnd = epochBegin + epochs.size;
size_t segmentSize;
/* Select a segment */
COVER_segment_t segment = COVER_selectSegment(
ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
- /* If the segment covers no dmers, then we are out of content */
+ /* If the segment covers no dmers, then we are out of content.
+ * There may be new content in other epochs, for continue for some time.
+ */
if (segment.score == 0) {
- break;
+ if (++zeroScoreRun >= maxZeroScoreRun) {
+ break;
+ }
+ continue;
}
+ zeroScoreRun = 0;
/* Trim the segment if necessary and if it is too small then we are done */
segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
if (segmentSize < parameters.d) {
break;
}
@@ -704,10 +743,11 @@
/* Initialize context and activeDmers */
if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
parameters.d, parameters.splitPoint)) {
return ERROR(GENERIC);
}
+ COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
COVER_ctx_destroy(&ctx);
return ERROR(GENERIC);
}
@@ -975,10 +1015,11 @@
unsigned iteration = 1;
unsigned d;
unsigned k;
COVER_best_t best;
POOL_ctx *pool = NULL;
+ int warned = 0;
/* Checks */
if (splitPoint <= 0 || splitPoint > 1) {
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
return ERROR(GENERIC);
@@ -1016,9 +1057,13 @@
if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint)) {
LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
COVER_best_destroy(&best);
POOL_free(pool);
return ERROR(GENERIC);
+ }
+ if (!warned) {
+ COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
+ warned = 1;
}
/* Loop through k reusing the same context */
for (k = kMinK; k <= kMaxK; k += kStepSize) {
/* Prepare the arguments */
COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)malloc(