From 255925c231bbceddd45416eeea080792d02654fe Mon Sep 17 00:00:00 2001 From: sen Date: Wed, 31 Mar 2021 15:14:59 -0400 Subject: [PATCH] Fix repcode-related OSS-fuzz issues in block splitter (#2560) * Do not emit last partitions of blocks as RLE/uncompressed * Fix repcode updates within block splitter * Add a entropytables confirm function, redo ZSTD_confirmRepcodesAndEntropyTables() for better function signature * Add a repcode updater to block splitter, no longer need to force emit compressed blocks --- lib/compress/zstd_compress.c | 127 ++++++++++++++++++----------------- 1 file changed, 66 insertions(+), 61 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 4f547aab256..27ef756d7fb 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -2483,7 +2483,7 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, void* dst, size_t dstCapacity, size_t srcSize, void* entropyWorkspace, size_t entropyWkspSize, - int bmi2, U32 const canEmitUncompressed) + int bmi2) { size_t const cSize = ZSTD_entropyCompressSeqStore_internal( seqStorePtr, prevEntropy, nextEntropy, cctxParams, @@ -2493,15 +2493,13 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. */ - if (canEmitUncompressed) { - if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) - return 0; /* block not compressed */ - FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); - - /* Check compressibility */ - { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); - if (cSize >= maxCSize) return 0; /* block not compressed */ - } + if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) + return 0; /* block not compressed */ + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); + + /* Check compressibility */ + { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); + if (cSize >= maxCSize) return 0; /* block not compressed */ } DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); return cSize; @@ -2793,11 +2791,11 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore) return nbSeqs < 4 && nbLits < 10; } -static void ZSTD_confirmRepcodesAndEntropyTables(ZSTD_CCtx* zc) +static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) { - ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock; - zc->blockState.prevCBlock = zc->blockState.nextCBlock; - zc->blockState.nextCBlock = tmp; + ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; + bs->prevCBlock = bs->nextCBlock; + bs->nextCBlock = tmp; } /* Writes the block header */ @@ -3180,6 +3178,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, resultSeqStore->ofCode += startIdx; } +/** + * ZSTD_seqStore_updateRepcodes(): Starting from an array of initial repcodes and a seqStore, + * construct the final repcodes at the conclusion of compressing the seqStore, stored in dstRep. + */ +static void ZSTD_seqStore_updateRepcodes(U32 dstRep[ZSTD_REP_NUM], + const U32 initialRep[ZSTD_REP_NUM], + const seqStore_t* const seqStore, U32 const nbSeq) { + repcodes_t updatedRepcodes; + U32 idx = 0; + ZSTD_memcpy(updatedRepcodes.rep, initialRep, sizeof(repcodes_t)); + for (; idx < nbSeq; ++idx) { + seqDef const seq = seqStore->sequencesStart[idx]; + U32 const ll0 = (seq.litLength == 0); + U32 const offCode = seq.offset - 1; + assert(seq.offset >= 1); /* seqDef::offset == offCode+1, and ZSTD_updateRep() expects an offCode */ + updatedRepcodes = ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); + } + ZSTD_memcpy(dstRep, updatedRepcodes.rep, sizeof(repcodes_t)); +} + /* ZSTD_compressSeqStore_singleBlock(): * Compresses a seqStore into a block with a block header, into the buffer dst. * @@ -3188,7 +3206,7 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, static size_t ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* seqStore, void* dst, size_t dstCapacity, const void* src, size_t srcSize, - U32 lastBlock, U32 canEmitRLEorNoCompress) { + U32 lastBlock, U32 isPartition) { const U32 rleMaxLength = 25; BYTE* op = (BYTE*)dst; const BYTE* ip = (const BYTE*)src; @@ -3199,13 +3217,12 @@ static size_t ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* seqSt op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize, srcSize, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, - zc->bmi2, canEmitRLEorNoCompress); + zc->bmi2); FORWARD_IF_ERROR(cSeqsSize, "ZSTD_entropyCompressSeqStore failed!"); if (!zc->isFirstBlock && cSeqsSize < rleMaxLength && - ZSTD_isRLE((BYTE const*)src, srcSize)&& - canEmitRLEorNoCompress) { + ZSTD_isRLE((BYTE const*)src, srcSize)) { /* We don't want to emit our first block as a RLE even if it qualifies because * doing so will cause the decoder (cli only) to throw a "should consume all input error." * This is only an issue for zstd <= v1.4.3 @@ -3215,24 +3232,30 @@ static size_t ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* seqSt if (zc->seqCollector.collectSequences) { ZSTD_copyBlockSequences(zc); - ZSTD_confirmRepcodesAndEntropyTables(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); return 0; } if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; - if (cSeqsSize == 0 && canEmitRLEorNoCompress) { + if (cSeqsSize == 0) { cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); FORWARD_IF_ERROR(cSize, "Nocompress block failed"); DEBUGLOG(4, "Writing out nocompress block, size: %zu", cSize); - } else if (cSeqsSize == 1 && canEmitRLEorNoCompress) { + } else if (cSeqsSize == 1) { cSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, srcSize, lastBlock); FORWARD_IF_ERROR(cSize, "RLE compress block failed"); DEBUGLOG(4, "Writing out RLE block, size: %zu", cSize); } else { - /* Error checking and repcodes update */ - ZSTD_confirmRepcodesAndEntropyTables(zc); + if (isPartition) { + /* We manually update repcodes if we are currently compressing a partition. Otherwise, + * for non-split blocks, the repcodes are already correct as-is. + */ + ZSTD_seqStore_updateRepcodes(zc->blockState.nextCBlock->rep, zc->blockState.prevCBlock->rep, + seqStore, (U32)(seqStore->sequences - seqStore->sequencesStart)); + } + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); writeBlockHeader(op, cSeqsSize, srcSize, lastBlock); cSize = ZSTD_blockHeaderSize + cSeqsSize; DEBUGLOG(4, "Writing out compressed block, size: %zu", cSize); @@ -3310,18 +3333,6 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) return splits.idx; } -/* Return 1 if if the first three sequences of seqstore/block use repcodes */ -static U32 ZSTD_seqStore_firstThreeContainRepcodes(const seqStore_t* const seqStore) { - U32 const seqLimit = MIN((U32)(seqStore->sequences - seqStore->sequencesStart), ZSTD_REP_NUM); - U32 seqIdx = 0; - for (; seqIdx < seqLimit; ++seqIdx) { - if (seqStore->sequencesStart[seqIdx].offset <= ZSTD_REP_MOVE) { - return 1; - } - } - return 0; -} - /* ZSTD_compressBlock_splitBlock(): * Attempts to split a given block into multiple blocks to improve compression ratio. * @@ -3338,17 +3349,19 @@ static size_t ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, s size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); seqStore_t nextSeqStore; seqStore_t currSeqStore; - U32 canEmitRLEorNoCompress = 1; - const size_t dstCapacityInitial = dstCapacity; DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, (unsigned)zc->blockState.matchState.nextToUpdate); if (numSplits == 0) { - size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, op, dstCapacity, ip, blockSize, lastBlock, 1); + size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, + op, dstCapacity, + ip, blockSize, + lastBlock, 0 /* isPartition */); FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); + assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); return cSizeSingleBlock; } @@ -3356,41 +3369,33 @@ static size_t ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, s for (i = 0; i <= numSplits; ++i) { size_t srcBytes; size_t cSizeChunk; - U32 lastBlockActual; + U32 const lastPartition = (i == numSplits); + U32 lastBlockEntireSrc = 0; srcBytes = ZSTD_countSeqStoreLiteralsBytes(&currSeqStore) + ZSTD_countSeqStoreMatchBytes(&currSeqStore); - lastBlockActual = lastBlock && (i == numSplits); srcBytesTotal += srcBytes; - if (i == numSplits) { + if (lastPartition) { /* This is the final partition, need to account for possible last literals */ srcBytes += blockSize - srcBytesTotal; + lastBlockEntireSrc = lastBlock; } else { ZSTD_deriveSeqStoreChunk(&nextSeqStore, &zc->seqStore, partitions[i], partitions[i+1]); - if (ZSTD_seqStore_firstThreeContainRepcodes(&nextSeqStore)) { - DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: Next block contains rep in first three seqs!"); - canEmitRLEorNoCompress = 0; - } } - cSizeChunk = ZSTD_compressSeqStore_singleBlock(zc, &currSeqStore, op, dstCapacity, ip, srcBytes, lastBlockActual, canEmitRLEorNoCompress); + cSizeChunk = ZSTD_compressSeqStore_singleBlock(zc, &currSeqStore, + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc, 1 /* isPartition */); DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(&currSeqStore, zc), cSizeChunk); FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); - ZSTD_memcpy(zc->blockState.nextCBlock->rep, zc->blockState.prevCBlock->rep, sizeof(U32)*ZSTD_REP_NUM); ip += srcBytes; op += cSizeChunk; dstCapacity -= cSizeChunk; cSize += cSizeChunk; currSeqStore = nextSeqStore; + assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); } - - if (cSize > ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize) { - /* If too large, recompress the original block to avoid any chance of a single block exceeding ZSTD_BLOCKSIZE_MAX */ - cSize = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, (BYTE*)dst, dstCapacityInitial, (const BYTE*)src, blockSize, lastBlock, 1); - FORWARD_IF_ERROR(cSize, "Compressing single block from splitBlock_internal() fallback failed!"); - DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: Compressed split block too large, recompressed"); - } - assert(cSize <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); return cSize; } @@ -3445,7 +3450,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, if (zc->seqCollector.collectSequences) { ZSTD_copyBlockSequences(zc); - ZSTD_confirmRepcodesAndEntropyTables(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); return 0; } @@ -3456,7 +3461,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, dst, dstCapacity, srcSize, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, - zc->bmi2, 1 /* Can emit uncompressed blocks */); + zc->bmi2); if (zc->seqCollector.collectSequences) { ZSTD_copyBlockSequences(zc); @@ -3479,7 +3484,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, out: if (!ZSTD_isError(cSize) && cSize > 1) { - ZSTD_confirmRepcodesAndEntropyTables(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); } /* We check that dictionaries have offset codes available for the first * block. After the first block, the offcode table might not have large @@ -3532,7 +3537,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { - ZSTD_confirmRepcodesAndEntropyTables(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); return cSize; } } @@ -5665,7 +5670,7 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, blockSize, cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, - cctx->bmi2, 1 /* Can emit uncompressed blocks */); + cctx->bmi2); FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize); @@ -5691,7 +5696,7 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, } else { U32 cBlockHeader; /* Error checking and repcodes update */ - ZSTD_confirmRepcodesAndEntropyTables(cctx); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockState); if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;