Prevent term vectors from exceeding the maximum dictionary size. (#11726)

When indexing term vectors for a very large document, the automatic computation
of the dictionary size based on the overall size of the block might yield a
size that exceeds the maximum window size that is supported by LZ4. This commit
addresses the issue by automatically taking the minimum of the result of this
computation and the maximum window size (64kB).
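As a rough illustration (not part of the commit), the sketch below reproduces the sizing arithmetic for a 16MB block; NUM_SUB_BLOCKS = 10 and DICT_SIZE_FACTOR = 6 are assumed to mirror the constants in LZ4WithPresetDictCompressionMode, and the Math.min cap against the 64kB window is the part this change adds:

// Hypothetical stand-alone sketch of the dictionary sizing; the constants are
// assumed to mirror LZ4WithPresetDictCompressionMode rather than being the Lucene code itself.
public class DictSizeSketch {
  static final int NUM_SUB_BLOCKS = 10;    // assumed: a block is split into ~10 sub blocks
  static final int DICT_SIZE_FACTOR = 6;   // assumed: dictionary ~6x smaller than a sub block
  static final int MAX_DISTANCE = 1 << 16; // 64kB, the LZ4 window size

  public static void main(String[] args) {
    final int len = 1 << 24; // a 16MB block, like the one built by the new test below
    final int uncapped = len / (NUM_SUB_BLOCKS * DICT_SIZE_FACTOR); // 279,620 bytes: exceeds the window
    final int capped = Math.min(MAX_DISTANCE, uncapped);            // 65,536 bytes: what the fix produces
    System.out.println("uncapped=" + uncapped + ", capped=" + capped);
  }
}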
commit f8285fd0fe (parent dbffe3472b)
Adrien Grand, 2022-09-08 13:44:21 +02:00, committed by GitHub
4 changed files with 23 additions and 3 deletions

CHANGES.txt

@@ -83,6 +83,13 @@ Other
 ======================== Lucene 9.5.0 =======================
 
+Bug Fixes
+---------------------
+
+* GITHUB#11726: Indexing term vectors on large documents could fail due to
+  trying to apply a dictionary whose size is greater than the maximum supported
+  window size for LZ4. (Adrien Grand)
+
 Other
 ---------------------
 
 * LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)

LZ4WithPresetDictCompressionMode.java

@@ -170,7 +170,7 @@ public final class LZ4WithPresetDictCompressionMode extends CompressionMode {
   @Override
   public void compress(ByteBuffersDataInput buffersInput, DataOutput out) throws IOException {
     final int len = (int) (buffersInput.size() - buffersInput.position());
-    final int dictLength = len / (NUM_SUB_BLOCKS * DICT_SIZE_FACTOR);
+    final int dictLength = Math.min(LZ4.MAX_DISTANCE, len / (NUM_SUB_BLOCKS * DICT_SIZE_FACTOR));
     final int blockLength = (len - dictLength + NUM_SUB_BLOCKS - 1) / NUM_SUB_BLOCKS;
     buffer = ArrayUtil.growNoCopy(buffer, dictLength + blockLength);
     out.writeVInt(dictLength);

LZ4.java

@@ -47,9 +47,14 @@ public final class LZ4 {
 
   private LZ4() {}
 
+  /**
+   * Window size: this is the maximum supported distance between two strings so that LZ4 can replace
+   * the second one by a reference to the first one.
+   */
+  public static final int MAX_DISTANCE = 1 << 16; // maximum distance of a reference
+
   static final int MEMORY_USAGE = 14;
   static final int MIN_MATCH = 4; // minimum length of a match
-  static final int MAX_DISTANCE = 1 << 16; // maximum distance of a reference
   static final int LAST_LITERALS = 5; // the last 5 bytes must be encoded as literals
   static final int HASH_LOG_HC = 15; // log size of the dictionary for compressHC
   static final int HASH_TABLE_SIZE_HC = 1 << HASH_LOG_HC;
@@ -512,7 +517,7 @@
   /**
    * Compress {@code bytes[dictOff+dictLen:dictOff+dictLen+len]} into {@code out} using at most 16kB
    * of memory. {@code bytes[dictOff:dictOff+dictLen]} will be used as a dictionary. {@code dictLen}
-   * must not be greater than 64kB, the maximum window size.
+   * must not be greater than {@link LZ4#MAX_DISTANCE 64kB}, the maximum window size.
    *
    * <p>{@code ht} shouldn't be shared across threads but can safely be reused.
    */

AbstractTestCompressionMode.java

@@ -154,4 +154,12 @@ public abstract class AbstractTestCompressionMode extends LuceneTestCase {
     Arrays.fill(decompressed, (byte) random().nextInt());
     test(decompressed);
   }
+
+  public void testExtremelyLargeInput() throws IOException {
+    final byte[] decompressed = new byte[1 << 24]; // 16MB
+    for (int i = 0; i < decompressed.length; ++i) {
+      decompressed[i] = (byte) (i & 0x0F);
+    }
+    test(decompressed);
+  }
 }