diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 8c43730f205..7fc9d55f926 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -83,6 +83,13 @@ Other ======================== Lucene 9.5.0 ======================= +Bug Fixes +--------------------- + +* GITHUB#11726: Indexing term vectors on large documents could fail due to + trying to apply a dictionary whose size is greater than the maximum supported + window size for LZ4. (Adrien Grand) + Other --------------------- * LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java index f1d406e60a5..6f28ca3d0a6 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java @@ -170,7 +170,7 @@ public final class LZ4WithPresetDictCompressionMode extends CompressionMode { @Override public void compress(ByteBuffersDataInput buffersInput, DataOutput out) throws IOException { final int len = (int) (buffersInput.size() - buffersInput.position()); - final int dictLength = len / (NUM_SUB_BLOCKS * DICT_SIZE_FACTOR); + final int dictLength = Math.min(LZ4.MAX_DISTANCE, len / (NUM_SUB_BLOCKS * DICT_SIZE_FACTOR)); final int blockLength = (len - dictLength + NUM_SUB_BLOCKS - 1) / NUM_SUB_BLOCKS; buffer = ArrayUtil.growNoCopy(buffer, dictLength + blockLength); out.writeVInt(dictLength); diff --git a/lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java b/lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java index 0deb228356d..67bbdc96ab2 100644 --- a/lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java +++ b/lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java @@ -47,9 +47,14 @@ public final class LZ4 { private LZ4() {} 
+ /** + * Window size: this is the maximum supported distance between two strings so that LZ4 can replace + * the second one by a reference to the first one. + */ + public static final int MAX_DISTANCE = 1 << 16; // maximum distance of a reference + static final int MEMORY_USAGE = 14; static final int MIN_MATCH = 4; // minimum length of a match - static final int MAX_DISTANCE = 1 << 16; // maximum distance of a reference static final int LAST_LITERALS = 5; // the last 5 bytes must be encoded as literals static final int HASH_LOG_HC = 15; // log size of the dictionary for compressHC static final int HASH_TABLE_SIZE_HC = 1 << HASH_LOG_HC; @@ -512,7 +517,7 @@ public final class LZ4 { /** * Compress {@code bytes[dictOff+dictLen:dictOff+dictLen+len]} into {@code out} using at most 16kB * of memory. {@code bytes[dictOff:dictOff+dictLen]} will be used as a dictionary. {@code dictLen} - * must not be greater than 64kB, the maximum window size. + * must not be greater than {@link LZ4#MAX_DISTANCE 64kB}, the maximum window size. * *

<p>{@code ht} shouldn't be shared across threads but can safely be reused. */ diff --git a/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java b/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java index 9febfa97928..366f74730fc 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java @@ -154,4 +154,12 @@ public abstract class AbstractTestCompressionMode extends LuceneTestCase { Arrays.fill(decompressed, (byte) random().nextInt()); test(decompressed); } + + public void testExtremelyLargeInput() throws IOException { + final byte[] decompressed = new byte[1 << 24]; // 16MB + for (int i = 0; i < decompressed.length; ++i) { + decompressed[i] = (byte) (i & 0x0F); + } + test(decompressed); + } }