Prevent term vectors from exceeding the maximum dictionary size. (#11726)

When indexing term vectors for a very large document, the automatic computation
of the dictionary size based on the overall size of the block might yield a
size that exceeds the maximum window size that is supported by LZ4. This commit
addresses the issue by automatically taking the minimum of the result of this
computation and the maximum window size (64kB).
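As a rough illustration (not part of the commit), the sketch below reproduces the sizing arithmetic for a 16MB block; NUM_SUB_BLOCKS = 10 and DICT_SIZE_FACTOR = 6 are assumed to mirror the constants in LZ4WithPresetDictCompressionMode, and the Math.min cap against the 64kB window is the part this change adds:

// Hypothetical stand-alone sketch of the dictionary sizing; the constants are
// assumed to mirror LZ4WithPresetDictCompressionMode rather than being the Lucene code itself.
public class DictSizeSketch {
  static final int NUM_SUB_BLOCKS = 10;    // assumed: a block is split into ~10 sub blocks
  static final int DICT_SIZE_FACTOR = 6;   // assumed: dictionary ~6x smaller than a sub block
  static final int MAX_DISTANCE = 1 << 16; // 64kB, the LZ4 window size

  public static void main(String[] args) {
    final int len = 1 << 24; // a 16MB block, like the one built by the new test below
    final int uncapped = len / (NUM_SUB_BLOCKS * DICT_SIZE_FACTOR); // 279,620 bytes: exceeds the window
    final int capped = Math.min(MAX_DISTANCE, uncapped);            // 65,536 bytes: what the fix produces
    System.out.println("uncapped=" + uncapped + ", capped=" + capped);
  }
}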
commit f8285fd0fe (parent dbffe3472b)
Adrien Grand, 2022-09-08 13:44:21 +02:00, committed by GitHub
4 changed files with 23 additions and 3 deletions

CHANGES.txt

@@ -83,6 +83,13 @@ Other
 ======================== Lucene 9.5.0 =======================
 
+Bug Fixes
+---------------------
+
+* GITHUB#11726: Indexing term vectors on large documents could fail due to
+  trying to apply a dictionary whose size is greater than the maximum supported
+  window size for LZ4. (Adrien Grand)
+
 Other
 ---------------------
 
 * LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)

LZ4WithPresetDictCompressionMode.java

@@ -170,7 +170,7 @@ public final class LZ4WithPresetDictCompressionMode extends CompressionMode {
   @Override
   public void compress(ByteBuffersDataInput buffersInput, DataOutput out) throws IOException {
     final int len = (int) (buffersInput.size() - buffersInput.position());
-    final int dictLength = len / (NUM_SUB_BLOCKS * DICT_SIZE_FACTOR);
+    final int dictLength = Math.min(LZ4.MAX_DISTANCE, len / (NUM_SUB_BLOCKS * DICT_SIZE_FACTOR));
     final int blockLength = (len - dictLength + NUM_SUB_BLOCKS - 1) / NUM_SUB_BLOCKS;
     buffer = ArrayUtil.growNoCopy(buffer, dictLength + blockLength);
     out.writeVInt(dictLength);

LZ4.java

@@ -47,9 +47,14 @@ public final class LZ4 {
 
   private LZ4() {}
 
+  /**
+   * Window size: this is the maximum supported distance between two strings so that LZ4 can replace
+   * the second one by a reference to the first one.
+   */
+  public static final int MAX_DISTANCE = 1 << 16; // maximum distance of a reference
+
   static final int MEMORY_USAGE = 14;
   static final int MIN_MATCH = 4; // minimum length of a match
-  static final int MAX_DISTANCE = 1 << 16; // maximum distance of a reference
   static final int LAST_LITERALS = 5; // the last 5 bytes must be encoded as literals
   static final int HASH_LOG_HC = 15; // log size of the dictionary for compressHC
   static final int HASH_TABLE_SIZE_HC = 1 << HASH_LOG_HC;
@@ -512,7 +517,7 @@
   /**
    * Compress {@code bytes[dictOff+dictLen:dictOff+dictLen+len]} into {@code out} using at most 16kB
    * of memory. {@code bytes[dictOff:dictOff+dictLen]} will be used as a dictionary. {@code dictLen}
-   * must not be greater than 64kB, the maximum window size.
+   * must not be greater than {@link LZ4#MAX_DISTANCE 64kB}, the maximum window size.
    *
    * <p>{@code ht} shouldn't be shared across threads but can safely be reused.
    */

AbstractTestCompressionMode.java

@@ -154,4 +154,12 @@ public abstract class AbstractTestCompressionMode extends LuceneTestCase {
     Arrays.fill(decompressed, (byte) random().nextInt());
     test(decompressed);
   }
+
+  public void testExtremelyLargeInput() throws IOException {
+    final byte[] decompressed = new byte[1 << 24]; // 16MB
+    for (int i = 0; i < decompressed.length; ++i) {
+      decompressed[i] = (byte) (i & 0x0F);
+    }
+    test(decompressed);
+  }
 }