Reduce FST block size for BlockTreeTermsWriter (#12604)

2025-02-28 21:39:25 +00:00 · 2023-10-04 01:58:56 -05:00 · 2023-10-04 01:58:56 -05:00 · 96052891e6
commit 96052891e6
parent 75da33836b
3 changed files with 20 additions and 0 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -167,6 +167,9 @@ Optimizations
  
 * GITHUB#12591: Use stable radix sort to speed up the sorting of update terms. (Guo Feng)

+* GITHUB#12604: Estimate the block size of FST BytesStore in BlockTreeTermsWriter
+  to reduce GC load during indexing. (Guo Feng)
+
 Changes in runtime behavior
 ---------------------

--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java
@@ -52,6 +52,7 @@ import org.apache.lucene.util.fst.BytesRefFSTEnum;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.Util;
+import org.apache.lucene.util.packed.PackedInts;

 /*
  TODO:
@@ -490,10 +491,22 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer {
        }
      }

+      long estimateSize = prefix.length;
+      for (PendingBlock block : blocks) {
+        if (block.subIndices != null) {
+          for (FST<BytesRef> subIndex : block.subIndices) {
+            estimateSize += subIndex.numBytes();
+          }
+        }
+      }
+      int estimateBitsRequired = PackedInts.bitsRequired(estimateSize);
+      int pageBits = Math.min(15, Math.max(6, estimateBitsRequired));
+
      final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
      final FSTCompiler<BytesRef> fstCompiler =
          new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
              .shouldShareNonSingletonNodes(false)
+              .bytesPageBits(pageBits)
              .build();
      // if (DEBUG) {
      //  System.out.println("  compile index for prefix=" + prefix);
--- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
@@ -520,6 +520,10 @@ public final class FST<T> implements Accountable {
    bytes.finish();
  }

+  public long numBytes() { // size accessor: reports the position of this FST's `bytes` store
+    return bytes.getPosition(); // NOTE(review): presumably bytes written so far - confirm
+  }
+
  public T getEmptyOutput() {
    return emptyOutput;
  }