Reduce FST block size for BlockTreeTermsWriter (#12604)

This commit is contained in:
gf2121 2023-10-04 01:58:56 -05:00 committed by GitHub
parent 75da33836b
commit 96052891e6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 20 additions and 0 deletions

View File

@ -167,6 +167,9 @@ Optimizations
* GITHUB#12591: Use stable radix sort to speed up the sorting of update terms. (Guo Feng)
* GITHUB#12604: Estimate the block size of FST BytesStore in BlockTreeTermsWriter
to reduce GC load during indexing. (Guo Feng)
Changes in runtime behavior
---------------------

View File

@ -52,6 +52,7 @@ import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.packed.PackedInts;
/*
TODO:
@ -490,10 +491,22 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer {
}
}
long estimateSize = prefix.length;
for (PendingBlock block : blocks) {
if (block.subIndices != null) {
for (FST<BytesRef> subIndex : block.subIndices) {
estimateSize += subIndex.numBytes();
}
}
}
int estimateBitsRequired = PackedInts.bitsRequired(estimateSize);
int pageBits = Math.min(15, Math.max(6, estimateBitsRequired));
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
final FSTCompiler<BytesRef> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
.shouldShareNonSingletonNodes(false)
.bytesPageBits(pageBits)
.build();
// if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);

View File

@ -520,6 +520,10 @@ public final class FST<T> implements Accountable {
bytes.finish();
}
/**
 * Reports the current position of this FST's byte storage, i.e. the value of
 * {@code bytes.getPosition()}.
 *
 * @return the position reported by the underlying {@code bytes} store
 */
public long numBytes() {
  final long position = bytes.getPosition();
  return position;
}
/**
 * Accessor for the {@code emptyOutput} field of this FST.
 *
 * @return the current value of {@code emptyOutput}, returned as-is
 */
public T getEmptyOutput() {
  final T output = emptyOutput;
  return output;
}