From 701619d35a289a121cad9c9ec136246f5144fec7 Mon Sep 17 00:00:00 2001 From: Dzung Bui Date: Thu, 11 Jan 2024 21:31:24 +0900 Subject: [PATCH] Lazily write the FST padding byte (#12981) * lazily write the FST padding byte * Also write the pad byte when there is emptyOutput * add comment * Add more comments --- .../apache/lucene/util/fst/FSTCompiler.java | 30 +++++++++++++++---- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index 13ca4f6a14e..eaf26bb91eb 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -106,6 +106,9 @@ public class FSTCompiler { private final IntsRefBuilder lastInput = new IntsRefBuilder(); + // indicates whether the padding byte has not been written yet + private boolean paddingBytePending; + // NOTE: cutting this over to ArrayList instead loses ~6% // in build performance on 9.8M Wikipedia terms; so we // left this as an array: @@ -160,15 +163,14 @@ public class FSTCompiler { boolean allowFixedLengthArcs, DataOutput dataOutput, float directAddressingMaxOversizingFactor, - int version) - throws IOException { + int version) { this.allowFixedLengthArcs = allowFixedLengthArcs; this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor; this.version = version; // pad: ensure no node gets address 0 which is reserved to mean - // the stop state w/ no arcs - dataOutput.writeByte((byte) 0); + // the stop state w/ no arcs. The actual byte will be written lazily numBytesWritten++; + paddingBytePending = true; this.dataOutput = dataOutput; fst = new FST<>( @@ -340,7 +342,7 @@ public class FSTCompiler { } /** Creates a new {@link FSTCompiler}. 
*/ - public FSTCompiler build() throws IOException { + public FSTCompiler build() { // create a default DataOutput if not specified if (dataOutput == null) { dataOutput = getOnHeapReaderWriter(15); @@ -548,6 +550,10 @@ public class FSTCompiler { } reverseScratchBytes(); + // write the padding byte if needed + if (paddingBytePending) { + writePaddingByte(); + } scratchBytes.writeTo(dataOutput); numBytesWritten += scratchBytes.getPosition(); @@ -555,6 +561,16 @@ public class FSTCompiler { return numBytesWritten - 1; } + /** + * Writes the padding byte so that no node gets address 0, which is reserved to mean the stop + * state w/ no arcs. + */ + private void writePaddingByte() throws IOException { + assert paddingBytePending; + dataOutput.writeByte((byte) 0); + paddingBytePending = false; + } + private void writeLabel(DataOutput out, int v) throws IOException { assert v >= 0 : "v=" + v; if (fst.metadata.inputType == INPUT_TYPE.BYTE1) { @@ -963,7 +979,11 @@ public class FSTCompiler { freezeTail(0); if (root.numArcs == 0) { if (fst.metadata.emptyOutput == null) { + // return null for completely empty FST which accepts nothing return null; + } else { + // we haven't written the padding byte so far, but the FST is still valid + writePaddingByte(); } }