Lazily write the FST padding byte (#12981)

* lazily write the FST padding byte

* Also write the pad byte when there is emptyOutput

* add comment

* Add more comments
This commit is contained in:
Dzung Bui 2024-01-11 21:31:24 +09:00 committed by GitHub
parent 09837bae73
commit 701619d35a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 25 additions and 5 deletions

View File

@ -106,6 +106,9 @@ public class FSTCompiler<T> {
private final IntsRefBuilder lastInput = new IntsRefBuilder(); private final IntsRefBuilder lastInput = new IntsRefBuilder();
// indicates whether the padding byte has not been written yet
private boolean paddingBytePending;
// NOTE: cutting this over to ArrayList instead loses ~6% // NOTE: cutting this over to ArrayList instead loses ~6%
// in build performance on 9.8M Wikipedia terms; so we // in build performance on 9.8M Wikipedia terms; so we
// left this as an array: // left this as an array:
@ -160,15 +163,14 @@ public class FSTCompiler<T> {
boolean allowFixedLengthArcs, boolean allowFixedLengthArcs,
DataOutput dataOutput, DataOutput dataOutput,
float directAddressingMaxOversizingFactor, float directAddressingMaxOversizingFactor,
int version) int version) {
throws IOException {
this.allowFixedLengthArcs = allowFixedLengthArcs; this.allowFixedLengthArcs = allowFixedLengthArcs;
this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor; this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor;
this.version = version; this.version = version;
// pad: ensure no node gets address 0 which is reserved to mean // pad: ensure no node gets address 0 which is reserved to mean
// the stop state w/ no arcs // the stop state w/ no arcs. the actual byte will be written lazily
dataOutput.writeByte((byte) 0);
numBytesWritten++; numBytesWritten++;
paddingBytePending = true;
this.dataOutput = dataOutput; this.dataOutput = dataOutput;
fst = fst =
new FST<>( new FST<>(
@ -340,7 +342,7 @@ public class FSTCompiler<T> {
} }
/** Creates a new {@link FSTCompiler}. */ /** Creates a new {@link FSTCompiler}. */
public FSTCompiler<T> build() throws IOException { public FSTCompiler<T> build() {
// create a default DataOutput if not specified // create a default DataOutput if not specified
if (dataOutput == null) { if (dataOutput == null) {
dataOutput = getOnHeapReaderWriter(15); dataOutput = getOnHeapReaderWriter(15);
@ -548,6 +550,10 @@ public class FSTCompiler<T> {
} }
reverseScratchBytes(); reverseScratchBytes();
// write the padding byte if needed
if (paddingBytePending) {
writePaddingByte();
}
scratchBytes.writeTo(dataOutput); scratchBytes.writeTo(dataOutput);
numBytesWritten += scratchBytes.getPosition(); numBytesWritten += scratchBytes.getPosition();
@ -555,6 +561,16 @@ public class FSTCompiler<T> {
return numBytesWritten - 1; return numBytesWritten - 1;
} }
/**
 * Emits the deferred padding byte to {@code dataOutput}. The pad ensures that no node gets
 * address 0, which is reserved to mean the stop state w/ no arcs.
 *
 * <p>Must only be called while the padding byte is still pending (checked by an assertion); the
 * pending flag is cleared once the byte has been written.
 *
 * @throws IOException if writing to {@code dataOutput} fails
 */
private void writePaddingByte() throws IOException {
  assert paddingBytePending;
  final byte pad = 0;
  dataOutput.writeByte(pad);
  // numBytesWritten is intentionally left alone here: the pad byte was already counted at the
  // point where the pending flag was set
  paddingBytePending = false;
}
private void writeLabel(DataOutput out, int v) throws IOException { private void writeLabel(DataOutput out, int v) throws IOException {
assert v >= 0 : "v=" + v; assert v >= 0 : "v=" + v;
if (fst.metadata.inputType == INPUT_TYPE.BYTE1) { if (fst.metadata.inputType == INPUT_TYPE.BYTE1) {
@ -963,7 +979,11 @@ public class FSTCompiler<T> {
freezeTail(0); freezeTail(0);
if (root.numArcs == 0) { if (root.numArcs == 0) {
if (fst.metadata.emptyOutput == null) { if (fst.metadata.emptyOutput == null) {
// return null for completely empty FST which accepts nothing
return null; return null;
} else {
// we haven't written the padding byte so far, but the FST is still valid
writePaddingByte();
} }
} }