Lazily write the FST padding byte (#12981)

* lazily write the FST padding byte

* Also write the pad byte when there is emptyOutput

* add comment

* Add more comments
This commit is contained in:
Dzung Bui 2024-01-11 21:31:24 +09:00 committed by GitHub
parent 09837bae73
commit 701619d35a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 25 additions and 5 deletions

View File

@ -106,6 +106,9 @@ public class FSTCompiler<T> {
private final IntsRefBuilder lastInput = new IntsRefBuilder();
// indicates whether we are not yet to write the padding byte
private boolean paddingBytePending;
// NOTE: cutting this over to ArrayList instead loses ~6%
// in build performance on 9.8M Wikipedia terms; so we
// left this as an array:
@ -160,15 +163,14 @@ public class FSTCompiler<T> {
boolean allowFixedLengthArcs,
DataOutput dataOutput,
float directAddressingMaxOversizingFactor,
int version)
throws IOException {
int version) {
this.allowFixedLengthArcs = allowFixedLengthArcs;
this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor;
this.version = version;
// pad: ensure no node gets address 0 which is reserved to mean
// the stop state w/ no arcs
dataOutput.writeByte((byte) 0);
// the stop state w/ no arcs. the actual byte will be written lazily
numBytesWritten++;
paddingBytePending = true;
this.dataOutput = dataOutput;
fst =
new FST<>(
@ -340,7 +342,7 @@ public class FSTCompiler<T> {
}
/** Creates a new {@link FSTCompiler}. */
public FSTCompiler<T> build() throws IOException {
public FSTCompiler<T> build() {
// create a default DataOutput if not specified
if (dataOutput == null) {
dataOutput = getOnHeapReaderWriter(15);
@ -548,6 +550,10 @@ public class FSTCompiler<T> {
}
reverseScratchBytes();
// write the padding byte if needed
if (paddingBytePending) {
writePaddingByte();
}
scratchBytes.writeTo(dataOutput);
numBytesWritten += scratchBytes.getPosition();
@ -555,6 +561,16 @@ public class FSTCompiler<T> {
return numBytesWritten - 1;
}
/**
* Write the padding byte, ensure no node gets address 0 which is reserved to mean the stop state
* w/ no arcs
*/
private void writePaddingByte() throws IOException {
assert paddingBytePending;
dataOutput.writeByte((byte) 0);
paddingBytePending = false;
}
private void writeLabel(DataOutput out, int v) throws IOException {
assert v >= 0 : "v=" + v;
if (fst.metadata.inputType == INPUT_TYPE.BYTE1) {
@ -963,7 +979,11 @@ public class FSTCompiler<T> {
freezeTail(0);
if (root.numArcs == 0) {
if (fst.metadata.emptyOutput == null) {
// return null for completely empty FST which accepts nothing
return null;
} else {
// we haven't written the padding byte so far, but the FST is still valid
writePaddingByte();
}
}