Consolidate FSTStore and BytesStore in FST (#12709)

* Remove direct dependency of NodeHash to FST

* Fix index out of bounds when writing FST to different metaOut (#12697)

* Tidify code

* Update CHANGES.txt

* Re-add assertion

* Remove direct dependency of NodeHash to FST

* Hold off the FSTTraversal changes

* Rename variable

* Add Javadoc

* Add @Override

* tidy

* tidy

* Change to FSTReader

* Update CHANGES.txt
This commit is contained in:
Dzung Bui 2023-10-26 10:26:27 +09:00 committed by GitHub
parent 01acb1c37b
commit 12fc7bf49f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 86 additions and 63 deletions

View File

@ -62,6 +62,9 @@ API Changes
* GITHUB#12599: Add RandomAccessInput#readBytes method to the RandomAccessInput interface. (Ignacio Vera) * GITHUB#12599: Add RandomAccessInput#readBytes method to the RandomAccessInput interface. (Ignacio Vera)
* GITHUB#12709 Consolidate FSTStore and BytesStore in FST. Created FSTReader which contains the common methods
of the two (Anh Dung Bui)
New Features New Features
--------------------- ---------------------
@ -150,7 +153,8 @@ API Changes
* GITHUB#12592: Add RandomAccessInput#length method to the RandomAccessInput interface. In addition deprecate * GITHUB#12592: Add RandomAccessInput#length method to the RandomAccessInput interface. In addition deprecate
ByteBuffersDataInput#size in favour of this new method. (Ignacio Vera) ByteBuffersDataInput#size in favour of this new method. (Ignacio Vera)
* GITHUB#12646: Move FST#addNode to FSTCompiler to avoid a circular dependency between FST and FSTCompiler * GITHUB#12646, GITHUB#12690: Move FST#addNode to FSTCompiler to avoid a circular dependency
between FST and FSTCompiler (Anh Dung Bui)
New Features New Features
--------------------- ---------------------
@ -200,7 +204,7 @@ Optimizations
* GITHUB#12382: Faster top-level conjunctions on term queries when sorting by * GITHUB#12382: Faster top-level conjunctions on term queries when sorting by
descending score. (Adrien Grand) descending score. (Adrien Grand)
* GITHUB#12591: Use stable radix sort to speed up the sorting of update terms. (Guo Feng) * GITHUB#12591: Use stable radix sort to speed up the sorting of update terms. (Guo Feng)
* GITHUB#12587: Use radix sort to speed up the sorting of terms in TermInSetQuery. (Guo Feng) * GITHUB#12587: Use radix sort to speed up the sorting of terms in TermInSetQuery. (Guo Feng)
@ -215,7 +219,7 @@ Optimizations
* GITHUB#12668: ImpactsEnums now decode frequencies lazily like PostingsEnums. * GITHUB#12668: ImpactsEnums now decode frequencies lazily like PostingsEnums.
(Adrien Grand) (Adrien Grand)
* GITHUB#12651: Use 2d array for OnHeapHnswGraph representation. (Patrick Zhai) * GITHUB#12651: Use 2d array for OnHeapHnswGraph representation. (Patrick Zhai)
* GITHUB#12653: Optimize computing number of levels in MultiLevelSkipListWriter#bufferSkip. (Shubham Chaudhary) * GITHUB#12653: Optimize computing number of levels in MultiLevelSkipListWriter#bufferSkip. (Shubham Chaudhary)

View File

@ -21,13 +21,12 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.RamUsageEstimator;
// TODO: merge with PagedBytes, except PagedBytes doesn't // TODO: merge with PagedBytes, except PagedBytes doesn't
// let you read while writing which FST needs // let you read while writing which FST needs
class BytesStore extends DataOutput implements Accountable { class BytesStore extends DataOutput implements FSTReader {
private static final long BASE_RAM_BYTES_USED = private static final long BASE_RAM_BYTES_USED =
RamUsageEstimator.shallowSizeOfInstance(BytesStore.class) RamUsageEstimator.shallowSizeOfInstance(BytesStore.class)
@ -333,6 +332,11 @@ class BytesStore extends DataOutput implements Accountable {
return ((long) blocks.size() - 1) * blockSize + nextWrite; return ((long) blocks.size() - 1) * blockSize + nextWrite;
} }
@Override
public long size() {
return getPosition();
}
/** /**
* Pos must be less than the max position written so far! Ie, you cannot "grow" the file with * Pos must be less than the max position written so far! Ie, you cannot "grow" the file with
* this! * this!
@ -365,6 +369,7 @@ class BytesStore extends DataOutput implements Accountable {
} }
/** Writes all of our bytes to the target {@link DataOutput}. */ /** Writes all of our bytes to the target {@link DataOutput}. */
@Override
public void writeTo(DataOutput out) throws IOException { public void writeTo(DataOutput out) throws IOException {
for (byte[] block : blocks) { for (byte[] block : blocks) {
out.writeBytes(block, 0, block.length); out.writeBytes(block, 0, block.length);
@ -437,7 +442,8 @@ class BytesStore extends DataOutput implements Accountable {
}; };
} }
public FST.BytesReader getReverseReader() { @Override
public FST.BytesReader getReverseBytesReader() {
return getReverseReader(true); return getReverseReader(true);
} }

View File

@ -123,9 +123,7 @@ public final class FST<T> implements Accountable {
* A {@link BytesStore}, used during building, or during reading when the FST is very large (more * A {@link BytesStore}, used during building, or during reading when the FST is very large (more
* than 1 GB). If the FST is less than 1 GB then bytesArray is set instead. * than 1 GB). If the FST is less than 1 GB then bytesArray is set instead.
*/ */
final BytesStore bytes; private final FSTReader fstReader;
private final FSTStore fstStore;
private long startNode = -1; private long startNode = -1;
@ -398,15 +396,11 @@ public final class FST<T> implements Accountable {
} }
// make a new empty FST, for building; Builder invokes this // make a new empty FST, for building; Builder invokes this
FST(INPUT_TYPE inputType, Outputs<T> outputs, int bytesPageBits) { FST(INPUT_TYPE inputType, Outputs<T> outputs, FSTReader fstReader) {
this.inputType = inputType; this.inputType = inputType;
this.outputs = outputs; this.outputs = outputs;
fstStore = null;
bytes = new BytesStore(bytesPageBits);
// pad: ensure no node gets address 0 which is reserved to mean
// the stop state w/ no arcs
bytes.writeByte((byte) 0);
emptyOutput = null; emptyOutput = null;
this.fstReader = fstReader;
this.version = VERSION_CURRENT; this.version = VERSION_CURRENT;
} }
@ -423,8 +417,6 @@ public final class FST<T> implements Accountable {
*/ */
public FST(DataInput metaIn, DataInput in, Outputs<T> outputs, FSTStore fstStore) public FST(DataInput metaIn, DataInput in, Outputs<T> outputs, FSTStore fstStore)
throws IOException { throws IOException {
bytes = null;
this.fstStore = fstStore;
this.outputs = outputs; this.outputs = outputs;
// NOTE: only reads formats VERSION_START up to VERSION_CURRENT; we don't have // NOTE: only reads formats VERSION_START up to VERSION_CURRENT; we don't have
@ -438,7 +430,7 @@ public final class FST<T> implements Accountable {
emptyBytes.copyBytes(metaIn, numBytes); emptyBytes.copyBytes(metaIn, numBytes);
// De-serialize empty-string output: // De-serialize empty-string output:
BytesReader reader = emptyBytes.getReverseReader(); BytesReader reader = emptyBytes.getReverseBytesReader();
// NoOutputs uses 0 bytes when writing its output, // NoOutputs uses 0 bytes when writing its output,
// so we have to check here else BytesStore gets // so we have to check here else BytesStore gets
// angry: // angry:
@ -466,19 +458,13 @@ public final class FST<T> implements Accountable {
startNode = metaIn.readVLong(); startNode = metaIn.readVLong();
long numBytes = metaIn.readVLong(); long numBytes = metaIn.readVLong();
this.fstStore.init(in, numBytes); fstStore.init(in, numBytes);
this.fstReader = fstStore;
} }
@Override @Override
public long ramBytesUsed() { public long ramBytesUsed() {
long size = BASE_RAM_BYTES_USED; return BASE_RAM_BYTES_USED + fstReader.ramBytesUsed();
if (this.fstStore != null) {
size += this.fstStore.ramBytesUsed();
} else {
size += bytes.ramBytesUsed();
}
return size;
} }
@Override @Override
@ -487,7 +473,7 @@ public final class FST<T> implements Accountable {
} }
void finish(long newStartNode) throws IOException { void finish(long newStartNode) throws IOException {
assert newStartNode <= bytes.getPosition(); assert newStartNode <= fstReader.size();
if (startNode != -1) { if (startNode != -1) {
throw new IllegalStateException("already finished"); throw new IllegalStateException("already finished");
} }
@ -495,11 +481,10 @@ public final class FST<T> implements Accountable {
newStartNode = 0; newStartNode = 0;
} }
startNode = newStartNode; startNode = newStartNode;
bytes.finish();
} }
public long numBytes() { public long numBytes() {
return bytes.getPosition(); return fstReader.size();
} }
public T getEmptyOutput() { public T getEmptyOutput() {
@ -555,16 +540,8 @@ public final class FST<T> implements Accountable {
} }
metaOut.writeByte(t); metaOut.writeByte(t);
metaOut.writeVLong(startNode); metaOut.writeVLong(startNode);
if (bytes != null) { metaOut.writeVLong(numBytes());
long numBytes = bytes.getPosition(); fstReader.writeTo(out);
metaOut.writeVLong(numBytes);
bytes.writeTo(out);
} else {
assert fstStore != null;
long numBytes = fstStore.size();
metaOut.writeVLong(numBytes);
fstStore.writeTo(out);
}
} }
/** Writes an automaton to a file. */ /** Writes an automaton to a file. */
@ -1141,11 +1118,7 @@ public final class FST<T> implements Accountable {
/** Returns a {@link BytesReader} for this FST, positioned at position 0. */ /** Returns a {@link BytesReader} for this FST, positioned at position 0. */
public BytesReader getBytesReader() { public BytesReader getBytesReader() {
if (this.fstStore != null) { return fstReader.getReverseBytesReader();
return this.fstStore.getReverseBytesReader();
} else {
return bytes.getReverseReader();
}
} }
/** Reads bytes stored in an FST. */ /** Reads bytes stored in an FST. */

View File

@ -137,9 +137,11 @@ public class FSTCompiler<T> {
float directAddressingMaxOversizingFactor) { float directAddressingMaxOversizingFactor) {
this.allowFixedLengthArcs = allowFixedLengthArcs; this.allowFixedLengthArcs = allowFixedLengthArcs;
this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor; this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor;
fst = new FST<>(inputType, outputs, bytesPageBits); bytes = new BytesStore(bytesPageBits);
bytes = fst.bytes; // pad: ensure no node gets address 0 which is reserved to mean
assert bytes != null; // the stop state w/ no arcs
bytes.writeByte((byte) 0);
fst = new FST<>(inputType, outputs, bytes);
if (suffixRAMLimitMB < 0) { if (suffixRAMLimitMB < 0) {
throw new IllegalArgumentException("ramLimitMB must be >= 0; got: " + suffixRAMLimitMB); throw new IllegalArgumentException("ramLimitMB must be >= 0; got: " + suffixRAMLimitMB);
} else if (suffixRAMLimitMB > 0) { } else if (suffixRAMLimitMB > 0) {
@ -317,8 +319,6 @@ public class FSTCompiler<T> {
// serializes new node by appending its bytes to the end // serializes new node by appending its bytes to the end
// of the current byte[] // of the current byte[]
long addNode(FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException { long addNode(FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
T NO_OUTPUT = fst.outputs.getNoOutput();
// System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs); // System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs);
if (nodeIn.numArcs == 0) { if (nodeIn.numArcs == 0) {
if (nodeIn.isFinal) { if (nodeIn.isFinal) {
@ -859,6 +859,7 @@ public class FSTCompiler<T> {
// if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " // if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + "
// root.output=" + root.output); // root.output=" + root.output);
fst.finish(compileNode(root, lastInput.length()).node); fst.finish(compileNode(root, lastInput.length()).node);
bytes.finish();
return fst; return fst;
} }

View File

@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.fst;
import java.io.IOException;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.Accountable;
/** Abstraction for reading bytes necessary for FST. */
public interface FSTReader extends Accountable {
/**
* The raw size in bytes of the FST
*
* @return the FST size
*/
long size();
/**
* Get the reverse BytesReader for this FST
*
* @return the reverse BytesReader
*/
FST.BytesReader getReverseBytesReader();
/**
* Write this FST to another DataOutput
*
* @param out the DataOutput
* @throws IOException if exception occurred during writing
*/
void writeTo(DataOutput out) throws IOException;
}

View File

@ -18,16 +18,8 @@ package org.apache.lucene.util.fst;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.Accountable;
/** Abstraction for reading/writing bytes necessary for FST. */ /** A type of {@link FSTReader} which needs data to be initialized before use */
public interface FSTStore extends Accountable { public interface FSTStore extends FSTReader {
void init(DataInput in, long numBytes) throws IOException; void init(DataInput in, long numBytes) throws IOException;
long size();
FST.BytesReader getReverseBytesReader();
void writeTo(DataOutput out) throws IOException;
} }

View File

@ -88,7 +88,7 @@ public final class OnHeapFSTStore implements FSTStore {
if (bytesArray != null) { if (bytesArray != null) {
return new ReverseBytesReader(bytesArray); return new ReverseBytesReader(bytesArray);
} else { } else {
return bytes.getReverseReader(); return bytes.getReverseBytesReader();
} }
} }

View File

@ -271,7 +271,7 @@ public class TestBytesStore extends LuceneTestCase {
System.out.println(" bulk: reversed"); System.out.println(" bulk: reversed");
} }
// reversed // reversed
FST.BytesReader r = bytes.getReverseReader(); FST.BytesReader r = bytes.getReverseBytesReader();
assertTrue(r.reversed()); assertTrue(r.reversed());
r.setPosition(totalLength - 1); r.setPosition(totalLength - 1);
r.readBytes(actual, 0, actual.length); r.readBytes(actual, 0, actual.length);
@ -306,7 +306,7 @@ public class TestBytesStore extends LuceneTestCase {
if (VERBOSE) { if (VERBOSE) {
System.out.println(" ops: reversed"); System.out.println(" ops: reversed");
} }
r = bytes.getReverseReader(); r = bytes.getReverseBytesReader();
} else { } else {
if (VERBOSE) { if (VERBOSE) {
System.out.println(" ops: forward"); System.out.println(" ops: forward");