mirror of https://github.com/apache/lucene.git
Consolidate FSTStore and BytesStore in FST (#12709)
* Remove direct dependency of NodeHash to FST * Fix index out of bounds when writing FST to different metaOut (#12697) * Tidify code * Update CHANGES.txt * Re-add assertion * Remove direct dependency of NodeHash to FST * Hold off the FSTTraversal changes * Rename variable * Add Javadoc * Add @Override * tidy * tidy * Change to FSTReader * Update CHANGES.txt
This commit is contained in:
parent
01acb1c37b
commit
12fc7bf49f
|
@ -62,6 +62,9 @@ API Changes
|
||||||
|
|
||||||
* GITHUB#12599: Add RandomAccessInput#readBytes method to the RandomAccessInput interface. (Ignacio Vera)
|
* GITHUB#12599: Add RandomAccessInput#readBytes method to the RandomAccessInput interface. (Ignacio Vera)
|
||||||
|
|
||||||
|
* GITHUB#12709: Consolidate FSTStore and BytesStore in FST. Created FSTReader which contains the common methods
|
||||||
|
of the two (Anh Dung Bui)
|
||||||
|
|
||||||
New Features
|
New Features
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
@ -150,7 +153,8 @@ API Changes
|
||||||
* GITHUB#12592: Add RandomAccessInput#length method to the RandomAccessInput interface. In addition deprecate
|
* GITHUB#12592: Add RandomAccessInput#length method to the RandomAccessInput interface. In addition deprecate
|
||||||
ByteBuffersDataInput#size in favour of this new method. (Ignacio Vera)
|
ByteBuffersDataInput#size in favour of this new method. (Ignacio Vera)
|
||||||
|
|
||||||
* GITHUB#12646: Move FST#addNode to FSTCompiler to avoid a circular dependency between FST and FSTCompiler
|
* GITHUB#12646, GITHUB#12690: Move FST#addNode to FSTCompiler to avoid a circular dependency
|
||||||
|
between FST and FSTCompiler (Anh Dung Bui)
|
||||||
|
|
||||||
New Features
|
New Features
|
||||||
---------------------
|
---------------------
|
||||||
|
@ -200,7 +204,7 @@ Optimizations
|
||||||
|
|
||||||
* GITHUB#12382: Faster top-level conjunctions on term queries when sorting by
|
* GITHUB#12382: Faster top-level conjunctions on term queries when sorting by
|
||||||
descending score. (Adrien Grand)
|
descending score. (Adrien Grand)
|
||||||
|
|
||||||
* GITHUB#12591: Use stable radix sort to speed up the sorting of update terms. (Guo Feng)
|
* GITHUB#12591: Use stable radix sort to speed up the sorting of update terms. (Guo Feng)
|
||||||
|
|
||||||
* GITHUB#12587: Use radix sort to speed up the sorting of terms in TermInSetQuery. (Guo Feng)
|
* GITHUB#12587: Use radix sort to speed up the sorting of terms in TermInSetQuery. (Guo Feng)
|
||||||
|
@ -215,7 +219,7 @@ Optimizations
|
||||||
|
|
||||||
* GITHUB#12668: ImpactsEnums now decode frequencies lazily like PostingsEnums.
|
* GITHUB#12668: ImpactsEnums now decode frequencies lazily like PostingsEnums.
|
||||||
(Adrien Grand)
|
(Adrien Grand)
|
||||||
|
|
||||||
* GITHUB#12651: Use 2d array for OnHeapHnswGraph representation. (Patrick Zhai)
|
* GITHUB#12651: Use 2d array for OnHeapHnswGraph representation. (Patrick Zhai)
|
||||||
|
|
||||||
* GITHUB#12653: Optimize computing number of levels in MultiLevelSkipListWriter#bufferSkip. (Shubham Chaudhary)
|
* GITHUB#12653: Optimize computing number of levels in MultiLevelSkipListWriter#bufferSkip. (Shubham Chaudhary)
|
||||||
|
|
|
@ -21,13 +21,12 @@ import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import org.apache.lucene.store.DataInput;
|
import org.apache.lucene.store.DataInput;
|
||||||
import org.apache.lucene.store.DataOutput;
|
import org.apache.lucene.store.DataOutput;
|
||||||
import org.apache.lucene.util.Accountable;
|
|
||||||
import org.apache.lucene.util.RamUsageEstimator;
|
import org.apache.lucene.util.RamUsageEstimator;
|
||||||
|
|
||||||
// TODO: merge with PagedBytes, except PagedBytes doesn't
|
// TODO: merge with PagedBytes, except PagedBytes doesn't
|
||||||
// let you read while writing which FST needs
|
// let you read while writing which FST needs
|
||||||
|
|
||||||
class BytesStore extends DataOutput implements Accountable {
|
class BytesStore extends DataOutput implements FSTReader {
|
||||||
|
|
||||||
private static final long BASE_RAM_BYTES_USED =
|
private static final long BASE_RAM_BYTES_USED =
|
||||||
RamUsageEstimator.shallowSizeOfInstance(BytesStore.class)
|
RamUsageEstimator.shallowSizeOfInstance(BytesStore.class)
|
||||||
|
@ -333,6 +332,11 @@ class BytesStore extends DataOutput implements Accountable {
|
||||||
return ((long) blocks.size() - 1) * blockSize + nextWrite;
|
return ((long) blocks.size() - 1) * blockSize + nextWrite;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long size() {
|
||||||
|
return getPosition();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Pos must be less than the max position written so far! Ie, you cannot "grow" the file with
|
* Pos must be less than the max position written so far! Ie, you cannot "grow" the file with
|
||||||
* this!
|
* this!
|
||||||
|
@ -365,6 +369,7 @@ class BytesStore extends DataOutput implements Accountable {
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Writes all of our bytes to the target {@link DataOutput}. */
|
/** Writes all of our bytes to the target {@link DataOutput}. */
|
||||||
|
@Override
|
||||||
public void writeTo(DataOutput out) throws IOException {
|
public void writeTo(DataOutput out) throws IOException {
|
||||||
for (byte[] block : blocks) {
|
for (byte[] block : blocks) {
|
||||||
out.writeBytes(block, 0, block.length);
|
out.writeBytes(block, 0, block.length);
|
||||||
|
@ -437,7 +442,8 @@ class BytesStore extends DataOutput implements Accountable {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
public FST.BytesReader getReverseReader() {
|
@Override
|
||||||
|
public FST.BytesReader getReverseBytesReader() {
|
||||||
return getReverseReader(true);
|
return getReverseReader(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -123,9 +123,7 @@ public final class FST<T> implements Accountable {
|
||||||
* A {@link BytesStore}, used during building, or during reading when the FST is very large (more
|
* A {@link BytesStore}, used during building, or during reading when the FST is very large (more
|
||||||
* than 1 GB). If the FST is less than 1 GB then bytesArray is set instead.
|
* than 1 GB). If the FST is less than 1 GB then bytesArray is set instead.
|
||||||
*/
|
*/
|
||||||
final BytesStore bytes;
|
private final FSTReader fstReader;
|
||||||
|
|
||||||
private final FSTStore fstStore;
|
|
||||||
|
|
||||||
private long startNode = -1;
|
private long startNode = -1;
|
||||||
|
|
||||||
|
@ -398,15 +396,11 @@ public final class FST<T> implements Accountable {
|
||||||
}
|
}
|
||||||
|
|
||||||
// make a new empty FST, for building; Builder invokes this
|
// make a new empty FST, for building; Builder invokes this
|
||||||
FST(INPUT_TYPE inputType, Outputs<T> outputs, int bytesPageBits) {
|
FST(INPUT_TYPE inputType, Outputs<T> outputs, FSTReader fstReader) {
|
||||||
this.inputType = inputType;
|
this.inputType = inputType;
|
||||||
this.outputs = outputs;
|
this.outputs = outputs;
|
||||||
fstStore = null;
|
|
||||||
bytes = new BytesStore(bytesPageBits);
|
|
||||||
// pad: ensure no node gets address 0 which is reserved to mean
|
|
||||||
// the stop state w/ no arcs
|
|
||||||
bytes.writeByte((byte) 0);
|
|
||||||
emptyOutput = null;
|
emptyOutput = null;
|
||||||
|
this.fstReader = fstReader;
|
||||||
this.version = VERSION_CURRENT;
|
this.version = VERSION_CURRENT;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -423,8 +417,6 @@ public final class FST<T> implements Accountable {
|
||||||
*/
|
*/
|
||||||
public FST(DataInput metaIn, DataInput in, Outputs<T> outputs, FSTStore fstStore)
|
public FST(DataInput metaIn, DataInput in, Outputs<T> outputs, FSTStore fstStore)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
bytes = null;
|
|
||||||
this.fstStore = fstStore;
|
|
||||||
this.outputs = outputs;
|
this.outputs = outputs;
|
||||||
|
|
||||||
// NOTE: only reads formats VERSION_START up to VERSION_CURRENT; we don't have
|
// NOTE: only reads formats VERSION_START up to VERSION_CURRENT; we don't have
|
||||||
|
@ -438,7 +430,7 @@ public final class FST<T> implements Accountable {
|
||||||
emptyBytes.copyBytes(metaIn, numBytes);
|
emptyBytes.copyBytes(metaIn, numBytes);
|
||||||
|
|
||||||
// De-serialize empty-string output:
|
// De-serialize empty-string output:
|
||||||
BytesReader reader = emptyBytes.getReverseReader();
|
BytesReader reader = emptyBytes.getReverseBytesReader();
|
||||||
// NoOutputs uses 0 bytes when writing its output,
|
// NoOutputs uses 0 bytes when writing its output,
|
||||||
// so we have to check here else BytesStore gets
|
// so we have to check here else BytesStore gets
|
||||||
// angry:
|
// angry:
|
||||||
|
@ -466,19 +458,13 @@ public final class FST<T> implements Accountable {
|
||||||
startNode = metaIn.readVLong();
|
startNode = metaIn.readVLong();
|
||||||
|
|
||||||
long numBytes = metaIn.readVLong();
|
long numBytes = metaIn.readVLong();
|
||||||
this.fstStore.init(in, numBytes);
|
fstStore.init(in, numBytes);
|
||||||
|
this.fstReader = fstStore;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public long ramBytesUsed() {
|
public long ramBytesUsed() {
|
||||||
long size = BASE_RAM_BYTES_USED;
|
return BASE_RAM_BYTES_USED + fstReader.ramBytesUsed();
|
||||||
if (this.fstStore != null) {
|
|
||||||
size += this.fstStore.ramBytesUsed();
|
|
||||||
} else {
|
|
||||||
size += bytes.ramBytesUsed();
|
|
||||||
}
|
|
||||||
|
|
||||||
return size;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -487,7 +473,7 @@ public final class FST<T> implements Accountable {
|
||||||
}
|
}
|
||||||
|
|
||||||
void finish(long newStartNode) throws IOException {
|
void finish(long newStartNode) throws IOException {
|
||||||
assert newStartNode <= bytes.getPosition();
|
assert newStartNode <= fstReader.size();
|
||||||
if (startNode != -1) {
|
if (startNode != -1) {
|
||||||
throw new IllegalStateException("already finished");
|
throw new IllegalStateException("already finished");
|
||||||
}
|
}
|
||||||
|
@ -495,11 +481,10 @@ public final class FST<T> implements Accountable {
|
||||||
newStartNode = 0;
|
newStartNode = 0;
|
||||||
}
|
}
|
||||||
startNode = newStartNode;
|
startNode = newStartNode;
|
||||||
bytes.finish();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public long numBytes() {
|
public long numBytes() {
|
||||||
return bytes.getPosition();
|
return fstReader.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
public T getEmptyOutput() {
|
public T getEmptyOutput() {
|
||||||
|
@ -555,16 +540,8 @@ public final class FST<T> implements Accountable {
|
||||||
}
|
}
|
||||||
metaOut.writeByte(t);
|
metaOut.writeByte(t);
|
||||||
metaOut.writeVLong(startNode);
|
metaOut.writeVLong(startNode);
|
||||||
if (bytes != null) {
|
metaOut.writeVLong(numBytes());
|
||||||
long numBytes = bytes.getPosition();
|
fstReader.writeTo(out);
|
||||||
metaOut.writeVLong(numBytes);
|
|
||||||
bytes.writeTo(out);
|
|
||||||
} else {
|
|
||||||
assert fstStore != null;
|
|
||||||
long numBytes = fstStore.size();
|
|
||||||
metaOut.writeVLong(numBytes);
|
|
||||||
fstStore.writeTo(out);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Writes an automaton to a file. */
|
/** Writes an automaton to a file. */
|
||||||
|
@ -1141,11 +1118,7 @@ public final class FST<T> implements Accountable {
|
||||||
|
|
||||||
/** Returns a {@link BytesReader} for this FST, positioned at position 0. */
|
/** Returns a {@link BytesReader} for this FST, positioned at position 0. */
|
||||||
public BytesReader getBytesReader() {
|
public BytesReader getBytesReader() {
|
||||||
if (this.fstStore != null) {
|
return fstReader.getReverseBytesReader();
|
||||||
return this.fstStore.getReverseBytesReader();
|
|
||||||
} else {
|
|
||||||
return bytes.getReverseReader();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Reads bytes stored in an FST. */
|
/** Reads bytes stored in an FST. */
|
||||||
|
|
|
@ -137,9 +137,11 @@ public class FSTCompiler<T> {
|
||||||
float directAddressingMaxOversizingFactor) {
|
float directAddressingMaxOversizingFactor) {
|
||||||
this.allowFixedLengthArcs = allowFixedLengthArcs;
|
this.allowFixedLengthArcs = allowFixedLengthArcs;
|
||||||
this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor;
|
this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor;
|
||||||
fst = new FST<>(inputType, outputs, bytesPageBits);
|
bytes = new BytesStore(bytesPageBits);
|
||||||
bytes = fst.bytes;
|
// pad: ensure no node gets address 0 which is reserved to mean
|
||||||
assert bytes != null;
|
// the stop state w/ no arcs
|
||||||
|
bytes.writeByte((byte) 0);
|
||||||
|
fst = new FST<>(inputType, outputs, bytes);
|
||||||
if (suffixRAMLimitMB < 0) {
|
if (suffixRAMLimitMB < 0) {
|
||||||
throw new IllegalArgumentException("ramLimitMB must be >= 0; got: " + suffixRAMLimitMB);
|
throw new IllegalArgumentException("ramLimitMB must be >= 0; got: " + suffixRAMLimitMB);
|
||||||
} else if (suffixRAMLimitMB > 0) {
|
} else if (suffixRAMLimitMB > 0) {
|
||||||
|
@ -317,8 +319,6 @@ public class FSTCompiler<T> {
|
||||||
// serializes new node by appending its bytes to the end
|
// serializes new node by appending its bytes to the end
|
||||||
// of the current byte[]
|
// of the current byte[]
|
||||||
long addNode(FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
|
long addNode(FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
|
||||||
T NO_OUTPUT = fst.outputs.getNoOutput();
|
|
||||||
|
|
||||||
// System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs);
|
// System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs);
|
||||||
if (nodeIn.numArcs == 0) {
|
if (nodeIn.numArcs == 0) {
|
||||||
if (nodeIn.isFinal) {
|
if (nodeIn.isFinal) {
|
||||||
|
@ -859,6 +859,7 @@ public class FSTCompiler<T> {
|
||||||
// if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + "
|
// if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + "
|
||||||
// root.output=" + root.output);
|
// root.output=" + root.output);
|
||||||
fst.finish(compileNode(root, lastInput.length()).node);
|
fst.finish(compileNode(root, lastInput.length()).node);
|
||||||
|
bytes.finish();
|
||||||
|
|
||||||
return fst;
|
return fst;
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,47 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.util.fst;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import org.apache.lucene.store.DataOutput;
|
||||||
|
import org.apache.lucene.util.Accountable;
|
||||||
|
|
||||||
|
/** Abstraction for reading bytes necessary for FST. */
|
||||||
|
public interface FSTReader extends Accountable {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The raw size in bytes of the FST
|
||||||
|
*
|
||||||
|
* @return the FST size
|
||||||
|
*/
|
||||||
|
long size();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the reverse BytesReader for this FST
|
||||||
|
*
|
||||||
|
* @return the reverse BytesReader
|
||||||
|
*/
|
||||||
|
FST.BytesReader getReverseBytesReader();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write this FST to another DataOutput
|
||||||
|
*
|
||||||
|
* @param out the DataOutput
|
||||||
|
* @throws IOException if exception occurred during writing
|
||||||
|
*/
|
||||||
|
void writeTo(DataOutput out) throws IOException;
|
||||||
|
}
|
|
@ -18,16 +18,8 @@ package org.apache.lucene.util.fst;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import org.apache.lucene.store.DataInput;
|
import org.apache.lucene.store.DataInput;
|
||||||
import org.apache.lucene.store.DataOutput;
|
|
||||||
import org.apache.lucene.util.Accountable;
|
|
||||||
|
|
||||||
/** Abstraction for reading/writing bytes necessary for FST. */
|
/** A type of {@link FSTReader} which needs data to be initialized before use. */
|
||||||
public interface FSTStore extends Accountable {
|
public interface FSTStore extends FSTReader {
|
||||||
void init(DataInput in, long numBytes) throws IOException;
|
void init(DataInput in, long numBytes) throws IOException;
|
||||||
|
|
||||||
long size();
|
|
||||||
|
|
||||||
FST.BytesReader getReverseBytesReader();
|
|
||||||
|
|
||||||
void writeTo(DataOutput out) throws IOException;
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -88,7 +88,7 @@ public final class OnHeapFSTStore implements FSTStore {
|
||||||
if (bytesArray != null) {
|
if (bytesArray != null) {
|
||||||
return new ReverseBytesReader(bytesArray);
|
return new ReverseBytesReader(bytesArray);
|
||||||
} else {
|
} else {
|
||||||
return bytes.getReverseReader();
|
return bytes.getReverseBytesReader();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -271,7 +271,7 @@ public class TestBytesStore extends LuceneTestCase {
|
||||||
System.out.println(" bulk: reversed");
|
System.out.println(" bulk: reversed");
|
||||||
}
|
}
|
||||||
// reversed
|
// reversed
|
||||||
FST.BytesReader r = bytes.getReverseReader();
|
FST.BytesReader r = bytes.getReverseBytesReader();
|
||||||
assertTrue(r.reversed());
|
assertTrue(r.reversed());
|
||||||
r.setPosition(totalLength - 1);
|
r.setPosition(totalLength - 1);
|
||||||
r.readBytes(actual, 0, actual.length);
|
r.readBytes(actual, 0, actual.length);
|
||||||
|
@ -306,7 +306,7 @@ public class TestBytesStore extends LuceneTestCase {
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
System.out.println(" ops: reversed");
|
System.out.println(" ops: reversed");
|
||||||
}
|
}
|
||||||
r = bytes.getReverseReader();
|
r = bytes.getReverseBytesReader();
|
||||||
} else {
|
} else {
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
System.out.println(" ops: forward");
|
System.out.println(" ops: forward");
|
||||||
|
|
Loading…
Reference in New Issue