mirror of https://github.com/apache/lucene.git
Consolidate FSTStore and BytesStore in FST (#12709)
* Remove direct dependency of NodeHash to FST * Fix index out of bounds when writing FST to different metaOut (#12697) * Tidify code * Update CHANGES.txt * Re-add assertion * Remove direct dependency of NodeHash to FST * Hold off the FSTTraversal changes * Rename variable * Add Javadoc * Add @Override * tidy * tidy * Change to FSTReader * Update CHANGES.txt
This commit is contained in:
parent
01acb1c37b
commit
12fc7bf49f
|
@ -62,6 +62,9 @@ API Changes
|
|||
|
||||
* GITHUB#12599: Add RandomAccessInput#readBytes method to the RandomAccessInput interface. (Ignacio Vera)
|
||||
|
||||
* GITHUB#12709 Consolidate FSTStore and BytesStore in FST. Created FSTReader which contains the common methods
|
||||
of the two (Anh Dung Bui)
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
|
||||
|
@ -150,7 +153,8 @@ API Changes
|
|||
* GITHUB#12592: Add RandomAccessInput#length method to the RandomAccessInput interface. In addition deprecate
|
||||
ByteBuffersDataInput#size in favour of this new method. (Ignacio Vera)
|
||||
|
||||
* GITHUB#12646: Move FST#addNode to FSTCompiler to avoid a circular dependency between FST and FSTCompiler
|
||||
* GITHUB#12646, GITHUB#12690: Move FST#addNode to FSTCompiler to avoid a circular dependency
|
||||
between FST and FSTCompiler (Anh Dung Bui)
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
|
@ -200,7 +204,7 @@ Optimizations
|
|||
|
||||
* GITHUB#12382: Faster top-level conjunctions on term queries when sorting by
|
||||
descending score. (Adrien Grand)
|
||||
|
||||
|
||||
* GITHUB#12591: Use stable radix sort to speed up the sorting of update terms. (Guo Feng)
|
||||
|
||||
* GITHUB#12587: Use radix sort to speed up the sorting of terms in TermInSetQuery. (Guo Feng)
|
||||
|
@ -215,7 +219,7 @@ Optimizations
|
|||
|
||||
* GITHUB#12668: ImpactsEnums now decode frequencies lazily like PostingsEnums.
|
||||
(Adrien Grand)
|
||||
|
||||
|
||||
* GITHUB#12651: Use 2d array for OnHeapHnswGraph representation. (Patrick Zhai)
|
||||
|
||||
* GITHUB#12653: Optimize computing number of levels in MultiLevelSkipListWriter#bufferSkip. (Shubham Chaudhary)
|
||||
|
|
|
@ -21,13 +21,12 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.Accountable;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
// TODO: merge with PagedBytes, except PagedBytes doesn't
|
||||
// let you read while writing which FST needs
|
||||
|
||||
class BytesStore extends DataOutput implements Accountable {
|
||||
class BytesStore extends DataOutput implements FSTReader {
|
||||
|
||||
private static final long BASE_RAM_BYTES_USED =
|
||||
RamUsageEstimator.shallowSizeOfInstance(BytesStore.class)
|
||||
|
@ -333,6 +332,11 @@ class BytesStore extends DataOutput implements Accountable {
|
|||
return ((long) blocks.size() - 1) * blockSize + nextWrite;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long size() {
|
||||
return getPosition();
|
||||
}
|
||||
|
||||
/**
|
||||
* Pos must be less than the max position written so far! Ie, you cannot "grow" the file with
|
||||
* this!
|
||||
|
@ -365,6 +369,7 @@ class BytesStore extends DataOutput implements Accountable {
|
|||
}
|
||||
|
||||
/** Writes all of our bytes to the target {@link DataOutput}. */
|
||||
@Override
|
||||
public void writeTo(DataOutput out) throws IOException {
|
||||
for (byte[] block : blocks) {
|
||||
out.writeBytes(block, 0, block.length);
|
||||
|
@ -437,7 +442,8 @@ class BytesStore extends DataOutput implements Accountable {
|
|||
};
|
||||
}
|
||||
|
||||
public FST.BytesReader getReverseReader() {
|
||||
@Override
|
||||
public FST.BytesReader getReverseBytesReader() {
|
||||
return getReverseReader(true);
|
||||
}
|
||||
|
||||
|
|
|
@ -123,9 +123,7 @@ public final class FST<T> implements Accountable {
|
|||
* A {@link BytesStore}, used during building, or during reading when the FST is very large (more
|
||||
* than 1 GB). If the FST is less than 1 GB then bytesArray is set instead.
|
||||
*/
|
||||
final BytesStore bytes;
|
||||
|
||||
private final FSTStore fstStore;
|
||||
private final FSTReader fstReader;
|
||||
|
||||
private long startNode = -1;
|
||||
|
||||
|
@ -398,15 +396,11 @@ public final class FST<T> implements Accountable {
|
|||
}
|
||||
|
||||
// make a new empty FST, for building; Builder invokes this
|
||||
FST(INPUT_TYPE inputType, Outputs<T> outputs, int bytesPageBits) {
|
||||
FST(INPUT_TYPE inputType, Outputs<T> outputs, FSTReader fstReader) {
|
||||
this.inputType = inputType;
|
||||
this.outputs = outputs;
|
||||
fstStore = null;
|
||||
bytes = new BytesStore(bytesPageBits);
|
||||
// pad: ensure no node gets address 0 which is reserved to mean
|
||||
// the stop state w/ no arcs
|
||||
bytes.writeByte((byte) 0);
|
||||
emptyOutput = null;
|
||||
this.fstReader = fstReader;
|
||||
this.version = VERSION_CURRENT;
|
||||
}
|
||||
|
||||
|
@ -423,8 +417,6 @@ public final class FST<T> implements Accountable {
|
|||
*/
|
||||
public FST(DataInput metaIn, DataInput in, Outputs<T> outputs, FSTStore fstStore)
|
||||
throws IOException {
|
||||
bytes = null;
|
||||
this.fstStore = fstStore;
|
||||
this.outputs = outputs;
|
||||
|
||||
// NOTE: only reads formats VERSION_START up to VERSION_CURRENT; we don't have
|
||||
|
@ -438,7 +430,7 @@ public final class FST<T> implements Accountable {
|
|||
emptyBytes.copyBytes(metaIn, numBytes);
|
||||
|
||||
// De-serialize empty-string output:
|
||||
BytesReader reader = emptyBytes.getReverseReader();
|
||||
BytesReader reader = emptyBytes.getReverseBytesReader();
|
||||
// NoOutputs uses 0 bytes when writing its output,
|
||||
// so we have to check here else BytesStore gets
|
||||
// angry:
|
||||
|
@ -466,19 +458,13 @@ public final class FST<T> implements Accountable {
|
|||
startNode = metaIn.readVLong();
|
||||
|
||||
long numBytes = metaIn.readVLong();
|
||||
this.fstStore.init(in, numBytes);
|
||||
fstStore.init(in, numBytes);
|
||||
this.fstReader = fstStore;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long ramBytesUsed() {
|
||||
long size = BASE_RAM_BYTES_USED;
|
||||
if (this.fstStore != null) {
|
||||
size += this.fstStore.ramBytesUsed();
|
||||
} else {
|
||||
size += bytes.ramBytesUsed();
|
||||
}
|
||||
|
||||
return size;
|
||||
return BASE_RAM_BYTES_USED + fstReader.ramBytesUsed();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -487,7 +473,7 @@ public final class FST<T> implements Accountable {
|
|||
}
|
||||
|
||||
void finish(long newStartNode) throws IOException {
|
||||
assert newStartNode <= bytes.getPosition();
|
||||
assert newStartNode <= fstReader.size();
|
||||
if (startNode != -1) {
|
||||
throw new IllegalStateException("already finished");
|
||||
}
|
||||
|
@ -495,11 +481,10 @@ public final class FST<T> implements Accountable {
|
|||
newStartNode = 0;
|
||||
}
|
||||
startNode = newStartNode;
|
||||
bytes.finish();
|
||||
}
|
||||
|
||||
public long numBytes() {
|
||||
return bytes.getPosition();
|
||||
return fstReader.size();
|
||||
}
|
||||
|
||||
public T getEmptyOutput() {
|
||||
|
@ -555,16 +540,8 @@ public final class FST<T> implements Accountable {
|
|||
}
|
||||
metaOut.writeByte(t);
|
||||
metaOut.writeVLong(startNode);
|
||||
if (bytes != null) {
|
||||
long numBytes = bytes.getPosition();
|
||||
metaOut.writeVLong(numBytes);
|
||||
bytes.writeTo(out);
|
||||
} else {
|
||||
assert fstStore != null;
|
||||
long numBytes = fstStore.size();
|
||||
metaOut.writeVLong(numBytes);
|
||||
fstStore.writeTo(out);
|
||||
}
|
||||
metaOut.writeVLong(numBytes());
|
||||
fstReader.writeTo(out);
|
||||
}
|
||||
|
||||
/** Writes an automaton to a file. */
|
||||
|
@ -1141,11 +1118,7 @@ public final class FST<T> implements Accountable {
|
|||
|
||||
/** Returns a {@link BytesReader} for this FST, positioned at position 0. */
|
||||
public BytesReader getBytesReader() {
|
||||
if (this.fstStore != null) {
|
||||
return this.fstStore.getReverseBytesReader();
|
||||
} else {
|
||||
return bytes.getReverseReader();
|
||||
}
|
||||
return fstReader.getReverseBytesReader();
|
||||
}
|
||||
|
||||
/** Reads bytes stored in an FST. */
|
||||
|
|
|
@ -137,9 +137,11 @@ public class FSTCompiler<T> {
|
|||
float directAddressingMaxOversizingFactor) {
|
||||
this.allowFixedLengthArcs = allowFixedLengthArcs;
|
||||
this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor;
|
||||
fst = new FST<>(inputType, outputs, bytesPageBits);
|
||||
bytes = fst.bytes;
|
||||
assert bytes != null;
|
||||
bytes = new BytesStore(bytesPageBits);
|
||||
// pad: ensure no node gets address 0 which is reserved to mean
|
||||
// the stop state w/ no arcs
|
||||
bytes.writeByte((byte) 0);
|
||||
fst = new FST<>(inputType, outputs, bytes);
|
||||
if (suffixRAMLimitMB < 0) {
|
||||
throw new IllegalArgumentException("ramLimitMB must be >= 0; got: " + suffixRAMLimitMB);
|
||||
} else if (suffixRAMLimitMB > 0) {
|
||||
|
@ -317,8 +319,6 @@ public class FSTCompiler<T> {
|
|||
// serializes new node by appending its bytes to the end
|
||||
// of the current byte[]
|
||||
long addNode(FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
|
||||
T NO_OUTPUT = fst.outputs.getNoOutput();
|
||||
|
||||
// System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs);
|
||||
if (nodeIn.numArcs == 0) {
|
||||
if (nodeIn.isFinal) {
|
||||
|
@ -859,6 +859,7 @@ public class FSTCompiler<T> {
|
|||
// if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + "
|
||||
// root.output=" + root.output);
|
||||
fst.finish(compileNode(root, lastInput.length()).node);
|
||||
bytes.finish();
|
||||
|
||||
return fst;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.util.fst;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.Accountable;
|
||||
|
||||
/** Abstraction for reading bytes necessary for FST. */
|
||||
public interface FSTReader extends Accountable {
|
||||
|
||||
/**
|
||||
* The raw size in bytes of the FST
|
||||
*
|
||||
* @return the FST size
|
||||
*/
|
||||
long size();
|
||||
|
||||
/**
|
||||
* Get the reverse BytesReader for this FST
|
||||
*
|
||||
* @return the reverse BytesReader
|
||||
*/
|
||||
FST.BytesReader getReverseBytesReader();
|
||||
|
||||
/**
|
||||
* Write this FST to another DataOutput
|
||||
*
|
||||
* @param out the DataOutput
|
||||
* @throws IOException if exception occurred during writing
|
||||
*/
|
||||
void writeTo(DataOutput out) throws IOException;
|
||||
}
|
|
@ -18,16 +18,8 @@ package org.apache.lucene.util.fst;
|
|||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.Accountable;
|
||||
|
||||
/** Abstraction for reading/writing bytes necessary for FST. */
|
||||
public interface FSTStore extends Accountable {
|
||||
/** A type of {@link FSTReader} which needs data to be initialized before use */
|
||||
public interface FSTStore extends FSTReader {
|
||||
void init(DataInput in, long numBytes) throws IOException;
|
||||
|
||||
long size();
|
||||
|
||||
FST.BytesReader getReverseBytesReader();
|
||||
|
||||
void writeTo(DataOutput out) throws IOException;
|
||||
}
|
||||
|
|
|
@ -88,7 +88,7 @@ public final class OnHeapFSTStore implements FSTStore {
|
|||
if (bytesArray != null) {
|
||||
return new ReverseBytesReader(bytesArray);
|
||||
} else {
|
||||
return bytes.getReverseReader();
|
||||
return bytes.getReverseBytesReader();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -271,7 +271,7 @@ public class TestBytesStore extends LuceneTestCase {
|
|||
System.out.println(" bulk: reversed");
|
||||
}
|
||||
// reversed
|
||||
FST.BytesReader r = bytes.getReverseReader();
|
||||
FST.BytesReader r = bytes.getReverseBytesReader();
|
||||
assertTrue(r.reversed());
|
||||
r.setPosition(totalLength - 1);
|
||||
r.readBytes(actual, 0, actual.length);
|
||||
|
@ -306,7 +306,7 @@ public class TestBytesStore extends LuceneTestCase {
|
|||
if (VERBOSE) {
|
||||
System.out.println(" ops: reversed");
|
||||
}
|
||||
r = bytes.getReverseReader();
|
||||
r = bytes.getReverseBytesReader();
|
||||
} else {
|
||||
if (VERBOSE) {
|
||||
System.out.println(" ops: forward");
|
||||
|
|
Loading…
Reference in New Issue