Remove FST constructors with DataInput for metadata (#12803)

* Remove FST constructor

* Move Outputs to FSTMetadata
Authored by Dzung Bui on 2023-11-22 22:21:59 +09:00; committed by GitHub
parent 9b324a180f
commit e04793d651
21 changed files with 85 additions and 79 deletions
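The call-site change is identical across the touched files: instead of handing the Outputs (and a metadata DataInput) straight to the FST constructor, callers first read an FST.FSTMetadata via FST.readMetadata and pass that to the remaining constructor; the Outputs now travel inside the metadata. A minimal sketch of the migration for the common single-stream, on-heap case (the helper class and variable names are illustrative, not part of the commit):

import java.io.IOException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;

// Hypothetical helper showing the migration; assumes the FST was written with
// PositiveIntOutputs and that metadata and body share the same stream.
final class FstLoading {
  static FST<Long> load(DataInput in) throws IOException {
    // Before this commit: new FST<>(in, in, PositiveIntOutputs.getSingleton())
    // After: read the metadata first (it now carries the Outputs), then
    // construct the FST from the metadata plus the data input.
    FST.FSTMetadata<Long> metadata = FST.readMetadata(in, PositiveIntOutputs.getSingleton());
    return new FST<>(metadata, in);
  }
}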

View File

@@ -16,6 +16,8 @@
*/
package org.apache.lucene.analysis.ja.dict;
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
@@ -103,7 +105,7 @@ public final class TokenInfoDictionary extends BinaryDictionary<TokenInfoMorphDa
FST<Long> fst;
try (InputStream is = new BufferedInputStream(fstResource.get())) {
DataInput in = new InputStreamDataInput(is);
fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
fst = new FST<>(readMetadata(in, PositiveIntOutputs.getSingleton()), in);
}
// TODO: some way to configure?
this.fst = new TokenInfoFST(fst, true);

View File

@@ -16,6 +16,8 @@
*/
package org.apache.lucene.analysis.ko.dict;
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
@@ -102,7 +104,7 @@ public final class TokenInfoDictionary extends BinaryDictionary<TokenInfoMorphDa
FST<Long> fst;
try (InputStream is = new BufferedInputStream(fstResource.get())) {
DataInput in = new InputStreamDataInput(is);
fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
fst = new FST<>(readMetadata(in, PositiveIntOutputs.getSingleton()), in);
}
this.fst = new TokenInfoFST(fst);
}

View File

@@ -16,6 +16,8 @@
*/
package org.apache.lucene.backward_codecs.lucene40.blocktree;
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.IOException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
@@ -89,9 +91,17 @@ public final class FieldReader extends Terms {
final IndexInput clone = indexIn.clone();
clone.seek(indexStartFP);
if (metaIn == indexIn) { // Only true before Lucene 8.6
index = new FST<>(clone, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
index =
new FST<>(
readMetadata(clone, ByteSequenceOutputs.getSingleton()),
clone,
new OffHeapFSTStore());
} else {
index = new FST<>(metaIn, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
index =
new FST<>(
readMetadata(metaIn, ByteSequenceOutputs.getSingleton()),
clone,
new OffHeapFSTStore());
}
/*
if (false) {

View File

@@ -16,6 +16,8 @@
*/
package org.apache.lucene.codecs.blockterms;
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
@@ -154,7 +156,7 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase {
public FieldIndexData(IndexInput in, FieldInfo fieldInfo, long indexStart) throws IOException {
IndexInput clone = in.clone();
clone.seek(indexStart);
fst = new FST<>(clone, clone, fstOutputs);
fst = new FST<>(readMetadata(clone, fstOutputs), clone);
clone.close();
/*

View File

@@ -16,6 +16,8 @@
*/
package org.apache.lucene.codecs.blocktreeords;
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.IOException;
import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output;
import org.apache.lucene.index.FieldInfo;
@@ -85,7 +87,7 @@ final class OrdsFieldReader extends Terms {
final IndexInput clone = indexIn.clone();
// System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name);
clone.seek(indexStartFP);
index = new FST<>(clone, clone, OrdsBlockTreeTermsWriter.FST_OUTPUTS);
index = new FST<>(readMetadata(clone, OrdsBlockTreeTermsWriter.FST_OUTPUTS), clone);
/*
if (true) {

View File

@@ -194,7 +194,8 @@ public class FSTTermsReader extends FieldsProducer {
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
OffHeapFSTStore offHeapFSTStore = new OffHeapFSTStore();
this.dict = new FST<>(in, in, new FSTTermOutputs(fieldInfo), offHeapFSTStore);
FSTTermOutputs outputs = new FSTTermOutputs(fieldInfo);
this.dict = new FST<>(FST.readMetadata(in, outputs), in, offHeapFSTStore);
in.skipBytes(offHeapFSTStore.size());
}
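The off-heap call sites above follow the same shape, plus a skipBytes to move the input past the FST body that the store now maps. A rough sketch of that pattern, using ByteSequenceOutputs as a stand-in for the reader's own Outputs (FSTTermsReader itself uses its field-specific FSTTermOutputs):

import java.io.IOException;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.OffHeapFSTStore;

// Hypothetical helper mirroring the off-heap load pattern; ByteSequenceOutputs is
// assumed here purely for illustration.
final class OffHeapFstLoading {
  static FST<BytesRef> loadOffHeap(IndexInput in) throws IOException {
    OffHeapFSTStore store = new OffHeapFSTStore();
    FST.FSTMetadata<BytesRef> metadata =
        FST.readMetadata(in, ByteSequenceOutputs.getSingleton());
    FST<BytesRef> fst = new FST<>(metadata, in, store);
    in.skipBytes(store.size()); // leave the input positioned right after the FST body
    return fst;
  }
}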

View File

@@ -89,10 +89,11 @@ public class FSTDictionary implements IndexDictionary {
isFSTOnHeap = true;
}
PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton();
FST.FSTMetadata<Long> metadata = FST.readMetadata(fstDataInput, fstOutputs);
FST<Long> fst =
isFSTOnHeap
? new FST<>(fstDataInput, fstDataInput, fstOutputs)
: new FST<>(fstDataInput, fstDataInput, fstOutputs, new OffHeapFSTStore());
? new FST<>(metadata, fstDataInput)
: new FST<>(metadata, fstDataInput, new OffHeapFSTStore());
return new FSTDictionary(fst);
}

View File

@@ -91,7 +91,11 @@ public final class FieldReader extends Terms {
// Initialize FST always off-heap.
final IndexInput clone = indexIn.clone();
clone.seek(indexStartFP);
index = new FST<>(metaIn, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
index =
new FST<>(
FST.readMetadata(metaIn, ByteSequenceOutputs.getSingleton()),
clone,
new OffHeapFSTStore());
/*
if (false) {
final String dotFileName = segment + "_" + fieldInfo.name + ".dot";

View File

@@ -404,18 +404,8 @@ public final class FST<T> implements Accountable {
* Load a previously saved FST with a DataInput for metadata using an {@link OnHeapFSTStore} with
* maxBlockBits set to {@link #DEFAULT_MAX_BLOCK_BITS}
*/
public FST(DataInput metaIn, DataInput in, Outputs<T> outputs) throws IOException {
this(metaIn, in, outputs, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS));
}
/**
* Load a previously saved FST with a DataInput for metadata and a FSTStore. If using {@link
* OnHeapFSTStore}, setting maxBlockBits allows you to control the size of the byte[] pages used
* to hold the FST bytes.
*/
public FST(DataInput metaIn, DataInput in, Outputs<T> outputs, FSTStore fstStore)
throws IOException {
this(readMetadata(metaIn, outputs), in, outputs, fstStore);
public FST(FSTMetadata<T> metadata, DataInput in) throws IOException {
this(metadata, in, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS));
}
/**
@@ -423,15 +413,14 @@
* OnHeapFSTStore}, setting maxBlockBits allows you to control the size of the byte[] pages used
* to hold the FST bytes.
*/
public FST(FSTMetadata<T> metadata, DataInput in, Outputs<T> outputs, FSTStore fstStore)
throws IOException {
this(metadata, outputs, fstStore.init(in, metadata.numBytes));
public FST(FSTMetadata<T> metadata, DataInput in, FSTStore fstStore) throws IOException {
this(metadata, fstStore.init(in, metadata.numBytes));
}
/** Create the FST with a metadata object and a FSTReader. */
FST(FSTMetadata<T> metadata, Outputs<T> outputs, FSTReader fstReader) {
FST(FSTMetadata<T> metadata, FSTReader fstReader) {
this.metadata = metadata;
this.outputs = outputs;
this.outputs = metadata.outputs;
this.fstReader = fstReader;
}
@@ -486,7 +475,7 @@
}
long startNode = metaIn.readVLong();
long numBytes = metaIn.readVLong();
return new FSTMetadata<>(inputType, emptyOutput, startNode, version, numBytes);
return new FSTMetadata<>(inputType, outputs, emptyOutput, startNode, version, numBytes);
}
@Override
@@ -574,7 +563,7 @@
public static <T> FST<T> read(Path path, Outputs<T> outputs) throws IOException {
try (InputStream is = Files.newInputStream(path)) {
DataInput in = new InputStreamDataInput(new BufferedInputStream(is));
return new FST<>(in, in, outputs);
return new FST<>(readMetadata(in, outputs), in);
}
}
@@ -1202,6 +1191,7 @@ public final class FST<T> implements Accountable {
*/
public static final class FSTMetadata<T> {
final INPUT_TYPE inputType;
final Outputs<T> outputs;
final int version;
// if non-null, this FST accepts the empty string and
// produces this output
@@ -1210,8 +1200,14 @@
long numBytes;
public FSTMetadata(
INPUT_TYPE inputType, T emptyOutput, long startNode, int version, long numBytes) {
INPUT_TYPE inputType,
Outputs<T> outputs,
T emptyOutput,
long startNode,
int version,
long numBytes) {
this.inputType = inputType;
this.outputs = outputs;
this.emptyOutput = emptyOutput;
this.startNode = startNode;
this.version = version;
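Note that the public convenience entry point FST.read(Path, Outputs) keeps its signature; as the hunk above shows, it now just pairs readMetadata with the new (metadata, in) constructor internally. A short sketch of unchanged caller code (the class and method names here are illustrative):

import java.io.IOException;
import java.nio.file.Path;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;

// Callers of the path-based loader are unaffected by the constructor removal.
final class ReadFstFromPath {
  static FST<BytesRef> read(Path path) throws IOException {
    return FST.read(path, ByteSequenceOutputs.getSingleton());
  }
}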

View File

@@ -135,7 +135,7 @@ public class FSTCompiler<T> {
// pad: ensure no node gets address 0 which is reserved to mean
// the stop state w/ no arcs
bytes.writeByte((byte) 0);
fst = new FST<>(new FST.FSTMetadata<>(inputType, null, -1, VERSION_CURRENT, 0), outputs, bytes);
fst = new FST<>(new FST.FSTMetadata<>(inputType, outputs, null, -1, VERSION_CURRENT, 0), bytes);
if (suffixRAMLimitMB < 0) {
throw new IllegalArgumentException("ramLimitMB must be >= 0; got: " + suffixRAMLimitMB);
} else if (suffixRAMLimitMB > 0) {
@@ -702,21 +702,6 @@ public class FSTCompiler<T> {
* IntSequenceOutputs}) then you cannot reuse across calls.
*/
public void add(IntsRef input, T output) throws IOException {
/*
if (DEBUG) {
BytesRef b = new BytesRef(input.length);
for(int x=0;x<input.length;x++) {
b.bytes[x] = (byte) input.ints[x];
}
b.length = input.length;
if (output == NO_OUTPUT) {
System.out.println("\nFST ADD: input=" + toString(b) + " " + b);
} else {
System.out.println("\nFST ADD: input=" + toString(b) + " " + b + " output=" + fst.outputs.outputToString(output));
}
}
*/
// De-dup NO_OUTPUT since it must be a singleton:
if (output.equals(NO_OUTPUT)) {
output = NO_OUTPUT;

View File

@@ -141,7 +141,7 @@ public class Test2BFST extends LuceneTestCase {
fst.save(out, out);
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
fst = new FST<>(in, in, outputs);
fst = new FST<>(FST.readMetadata(in, outputs), in);
in.close();
} else {
dir.deleteFile("fst");
@@ -228,7 +228,7 @@ public class Test2BFST extends LuceneTestCase {
fst.save(out, out);
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
fst = new FST<>(in, in, outputs);
fst = new FST<>(FST.readMetadata(in, outputs), in);
in.close();
} else {
dir.deleteFile("fst");
@@ -320,7 +320,7 @@ public class Test2BFST extends LuceneTestCase {
fst.save(out, out);
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
fst = new FST<>(in, in, outputs);
fst = new FST<>(FST.readMetadata(in, outputs), in);
in.close();
} else {
dir.deleteFile("fst");

View File

@@ -16,6 +16,8 @@
*/
package org.apache.lucene.util.fst;
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
@@ -219,7 +221,7 @@ public class TestFSTDirectAddressing extends LuceneTestCase {
private static void countFSTArcs(String fstFilePath) throws IOException {
byte[] buf = Files.readAllBytes(Paths.get(fstFilePath));
DataInput in = new ByteArrayDataInput(buf);
FST<BytesRef> fst = new FST<>(in, in, ByteSequenceOutputs.getSingleton());
FST<BytesRef> fst = new FST<>(readMetadata(in, ByteSequenceOutputs.getSingleton()), in);
BytesRefFSTEnum<BytesRef> fstEnum = new BytesRefFSTEnum<>(fst);
int binarySearchArcCount = 0,
directAddressingArcCount = 0,
@@ -286,7 +288,8 @@ public class TestFSTDirectAddressing extends LuceneTestCase {
System.out.println("Reading FST");
long startTimeMs = System.nanoTime();
FST<CharsRef> originalFst = new FST<>(in, in, CharSequenceOutputs.getSingleton());
FST<CharsRef> originalFst =
new FST<>(readMetadata(in, CharSequenceOutputs.getSingleton()), in);
long endTimeMs = System.nanoTime();
System.out.println(
"time = " + TimeUnit.NANOSECONDS.toMillis(endTimeMs - startTimeMs) + " ms");

View File

@@ -1226,7 +1226,7 @@ public class TestFSTs extends LuceneTestCase {
// load the FST, which will force it to use FSTStore instead of BytesStore
ByteArrayDataInput in = new ByteArrayDataInput(outOS.toByteArray());
FST<Long> loadedFST = new FST<>(in, in, outputs);
FST<Long> loadedFST = new FST<>(FST.readMetadata(in, outputs), in);
// now save the FST again, this time to different DataOutput for meta
ByteArrayOutputStream metdataOS = new ByteArrayOutputStream();
@@ -1238,7 +1238,7 @@
// finally load it again
ByteArrayDataInput metaIn = new ByteArrayDataInput(metdataOS.toByteArray());
ByteArrayDataInput dataIn = new ByteArrayDataInput(dataOS.toByteArray());
loadedFST = new FST<>(metaIn, dataIn, outputs);
loadedFST = new FST<>(FST.readMetadata(metaIn, outputs), dataIn);
assertEquals(22L, Util.get(loadedFST, Util.toIntsRef(newBytesRef("aab"), scratch)).longValue());
assertEquals(7L, Util.get(loadedFST, Util.toIntsRef(newBytesRef("aac"), scratch)).longValue());
@@ -1299,7 +1299,7 @@
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
final FST<Long> fst2 = new FST<>(in, in, outputs);
final FST<Long> fst2 = new FST<>(FST.readMetadata(in, outputs), in);
checkStopNodes(fst2, outputs);
in.close();
dir.close();
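The save side is untouched: FST.save still takes separate DataOutputs for metadata and body, and reloading simply applies readMetadata to the metadata input before calling the two-argument constructor, as the test above does. A small round-trip sketch (buffer and helper names are illustrative, and PositiveIntOutputs is assumed):

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;

// Hypothetical round trip: write metadata and FST body to separate outputs, then
// reload with readMetadata plus the new two-argument constructor.
final class FstRoundTrip {
  static FST<Long> roundTrip(FST<Long> fst) throws IOException {
    ByteArrayOutputStream metaBytes = new ByteArrayOutputStream();
    ByteArrayOutputStream dataBytes = new ByteArrayOutputStream();
    fst.save(new OutputStreamDataOutput(metaBytes), new OutputStreamDataOutput(dataBytes));

    ByteArrayDataInput metaIn = new ByteArrayDataInput(metaBytes.toByteArray());
    ByteArrayDataInput dataIn = new ByteArrayDataInput(dataBytes.toByteArray());
    return new FST<>(FST.readMetadata(metaIn, PositiveIntOutputs.getSingleton()), dataIn);
  }
}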

View File

@@ -16,6 +16,8 @@
*/
package org.apache.lucene.demo.knn;
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
@@ -58,7 +60,7 @@ public class KnnVectorDict implements Closeable {
*/
public KnnVectorDict(Directory directory, String dictName) throws IOException {
try (IndexInput fstIn = directory.openInput(dictName + ".fst", IOContext.READ)) {
fst = new FST<>(fstIn, fstIn, PositiveIntOutputs.getSingleton());
fst = new FST<>(readMetadata(fstIn, PositiveIntOutputs.getSingleton()), fstIn);
}
vectors = directory.openInput(dictName + ".bin", IOContext.READ);

View File

@@ -16,6 +16,8 @@
*/
package org.apache.lucene.sandbox.codecs.idversion;
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.IOException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
@@ -86,7 +88,7 @@ final class VersionFieldReader extends Terms {
final IndexInput clone = indexIn.clone();
// System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name);
clone.seek(indexStartFP);
index = new FST<>(clone, clone, VersionBlockTreeTermsWriter.FST_OUTPUTS);
index = new FST<>(readMetadata(clone, VersionBlockTreeTermsWriter.FST_OUTPUTS), clone);
/*
if (false) {

View File

@@ -612,12 +612,9 @@ public class AnalyzingSuggester extends Lookup {
@Override
public boolean load(DataInput input) throws IOException {
count = input.readVLong();
this.fst =
new FST<>(
input,
input,
new PairOutputs<>(
PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
PairOutputs<Long, BytesRef> outputs =
new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
this.fst = new FST<>(FST.readMetadata(input, outputs), input);
maxAnalyzedPathsForOneInput = input.readVInt();
hasPayloads = input.readByte() == 1;
return true;

View File

@@ -20,6 +20,8 @@ package org.apache.lucene.search.suggest.analyzing;
// - test w/ syns
// - add pruning of low-freq ngrams?
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -384,7 +386,7 @@ public class FreeTextSuggester extends Lookup {
}
totTokens = input.readVLong();
fst = new FST<>(input, input, PositiveIntOutputs.getSingleton());
fst = new FST<>(readMetadata(input, PositiveIntOutputs.getSingleton()), input);
return true;
}

View File

@@ -337,25 +337,16 @@ public final class NRTSuggester implements Accountable {
*/
public static NRTSuggester load(IndexInput input, FSTLoadMode fstLoadMode) throws IOException {
final FST<Pair<Long, BytesRef>> fst;
PairOutputs<Long, BytesRef> outputs =
new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
if (shouldLoadFSTOffHeap(input, fstLoadMode)) {
OffHeapFSTStore store = new OffHeapFSTStore();
IndexInput clone = input.clone();
clone.seek(input.getFilePointer());
fst =
new FST<>(
clone,
clone,
new PairOutputs<>(
PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()),
store);
fst = new FST<>(FST.readMetadata(clone, outputs), clone, store);
input.seek(clone.getFilePointer() + store.size());
} else {
fst =
new FST<>(
input,
input,
new PairOutputs<>(
PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
fst = new FST<>(FST.readMetadata(input, outputs), input);
}
/* read some meta info */

View File

@@ -16,6 +16,8 @@
*/
package org.apache.lucene.search.suggest.fst;
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
@@ -301,7 +303,7 @@ public class FSTCompletionLookup extends Lookup {
public synchronized boolean load(DataInput input) throws IOException {
count = input.readVLong();
this.higherWeightsCompletion =
new FSTCompletion(new FST<>(input, input, NoOutputs.getSingleton()));
new FSTCompletion(new FST<>(readMetadata(input, NoOutputs.getSingleton()), input));
this.normalCompletion =
new FSTCompletion(higherWeightsCompletion.getFST(), false, exactMatchFirst);
return true;

View File

@@ -16,6 +16,8 @@
*/
package org.apache.lucene.search.suggest.fst;
import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
@@ -141,7 +143,7 @@ public class WFSTCompletionLookup extends Lookup {
@Override
public boolean load(DataInput input) throws IOException {
count = input.readVLong();
this.fst = new FST<>(input, input, PositiveIntOutputs.getSingleton());
this.fst = new FST<>(readMetadata(input, PositiveIntOutputs.getSingleton()), input);
return true;
}

View File

@@ -281,7 +281,7 @@ public class FSTTester<T> {
fst.save(out, out);
}
try (IndexInput in = dir.openInput("fst.bin", context)) {
fst = new FST<>(in, in, outputs);
fst = new FST<>(FST.readMetadata(in, outputs), in);
} finally {
dir.deleteFile("fst.bin");
}