LUCENE-9353: Move terms metadata to its own file. (#1473)

Adrien Grand, 2020-06-16 15:05:28 +02:00 (committed by GitHub)
parent c083e5414e
commit 87a3bef50f
26 changed files with 228 additions and 219 deletions


@ -200,6 +200,10 @@ Improvements
* LUCENE-9342: TotalHits' relation will be EQUAL_TO when the number of hits is lower than TopDocsCollector's numHits
(Tomás Fernández Löbbe)
* LUCENE-9353: Metadata of the terms dictionary moved to its own file, with the
`.tmd` extension. This allows checksums of metadata to be verified when
opening indices and helps save seeks when opening an index. (Adrien Grand)
* LUCENE-9359: SegmentInfos#readCommit now always returns a
CorruptIndexException if the content of the file is invalid. (Adrien Grand)
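
The LUCENE-9353 entry above hinges on an FST serialization change that recurs throughout this diff: `FST.save` and the loading constructor now take separate metadata and data streams, so the terms metadata can live in the new `.tmd` file while the FST body stays in the index file. Callers that keep both in one stream simply pass it twice. Below is a minimal round-trip sketch against that 8.6-era API; it is not code from this commit, and the class name, terms, and output values are made up for illustration:

```java
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

public class FstRoundTrip { // hypothetical demo class, not part of the commit
  public static void main(String[] args) throws Exception {
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();

    // Build a tiny FST mapping two terms (added in sorted order) to longs.
    FSTCompiler<Long> compiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
    IntsRefBuilder scratch = new IntsRefBuilder();
    compiler.add(Util.toIntsRef(new BytesRef("cat"), scratch), 5L);
    compiler.add(Util.toIntsRef(new BytesRef("dog"), scratch), 7L);
    FST<Long> fst = compiler.compile();

    // Metadata and arcs are written through separate arguments; here both
    // land in the same in-memory output, so the same stream is passed twice.
    ByteBuffersDataOutput out = new ByteBuffersDataOutput();
    fst.save(out, out);

    // Reload: again one stream serves as both metaIn and dataIn.
    DataInput in = out.toDataInput();
    FST<Long> loaded = new FST<>(in, in, outputs);
    System.out.println(Util.get(loaded, new BytesRef("dog"))); // prints 7
  }
}
```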


@ -21,6 +21,7 @@ import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.IOException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
@ -44,7 +45,8 @@ public final class TokenInfoDictionary extends BinaryDictionary {
super(resourceScheme, resourcePath);
FST<Long> fst;
try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
fst = new FST<>(new InputStreamDataInput(is), PositiveIntOutputs.getSingleton());
DataInput in = new InputStreamDataInput(is);
fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
}
// TODO: some way to configure?
this.fst = new TokenInfoFST(fst, true);


@ -20,6 +20,7 @@ import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.IOException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
@ -47,7 +48,8 @@ public final class TokenInfoDictionary extends BinaryDictionary {
super(resourceScheme, resourcePath);
FST<Long> fst;
try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
fst = new FST<>(new InputStreamDataInput(is), PositiveIntOutputs.getSingleton());
DataInput in = new InputStreamDataInput(is);
fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
}
this.fst = new TokenInfoFST(fst);
}


@ -148,7 +148,7 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase {
public FieldIndexData(IndexInput in, FieldInfo fieldInfo, long indexStart) throws IOException {
IndexInput clone = in.clone();
clone.seek(indexStart);
fst = new FST<>(clone, fstOutputs);
fst = new FST<>(clone, clone, fstOutputs);
clone.close();
/*


@ -280,7 +280,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
public void finish(long termsFilePointer) throws IOException {
fst = fstCompiler.compile();
if (fst != null) {
fst.save(out);
fst.save(out, out);
}
}
}


@ -832,7 +832,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
// Write FST to index
indexStartFP = indexOut.getFilePointer();
root.index.save(indexOut);
root.index.save(indexOut, indexOut);
//System.out.println(" write FST " + indexStartFP + " field=" + fieldInfo.name);
// if (SAVE_DOT_FILES || DEBUG) {


@ -78,7 +78,7 @@ final class OrdsFieldReader extends Terms implements Accountable {
final IndexInput clone = indexIn.clone();
//System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name);
clone.seek(indexStartFP);
index = new FST<>(clone, OrdsBlockTreeTermsWriter.FST_OUTPUTS);
index = new FST<>(clone, clone, OrdsBlockTreeTermsWriter.FST_OUTPUTS);
/*
if (true) {


@ -176,7 +176,7 @@ public class FSTTermsReader extends FieldsProducer {
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
this.dict = new FST<>(in, new FSTTermOutputs(fieldInfo));
this.dict = new FST<>(in, in, new FSTTermOutputs(fieldInfo));
}
@Override


@ -209,7 +209,7 @@ public class FSTTermsWriter extends FieldsConsumer {
}
out.writeVLong(field.sumDocFreq);
out.writeVInt(field.docCount);
field.dict.save(out);
field.dict.save(out, out);
}
writeTrailer(out, dirStart);
CodecUtil.writeFooter(out);


@ -71,10 +71,10 @@ public class FSTDictionary implements IndexDictionary {
@Override
public void write(DataOutput output, BlockEncoder blockEncoder) throws IOException {
if (blockEncoder == null) {
fst.save(output);
fst.save(output, output);
} else {
ByteBuffersDataOutput bytesDataOutput = ByteBuffersDataOutput.newResettableInstance();
fst.save(bytesDataOutput);
fst.save(bytesDataOutput, bytesDataOutput);
BlockEncoder.WritableBytes encodedBytes = blockEncoder.encode(bytesDataOutput.toDataInput(), bytesDataOutput.size());
output.writeVLong(encodedBytes.size());
encodedBytes.writeTo(output);
@ -98,8 +98,8 @@ public class FSTDictionary implements IndexDictionary {
isFSTOnHeap = true;
}
PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton();
FST<Long> fst = isFSTOnHeap ? new FST<>(fstDataInput, fstOutputs)
: new FST<>(fstDataInput, fstOutputs, new OffHeapFSTStore());
FST<Long> fst = isFSTOnHeap ? new FST<>(fstDataInput, fstDataInput, fstOutputs)
: new FST<>(fstDataInput, fstDataInput, fstOutputs, new OffHeapFSTStore());
return new FSTDictionary(fst);
}


@ -16,7 +16,6 @@
*/
package org.apache.lucene.codecs.blocktree;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
@ -35,6 +34,7 @@ import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
@ -97,13 +97,20 @@ public final class BlockTreeTermsReader extends FieldsProducer {
/** Suffixes are compressed to save space. */
public static final int VERSION_COMPRESSED_SUFFIXES = 5;
/** Metadata is written to its own file. */
public static final int VERSION_META_FILE = 6;
/** Current terms format. */
public static final int VERSION_CURRENT = VERSION_COMPRESSED_SUFFIXES;
public static final int VERSION_CURRENT = VERSION_META_FILE;
/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tip";
final static String TERMS_INDEX_CODEC_NAME = "BlockTreeTermsIndex";
/** Extension of terms meta file */
static final String TERMS_META_EXTENSION = "tmd";
final static String TERMS_META_CODEC_NAME = "BlockTreeTermsMeta";
// Open input to the main terms dict file (_X.tib)
final IndexInput termsIn;
// Open input to the terms index file (_X.tip)
@ -128,9 +135,9 @@ public final class BlockTreeTermsReader extends FieldsProducer {
this.postingsReader = postingsReader;
this.segment = state.segmentInfo.name;
String termsName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION);
try {
String termsName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION);
termsIn = state.directory.openInput(termsName, state.context);
version = CodecUtil.checkIndexHeader(termsIn, TERMS_CODEC_NAME, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
@ -138,66 +145,106 @@ public final class BlockTreeTermsReader extends FieldsProducer {
indexIn = state.directory.openInput(indexName, state.context);
CodecUtil.checkIndexHeader(indexIn, TERMS_INDEX_CODEC_NAME, version, version, state.segmentInfo.getId(), state.segmentSuffix);
// Have PostingsReader init itself
postingsReader.init(termsIn, state);
if (version < VERSION_META_FILE) {
// Have PostingsReader init itself
postingsReader.init(termsIn, state);
// Verifying the checksum against all bytes would be too costly, but for now we at least
// verify proper structure of the checksum footer. This is cheap and can detect some forms
// of corruption such as file truncation.
CodecUtil.retrieveChecksum(indexIn);
CodecUtil.retrieveChecksum(termsIn);
// Verifying the checksum against all bytes would be too costly, but for now we at least
// verify proper structure of the checksum footer. This is cheap and can detect some forms
// of corruption such as file truncation.
CodecUtil.retrieveChecksum(indexIn);
CodecUtil.retrieveChecksum(termsIn);
}
// Read per-field details
seekDir(termsIn);
seekDir(indexIn);
String metaName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_META_EXTENSION);
Map<String, FieldReader> fieldMap = null;
Throwable priorE = null;
long indexLength = -1, termsLength = -1;
try (ChecksumIndexInput metaIn = version >= VERSION_META_FILE ? state.directory.openChecksumInput(metaName, state.context) : null) {
try {
final IndexInput indexMetaIn, termsMetaIn;
if (version >= VERSION_META_FILE) {
CodecUtil.checkIndexHeader(metaIn, TERMS_META_CODEC_NAME, version, version, state.segmentInfo.getId(), state.segmentSuffix);
indexMetaIn = termsMetaIn = metaIn;
postingsReader.init(metaIn, state);
} else {
seekDir(termsIn);
seekDir(indexIn);
indexMetaIn = indexIn;
termsMetaIn = termsIn;
}
final int numFields = termsIn.readVInt();
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, termsIn);
}
fieldMap = new HashMap<>((int) (numFields / 0.75f) + 1);
for (int i = 0; i < numFields; ++i) {
final int field = termsIn.readVInt();
final long numTerms = termsIn.readVLong();
if (numTerms <= 0) {
throw new CorruptIndexException("Illegal numTerms for field number: " + field, termsIn);
}
final BytesRef rootCode = readBytesRef(termsIn);
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
throw new CorruptIndexException("invalid field number: " + field, termsIn);
}
final long sumTotalTermFreq = termsIn.readVLong();
// when frequencies are omitted, sumDocFreq=sumTotalTermFreq and only one value is written.
final long sumDocFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : termsIn.readVLong();
final int docCount = termsIn.readVInt();
if (version < VERSION_META_LONGS_REMOVED) {
final int longsSize = termsIn.readVInt();
if (longsSize < 0) {
throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsIn);
final int numFields = termsMetaIn.readVInt();
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, termsMetaIn);
}
fieldMap = new HashMap<>((int) (numFields / 0.75f) + 1);
for (int i = 0; i < numFields; ++i) {
final int field = termsMetaIn.readVInt();
final long numTerms = termsMetaIn.readVLong();
if (numTerms <= 0) {
throw new CorruptIndexException("Illegal numTerms for field number: " + field, termsMetaIn);
}
final BytesRef rootCode = readBytesRef(termsMetaIn);
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
throw new CorruptIndexException("invalid field number: " + field, termsMetaIn);
}
final long sumTotalTermFreq = termsMetaIn.readVLong();
// when frequencies are omitted, sumDocFreq=sumTotalTermFreq and only one value is written.
final long sumDocFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : termsMetaIn.readVLong();
final int docCount = termsMetaIn.readVInt();
if (version < VERSION_META_LONGS_REMOVED) {
final int longsSize = termsMetaIn.readVInt();
if (longsSize < 0) {
throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsMetaIn);
}
}
BytesRef minTerm = readBytesRef(termsMetaIn);
BytesRef maxTerm = readBytesRef(termsMetaIn);
if (docCount < 0 || docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs
throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.maxDoc(), termsMetaIn);
}
if (sumDocFreq < docCount) { // #postings must be >= #docs with field
throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, termsMetaIn);
}
if (sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings
throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, termsMetaIn);
}
final long indexStartFP = indexMetaIn.readVLong();
FieldReader previous = fieldMap.put(fieldInfo.name,
new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount,
indexStartFP, indexMetaIn, indexIn, minTerm, maxTerm));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsMetaIn);
}
}
if (version >= VERSION_META_FILE) {
indexLength = metaIn.readLong();
termsLength = metaIn.readLong();
}
} catch (Throwable exception) {
priorE = exception;
} finally {
if (metaIn != null) {
CodecUtil.checkFooter(metaIn, priorE);
} else if (priorE != null) {
IOUtils.rethrowAlways(priorE);
}
}
BytesRef minTerm = readBytesRef(termsIn);
BytesRef maxTerm = readBytesRef(termsIn);
if (docCount < 0 || docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs
throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.maxDoc(), termsIn);
}
if (sumDocFreq < docCount) { // #postings must be >= #docs with field
throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, termsIn);
}
if (sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings
throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, termsIn);
}
final long indexStartFP = indexIn.readVLong();
FieldReader previous = fieldMap.put(fieldInfo.name,
new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount,
indexStartFP, indexIn, minTerm, maxTerm));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsIn);
}
}
if (version >= VERSION_META_FILE) {
// At this point the checksum of the meta file has been verified so the lengths are likely correct
CodecUtil.retrieveChecksum(indexIn, indexLength);
CodecUtil.retrieveChecksum(termsIn, termsLength);
} else {
assert indexLength == -1 : indexLength;
assert termsLength == -1 : termsLength;
}
List<String> fieldList = new ArrayList<>(fieldMap.keySet());
fieldList.sort(null);
this.fieldMap = fieldMap;
this.fieldList = Collections.unmodifiableList(fieldList);
success = true;
} finally {


@ -211,6 +211,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
//private final static boolean SAVE_DOT_FILES = false;
private final IndexOutput metaOut;
private final IndexOutput termsOut;
private final IndexOutput indexOut;
final int maxDoc;
@ -220,34 +221,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
final PostingsWriterBase postingsWriter;
final FieldInfos fieldInfos;
private static class FieldMetaData {
public final FieldInfo fieldInfo;
public final BytesRef rootCode;
public final long numTerms;
public final long indexStartFP;
public final long sumTotalTermFreq;
public final long sumDocFreq;
public final int docCount;
public final BytesRef minTerm;
public final BytesRef maxTerm;
public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long indexStartFP, long sumTotalTermFreq, long sumDocFreq, int docCount,
BytesRef minTerm, BytesRef maxTerm) {
assert numTerms > 0;
this.fieldInfo = fieldInfo;
assert rootCode != null: "field=" + fieldInfo.name + " numTerms=" + numTerms;
this.rootCode = rootCode;
this.indexStartFP = indexStartFP;
this.numTerms = numTerms;
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
this.minTerm = minTerm;
this.maxTerm = maxTerm;
}
}
private final List<FieldMetaData> fields = new ArrayList<>();
private final List<ByteBuffersDataOutput> fields = new ArrayList<>();
/** Create a new writer. The number of items (terms or
* sub-blocks) per block will aim to be between
@ -272,7 +246,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
final String termsName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockTreeTermsReader.TERMS_EXTENSION);
termsOut = state.directory.createOutput(termsName, state.context);
boolean success = false;
IndexOutput indexOut = null;
IndexOutput metaOut = null, indexOut = null;
try {
CodecUtil.writeIndexHeader(termsOut, BlockTreeTermsReader.TERMS_CODEC_NAME, BlockTreeTermsReader.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
@ -283,27 +257,23 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
state.segmentInfo.getId(), state.segmentSuffix);
//segment = state.segmentInfo.name;
postingsWriter.init(termsOut, state); // have consumer write its format/header
final String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, BlockTreeTermsReader.TERMS_META_EXTENSION);
metaOut = state.directory.createOutput(metaName, state.context);
CodecUtil.writeIndexHeader(metaOut, BlockTreeTermsReader.TERMS_META_CODEC_NAME, BlockTreeTermsReader.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
postingsWriter.init(metaOut, state); // have consumer write its format/header
this.metaOut = metaOut;
this.indexOut = indexOut;
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(termsOut, indexOut);
IOUtils.closeWhileHandlingException(metaOut, termsOut, indexOut);
}
}
}
/** Writes the terms file trailer. */
private void writeTrailer(IndexOutput out, long dirStart) throws IOException {
out.writeLong(dirStart);
}
/** Writes the index file trailer. */
private void writeIndexTrailer(IndexOutput indexOut, long dirStart) throws IOException {
indexOut.writeLong(dirStart);
}
/** Throws {@code IllegalArgumentException} if any of these settings
* is invalid. */
public static void validateSettings(int minItemsInBlock, int maxItemsInBlock) {
@ -548,7 +518,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
final FixedBitSet docsSeen;
long sumTotalTermFreq;
long sumDocFreq;
long indexStartFP;
// Records index into pending where the current prefix at that
// length "started"; for example, if current term starts with 't',
@ -1006,11 +975,27 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
assert pending.size() == 1 && !pending.get(0).isTerm: "pending.size()=" + pending.size() + " pending=" + pending;
final PendingBlock root = (PendingBlock) pending.get(0);
assert root.prefix.length == 0;
assert root.index.getEmptyOutput() != null;
final BytesRef rootCode = root.index.getEmptyOutput();
assert rootCode != null;
ByteBuffersDataOutput metaOut = new ByteBuffersDataOutput();
fields.add(metaOut);
metaOut.writeVInt(fieldInfo.number);
metaOut.writeVLong(numTerms);
metaOut.writeVInt(rootCode.length);
metaOut.writeBytes(rootCode.bytes, rootCode.offset, rootCode.length);
assert fieldInfo.getIndexOptions() != IndexOptions.NONE;
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
metaOut.writeVLong(sumTotalTermFreq);
}
metaOut.writeVLong(sumDocFreq);
metaOut.writeVInt(docsSeen.cardinality());
writeBytesRef(metaOut, new BytesRef(firstPendingTerm.termBytes));
writeBytesRef(metaOut, new BytesRef(lastPendingTerm.termBytes));
metaOut.writeVLong(indexOut.getFilePointer());
// Write FST to index
indexStartFP = indexOut.getFilePointer();
root.index.save(indexOut);
root.index.save(metaOut, indexOut);
//System.out.println(" write FST " + indexStartFP + " field=" + fieldInfo.name);
/*
@ -1022,20 +1007,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
w.close();
}
*/
assert firstPendingTerm != null;
BytesRef minTerm = new BytesRef(firstPendingTerm.termBytes);
assert lastPendingTerm != null;
BytesRef maxTerm = new BytesRef(lastPendingTerm.termBytes);
fields.add(new FieldMetaData(fieldInfo,
((PendingBlock) pending.get(0)).index.getEmptyOutput(),
numTerms,
indexStartFP,
sumTotalTermFreq,
sumDocFreq,
docsSeen.cardinality(),
minTerm, maxTerm));
} else {
assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS && sumTotalTermFreq == -1;
assert sumDocFreq == 0;
@ -1060,47 +1032,29 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
return;
}
closed = true;
boolean success = false;
try {
final long dirStart = termsOut.getFilePointer();
final long indexDirStart = indexOut.getFilePointer();
termsOut.writeVInt(fields.size());
for(FieldMetaData field : fields) {
//System.out.println(" field " + field.fieldInfo.name + " " + field.numTerms + " terms");
termsOut.writeVInt(field.fieldInfo.number);
assert field.numTerms > 0;
termsOut.writeVLong(field.numTerms);
termsOut.writeVInt(field.rootCode.length);
termsOut.writeBytes(field.rootCode.bytes, field.rootCode.offset, field.rootCode.length);
assert field.fieldInfo.getIndexOptions() != IndexOptions.NONE;
if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
termsOut.writeVLong(field.sumTotalTermFreq);
}
termsOut.writeVLong(field.sumDocFreq);
termsOut.writeVInt(field.docCount);
indexOut.writeVLong(field.indexStartFP);
writeBytesRef(termsOut, field.minTerm);
writeBytesRef(termsOut, field.maxTerm);
metaOut.writeVInt(fields.size());
for (ByteBuffersDataOutput fieldMeta : fields) {
fieldMeta.copyTo(metaOut);
}
writeTrailer(termsOut, dirStart);
CodecUtil.writeFooter(termsOut);
writeIndexTrailer(indexOut, indexDirStart);
CodecUtil.writeFooter(indexOut);
metaOut.writeLong(indexOut.getFilePointer());
CodecUtil.writeFooter(termsOut);
metaOut.writeLong(termsOut.getFilePointer());
CodecUtil.writeFooter(metaOut);
success = true;
} finally {
if (success) {
IOUtils.close(termsOut, indexOut, postingsWriter);
IOUtils.close(metaOut, termsOut, indexOut, postingsWriter);
} else {
IOUtils.closeWhileHandlingException(termsOut, indexOut, postingsWriter);
IOUtils.closeWhileHandlingException(metaOut, termsOut, indexOut, postingsWriter);
}
}
}
private static void writeBytesRef(IndexOutput out, BytesRef bytes) throws IOException {
private static void writeBytesRef(DataOutput out, BytesRef bytes) throws IOException {
out.writeVInt(bytes.length);
out.writeBytes(bytes.bytes, bytes.offset, bytes.length);
}


@ -52,7 +52,6 @@ public final class FieldReader extends Terms implements Accountable {
final long sumTotalTermFreq;
final long sumDocFreq;
final int docCount;
final long indexStartFP;
final long rootBlockFP;
final BytesRef rootCode;
final BytesRef minTerm;
@ -63,7 +62,7 @@ public final class FieldReader extends Terms implements Accountable {
//private boolean DEBUG;
FieldReader(BlockTreeTermsReader parent, FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount,
long indexStartFP, IndexInput indexIn, BytesRef minTerm, BytesRef maxTerm) throws IOException {
long indexStartFP, IndexInput metaIn, IndexInput indexIn, BytesRef minTerm, BytesRef maxTerm) throws IOException {
assert numTerms > 0;
this.fieldInfo = fieldInfo;
//DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id");
@ -72,7 +71,6 @@ public final class FieldReader extends Terms implements Accountable {
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
this.indexStartFP = indexStartFP;
this.rootCode = rootCode;
this.minTerm = minTerm;
this.maxTerm = maxTerm;
@ -81,22 +79,22 @@ public final class FieldReader extends Terms implements Accountable {
// }
rootBlockFP = (new ByteArrayDataInput(rootCode.bytes, rootCode.offset, rootCode.length)).readVLong() >>> BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
// Initialize FST always off-heap.
if (indexIn != null) {
final IndexInput clone = indexIn.clone();
clone.seek(indexStartFP);
index = new FST<>(clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
/*
if (false) {
final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
Util.toDot(index, w, false, false);
System.out.println("FST INDEX: SAVED to " + dotFileName);
w.close();
}
*/
final IndexInput clone = indexIn.clone();
clone.seek(indexStartFP);
if (metaIn == indexIn) { // Only true before Lucene 8.6
index = new FST<>(clone, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
} else {
index = null;
index = new FST<>(metaIn, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
}
/*
if (false) {
final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
Util.toDot(index, w, false, false);
System.out.println("FST INDEX: SAVED to " + dotFileName);
w.close();
}
*/
}
@Override
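
In the new FieldReader above, the FST metadata is read from the already-checksummed `.tmd` input while the FST body stays off heap in the terms index file. The following sketch isolates that split-stream pattern; the helper class and parameter names are hypothetical stand-ins for the reader's fields, not an excerpt from the commit:

```java
import java.io.IOException;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.OffHeapFSTStore;

class TermsIndexFstLoader { // hypothetical helper for illustration only
  // metaIn is positioned at the FST metadata inside the .tmd file; indexIn is
  // the .tip file holding the FST body. A clone of indexIn is seeked to the
  // field's index start and handed to OffHeapFSTStore, so the arc bytes are
  // memory-mapped rather than copied onto the heap.
  static FST<BytesRef> load(IndexInput metaIn, IndexInput indexIn, long indexStartFP) throws IOException {
    IndexInput clone = indexIn.clone();
    clone.seek(indexStartFP);
    return new FST<>(metaIn, clone, ByteSequenceOutputs.getSingleton(), new OffHeapFSTStore());
  }
}
```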


@ -408,26 +408,26 @@ public final class FST<T> implements Accountable {
private static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 : 28;
/** Load a previously saved FST. */
public FST(DataInput in, Outputs<T> outputs) throws IOException {
this(in, outputs, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS));
public FST(DataInput metaIn, DataInput in, Outputs<T> outputs) throws IOException {
this(metaIn, in, outputs, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS));
}
/** Load a previously saved FST; maxBlockBits allows you to
* control the size of the byte[] pages used to hold the FST bytes. */
public FST(DataInput in, Outputs<T> outputs, FSTStore fstStore) throws IOException {
public FST(DataInput metaIn, DataInput in, Outputs<T> outputs, FSTStore fstStore) throws IOException {
bytes = null;
this.fstStore = fstStore;
this.outputs = outputs;
// NOTE: only reads formats VERSION_START up to VERSION_CURRENT; we don't have
// back-compat promise for FSTs (they are experimental), but we are sometimes able to offer it
CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_START, VERSION_CURRENT);
if (in.readByte() == 1) {
CodecUtil.checkHeader(metaIn, FILE_FORMAT_NAME, VERSION_START, VERSION_CURRENT);
if (metaIn.readByte() == 1) {
// accepts empty string
// 1 KB blocks:
BytesStore emptyBytes = new BytesStore(10);
int numBytes = in.readVInt();
emptyBytes.copyBytes(in, numBytes);
int numBytes = metaIn.readVInt();
emptyBytes.copyBytes(metaIn, numBytes);
// De-serialize empty-string output:
BytesReader reader = emptyBytes.getReverseReader();
@ -441,7 +441,7 @@ public final class FST<T> implements Accountable {
} else {
emptyOutput = null;
}
final byte t = in.readByte();
final byte t = metaIn.readByte();
switch(t) {
case 0:
inputType = INPUT_TYPE.BYTE1;
@ -455,9 +455,9 @@ public final class FST<T> implements Accountable {
default:
throw new CorruptIndexException("invalid input type " + t, in);
}
startNode = in.readVLong();
startNode = metaIn.readVLong();
long numBytes = in.readVLong();
long numBytes = metaIn.readVLong();
this.fstStore.init(in, numBytes);
}
@ -502,16 +502,16 @@ public final class FST<T> implements Accountable {
}
}
public void save(DataOutput out) throws IOException {
public void save(DataOutput metaOut, DataOutput out) throws IOException {
if (startNode == -1) {
throw new IllegalStateException("call finish first");
}
CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT);
CodecUtil.writeHeader(metaOut, FILE_FORMAT_NAME, VERSION_CURRENT);
// TODO: really we should encode this as an arc, arriving
// to the root node, instead of special casing here:
if (emptyOutput != null) {
// Accepts empty string
out.writeByte((byte) 1);
metaOut.writeByte((byte) 1);
// Serialize empty-string output:
ByteBuffersDataOutput ros = new ByteBuffersDataOutput();
@ -528,10 +528,10 @@ public final class FST<T> implements Accountable {
emptyOutputBytes[emptyLen - upto - 1] = b;
upto++;
}
out.writeVInt(emptyLen);
out.writeBytes(emptyOutputBytes, 0, emptyLen);
metaOut.writeVInt(emptyLen);
metaOut.writeBytes(emptyOutputBytes, 0, emptyLen);
} else {
out.writeByte((byte) 0);
metaOut.writeByte((byte) 0);
}
final byte t;
if (inputType == INPUT_TYPE.BYTE1) {
@ -541,11 +541,11 @@ public final class FST<T> implements Accountable {
} else {
t = 2;
}
out.writeByte(t);
out.writeVLong(startNode);
metaOut.writeByte(t);
metaOut.writeVLong(startNode);
if (bytes != null) {
long numBytes = bytes.getPosition();
out.writeVLong(numBytes);
metaOut.writeVLong(numBytes);
bytes.writeTo(out);
} else {
assert fstStore != null;
@ -558,7 +558,8 @@ public final class FST<T> implements Accountable {
*/
public void save(final Path path) throws IOException {
try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(path))) {
save(new OutputStreamDataOutput(os));
DataOutput out = new OutputStreamDataOutput(os);
save(out, out);
}
}
@ -567,7 +568,8 @@ public final class FST<T> implements Accountable {
*/
public static <T> FST<T> read(Path path, Outputs<T> outputs) throws IOException {
try (InputStream is = Files.newInputStream(path)) {
return new FST<>(new InputStreamDataInput(new BufferedInputStream(is)), outputs);
DataInput in = new InputStreamDataInput(new BufferedInputStream(is));
return new FST<>(in, in, outputs);
}
}


@ -119,10 +119,10 @@ public class Test2BFST extends LuceneTestCase {
if (verify == 0) {
System.out.println("\nTEST: save/load FST and re-verify");
IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
fst.save(out);
fst.save(out, out);
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
fst = new FST<>(in, outputs);
fst = new FST<>(in, in, outputs);
in.close();
} else {
dir.deleteFile("fst");
@ -198,10 +198,10 @@ public class Test2BFST extends LuceneTestCase {
if (verify == 0) {
System.out.println("\nTEST: save/load FST and re-verify");
IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
fst.save(out);
fst.save(out, out);
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
fst = new FST<>(in, outputs);
fst = new FST<>(in, in, outputs);
in.close();
} else {
dir.deleteFile("fst");
@ -286,10 +286,10 @@ public class Test2BFST extends LuceneTestCase {
if (verify == 0) {
System.out.println("\nTEST: save/load FST and re-verify");
IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
fst.save(out);
fst.save(out, out);
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
fst = new FST<>(in, outputs);
fst = new FST<>(in, in, outputs);
in.close();
} else {
dir.deleteFile("fst");


@ -174,7 +174,7 @@ public class TestFSTDirectAddressing extends LuceneTestCase {
private static void countFSTArcs(String fstFilePath) throws IOException {
byte[] buf = Files.readAllBytes(Paths.get(fstFilePath));
DataInput in = new ByteArrayDataInput(buf);
FST<BytesRef> fst = new FST<>(in, ByteSequenceOutputs.getSingleton());
FST<BytesRef> fst = new FST<>(in, in, ByteSequenceOutputs.getSingleton());
BytesRefFSTEnum<BytesRef> fstEnum = new BytesRefFSTEnum<>(fst);
int binarySearchArcCount = 0, directAddressingArcCount = 0, listArcCount = 0;
while(fstEnum.next() != null) {
@ -228,7 +228,7 @@ public class TestFSTDirectAddressing extends LuceneTestCase {
System.out.println("Reading FST");
long startTimeMs = System.currentTimeMillis();
FST<CharsRef> originalFst = new FST<>(in, CharSequenceOutputs.getSingleton());
FST<CharsRef> originalFst = new FST<>(in, in, CharSequenceOutputs.getSingleton());
long endTimeMs = System.currentTimeMillis();
System.out.println("time = " + (endTimeMs - startTimeMs) + " ms");


@ -529,7 +529,7 @@ public class TestFSTs extends LuceneTestCase {
Directory dir = FSDirectory.open(dirOut);
IndexOutput out = dir.createOutput("fst.bin", IOContext.DEFAULT);
fst.save(out);
fst.save(out, out);
out.close();
System.out.println("Saved FST to fst.bin.");
@ -1195,11 +1195,11 @@ public class TestFSTs extends LuceneTestCase {
// Make sure it still works after save/load:
Directory dir = newDirectory();
IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
fst.save(out);
fst.save(out, out);
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
final FST<Long> fst2 = new FST<>(in, outputs);
final FST<Long> fst2 = new FST<>(in, in, outputs);
checkStopNodes(fst2, outputs);
in.close();
dir.close();


@ -779,7 +779,7 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer {
// Write FST to index
indexStartFP = indexOut.getFilePointer();
root.index.save(indexOut);
root.index.save(indexOut, indexOut);
//System.out.println(" write FST " + indexStartFP + " field=" + fieldInfo.name);
// if (SAVE_DOT_FILES || DEBUG) {


@ -74,7 +74,7 @@ final class VersionFieldReader extends Terms implements Accountable {
final IndexInput clone = indexIn.clone();
//System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name);
clone.seek(indexStartFP);
index = new FST<>(clone, VersionBlockTreeTermsWriter.FST_OUTPUTS);
index = new FST<>(clone, clone, VersionBlockTreeTermsWriter.FST_OUTPUTS);
/*
if (false) {


@ -598,7 +598,7 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
return false;
}
fst.save(output);
fst.save(output, output);
output.writeVInt(maxAnalyzedPathsForOneInput);
output.writeByte((byte) (hasPayloads ? 1 : 0));
return true;
@ -607,7 +607,7 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
@Override
public boolean load(DataInput input) throws IOException {
count = input.readVLong();
this.fst = new FST<>(input, new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
this.fst = new FST<>(input, input, new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
maxAnalyzedPathsForOneInput = input.readVInt();
hasPayloads = input.readByte() == 1;
return true;


@ -360,7 +360,7 @@ public class FreeTextSuggester extends Lookup implements Accountable {
output.writeByte(separator);
output.writeVInt(grams);
output.writeVLong(totTokens);
fst.save(output);
fst.save(output, output);
return true;
}
@ -378,7 +378,7 @@ public class FreeTextSuggester extends Lookup implements Accountable {
}
totTokens = input.readVLong();
fst = new FST<>(input, PositiveIntOutputs.getSingleton());
fst = new FST<>(input, input, PositiveIntOutputs.getSingleton());
return true;
}


@ -324,11 +324,11 @@ public final class NRTSuggester implements Accountable {
OffHeapFSTStore store = new OffHeapFSTStore();
IndexInput clone = input.clone();
clone.seek(input.getFilePointer());
fst = new FST<>(clone, new PairOutputs<>(
fst = new FST<>(clone, clone, new PairOutputs<>(
PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()), store);
input.seek(clone.getFilePointer() + store.size());
} else {
fst = new FST<>(input, new PairOutputs<>(
fst = new FST<>(input, input, new PairOutputs<>(
PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
}


@ -123,7 +123,7 @@ final class NRTSuggesterBuilder {
if (fst == null) {
return false;
}
fst.save(output);
fst.save(output, output);
/* write some more meta-info */
assert maxAnalyzedPathsPerOutput > 0;


@ -298,7 +298,7 @@ public class FSTCompletionLookup extends Lookup implements Accountable {
if (normalCompletion == null || normalCompletion.getFST() == null) {
return false;
}
normalCompletion.getFST().save(output);
normalCompletion.getFST().save(output, output);
return true;
}
@ -306,7 +306,7 @@ public class FSTCompletionLookup extends Lookup implements Accountable {
public synchronized boolean load(DataInput input) throws IOException {
count = input.readVLong();
this.higherWeightsCompletion = new FSTCompletion(new FST<>(
input, NoOutputs.getSingleton()));
input, input, NoOutputs.getSingleton()));
this.normalCompletion = new FSTCompletion(
higherWeightsCompletion.getFST(), false, exactMatchFirst);
return true;


@ -141,14 +141,14 @@ public class WFSTCompletionLookup extends Lookup implements Accountable {
if (fst == null) {
return false;
}
fst.save(output);
fst.save(output, output);
return true;
}
@Override
public boolean load(DataInput input) throws IOException {
count = input.readVLong();
this.fst = new FST<>(input, PositiveIntOutputs.getSingleton());
this.fst = new FST<>(input, input, PositiveIntOutputs.getSingleton());
return true;
}


@ -296,11 +296,11 @@ public class FSTTester<T> {
if (random.nextBoolean() && fst != null) {
IOContext context = LuceneTestCase.newIOContext(random);
IndexOutput out = dir.createOutput("fst.bin", context);
fst.save(out);
fst.save(out, out);
out.close();
IndexInput in = dir.openInput("fst.bin", context);
try {
fst = new FST<T>(in, outputs);
fst = new FST<T>(in, in, outputs);
} finally {
in.close();
dir.deleteFile("fst.bin");