Clean up variable-gaps terms format. (#13216)

The variable-gaps terms format uses the legacy storage layout of storing
metadata at the end of the index file, and storing the start pointer of the
metadata as the last 8 bytes of the index file (just before the footer). This
forces an awkward access pattern at open time: we first need to seek
towards the end to check that a footer is present, then seek some more bytes
backwards to read the metadata, and finally read the content of the index that
sits before the metadata.

To fix this, metadata and index data are now split into different files. This
way, both files have a clean sequential and read-once access pattern, and can
take advantage of the `ChecksumIndexInput` abstraction for checksum validation.

This further helps clean up `IOContext` by removing the ability to set
`readOnce` to `true` on an existing `IOContext`.
This commit is contained in:
Adrien Grand 2024-03-27 18:16:30 +01:00 committed by GitHub
parent d54663ad76
commit 96c0c3082a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 90 additions and 132 deletions

View File

@ -20,19 +20,16 @@ import static org.apache.lucene.util.fst.FST.readMetadata;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
@ -46,53 +43,61 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase {
private final PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton();
final HashMap<String, FieldIndexData> fields = new HashMap<>();
final HashMap<String, FST<Long>> fields = new HashMap<>();
public VariableGapTermsIndexReader(SegmentReadState state) throws IOException {
String fileName =
String metaFileName =
IndexFileNames.segmentFileName(
state.segmentInfo.name,
state.segmentSuffix,
VariableGapTermsIndexWriter.TERMS_META_EXTENSION);
String indexFileName =
IndexFileNames.segmentFileName(
state.segmentInfo.name,
state.segmentSuffix,
VariableGapTermsIndexWriter.TERMS_INDEX_EXTENSION);
final IndexInput in = state.directory.openInput(fileName, IOContext.READONCE);
boolean success = false;
try {
try (ChecksumIndexInput metaIn = state.directory.openChecksumInput(metaFileName);
ChecksumIndexInput indexIn = state.directory.openChecksumInput(indexFileName)) {
CodecUtil.checkIndexHeader(
in,
VariableGapTermsIndexWriter.CODEC_NAME,
VariableGapTermsIndexWriter.VERSION_START,
VariableGapTermsIndexWriter.VERSION_CURRENT,
state.segmentInfo.getId(),
state.segmentSuffix);
Throwable priorE = null;
try {
CodecUtil.checkIndexHeader(
metaIn,
VariableGapTermsIndexWriter.META_CODEC_NAME,
VariableGapTermsIndexWriter.VERSION_START,
VariableGapTermsIndexWriter.VERSION_CURRENT,
state.segmentInfo.getId(),
state.segmentSuffix);
CodecUtil.checksumEntireFile(in);
CodecUtil.checkIndexHeader(
indexIn,
VariableGapTermsIndexWriter.CODEC_NAME,
VariableGapTermsIndexWriter.VERSION_START,
VariableGapTermsIndexWriter.VERSION_CURRENT,
state.segmentInfo.getId(),
state.segmentSuffix);
seekDir(in);
// Read directory
final int numFields = in.readVInt();
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, in);
}
for (int i = 0; i < numFields; i++) {
final int field = in.readVInt();
final long indexStart = in.readVLong();
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
FieldIndexData previous =
fields.put(fieldInfo.name, new FieldIndexData(in, fieldInfo, indexStart));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, in);
// Read directory
for (int field = metaIn.readInt(); field != -1; field = metaIn.readInt()) {
final long indexStart = metaIn.readVLong();
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
if (indexIn.getFilePointer() != indexStart) {
throw new CorruptIndexException(
"Gap in FST, expected position " + indexIn.getFilePointer() + ", got " + indexStart,
metaIn);
}
FST<Long> fst = new FST<>(readMetadata(metaIn, fstOutputs), indexIn);
FST<Long> previous = fields.put(fieldInfo.name, fst);
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, metaIn);
}
}
}
success = true;
} finally {
if (success) {
IOUtils.close(in);
} else {
IOUtils.closeWhileHandlingException(in);
} catch (Throwable t) {
priorE = t;
} finally {
CodecUtil.checkFooter(metaIn, priorE);
CodecUtil.checkFooter(indexIn, priorE);
}
}
}
@ -150,68 +155,26 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase {
return false;
}
private final class FieldIndexData implements Accountable {
private final FST<Long> fst;
public FieldIndexData(IndexInput in, FieldInfo fieldInfo, long indexStart) throws IOException {
IndexInput clone = in.clone();
clone.seek(indexStart);
fst = new FST<>(readMetadata(clone, fstOutputs), clone);
clone.close();
/*
final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
Util.toDot(fst, w, false, false);
System.out.println("FST INDEX: SAVED to " + dotFileName);
w.close();
*/
}
@Override
public long ramBytesUsed() {
return fst == null ? 0 : fst.ramBytesUsed();
}
@Override
public Collection<Accountable> getChildResources() {
if (fst == null) {
return Collections.emptyList();
} else {
return Collections.singletonList(Accountables.namedAccountable("index data", fst));
}
}
@Override
public String toString() {
return "VarGapTermIndex";
}
}
@Override
public FieldIndexEnum getFieldEnum(FieldInfo fieldInfo) {
final FieldIndexData fieldData = fields.get(fieldInfo.name);
if (fieldData.fst == null) {
final FST<Long> fieldData = fields.get(fieldInfo.name);
if (fieldData == null) {
return null;
} else {
return new IndexEnum(fieldData.fst);
return new IndexEnum(fieldData);
}
}
@Override
public void close() throws IOException {}
private void seekDir(IndexInput input) throws IOException {
input.seek(input.length() - CodecUtil.footerLength() - 8);
long dirOffset = input.readLong();
input.seek(dirOffset);
}
@Override
public long ramBytesUsed() {
long sizeInBytes = 0;
for (FieldIndexData entry : fields.values()) {
sizeInBytes += entry.ramBytesUsed();
for (FST<Long> entry : fields.values()) {
if (entry != null) {
sizeInBytes += entry.ramBytesUsed();
}
}
return sizeInBytes;
}

View File

@ -17,8 +17,6 @@
package org.apache.lucene.codecs.blockterms;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.FieldInfo;
@ -44,16 +42,19 @@ import org.apache.lucene.util.fst.Util;
* @lucene.experimental
*/
public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
protected IndexOutput metaOut;
protected IndexOutput out;
/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tiv";
static final String CODEC_NAME = "VariableGapTermsIndex";
static final int VERSION_START = 3;
static final int VERSION_CURRENT = VERSION_START;
/** Extension of terms meta file */
static final String TERMS_META_EXTENSION = "tmv";
private final List<FSTFieldWriter> fields = new ArrayList<>();
static final String META_CODEC_NAME = "VariableGapTermsMeta";
static final String CODEC_NAME = "VariableGapTermsIndex";
static final int VERSION_START = 4;
static final int VERSION_CURRENT = VERSION_START;
@SuppressWarnings("unused")
private final FieldInfos fieldInfos; // unread
@ -176,20 +177,32 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
public VariableGapTermsIndexWriter(SegmentWriteState state, IndexTermSelector policy)
throws IOException {
fieldInfos = state.fieldInfos;
this.policy = policy;
final String metaFileName =
IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, TERMS_META_EXTENSION);
final String indexFileName =
IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
out = state.directory.createOutput(indexFileName, state.context);
boolean success = false;
try {
fieldInfos = state.fieldInfos;
this.policy = policy;
metaOut = state.directory.createOutput(metaFileName, state.context);
out = state.directory.createOutput(indexFileName, state.context);
CodecUtil.writeIndexHeader(
metaOut,
META_CODEC_NAME,
VERSION_CURRENT,
state.segmentInfo.getId(),
state.segmentSuffix);
CodecUtil.writeIndexHeader(
out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(out);
IOUtils.closeWhileHandlingException(this);
}
}
}
@ -198,9 +211,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
public FieldWriter addField(FieldInfo field, long termsFilePointer) throws IOException {
//// System.out.println("VGW: field=" + field.name);
policy.newField(field);
FSTFieldWriter writer = new FSTFieldWriter(field, termsFilePointer);
fields.add(writer);
return writer;
return new FSTFieldWriter(field, termsFilePointer);
}
/**
@ -230,7 +241,6 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
final FieldInfo fieldInfo;
FST<Long> fst;
final long indexStart;
private final BytesRefBuilder lastTerm = new BytesRefBuilder();
private boolean first = true;
@ -239,7 +249,6 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
this.fieldInfo = fieldInfo;
fstOutputs = PositiveIntOutputs.getSingleton();
fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, fstOutputs).build();
indexStart = out.getFilePointer();
//// System.out.println("VGW: field=" + fieldInfo.name);
// Always put empty string in
@ -285,44 +294,30 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
public void finish(long termsFilePointer) throws IOException {
fst = FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader());
if (fst != null) {
fst.save(out, out);
metaOut.writeInt(fieldInfo.number);
metaOut.writeVLong(out.getFilePointer());
fst.save(metaOut, out);
}
}
}
@Override
public void close() throws IOException {
if (out != null) {
try {
final long dirStart = out.getFilePointer();
final int fieldCount = fields.size();
int nonNullFieldCount = 0;
for (int i = 0; i < fieldCount; i++) {
FSTFieldWriter field = fields.get(i);
if (field.fst != null) {
nonNullFieldCount++;
}
}
out.writeVInt(nonNullFieldCount);
for (int i = 0; i < fieldCount; i++) {
FSTFieldWriter field = fields.get(i);
if (field.fst != null) {
out.writeVInt(field.fieldInfo.number);
out.writeVLong(field.indexStart);
}
}
writeTrailer(dirStart);
try {
if (metaOut != null) {
metaOut.writeInt(-1);
CodecUtil.writeFooter(metaOut);
}
if (out != null) {
CodecUtil.writeFooter(out);
}
} finally {
try {
IOUtils.close(out, metaOut);
} finally {
out.close();
out = null;
metaOut = null;
}
}
}
private void writeTrailer(long dirStart) throws IOException {
out.writeLong(dirStart);
}
}