mirror of https://github.com/apache/lucene.git
Clean up variable-gaps terms format. (#13216)
The variable-gaps terms format uses the legacy storage layout of storing metadata at the end of the index file, and storing the start pointer of the metadata as the last 8 bytes of the index file (just before the footer). This forces an awkward access pattern at open time: we first need to seek towards the end to check that a footer is present, then seek a few more bytes backwards to read the metadata start pointer and the metadata itself, and finally read the content of the index that sits before the metadata. To fix this, metadata and index data are now split into different files. This way, both files have a clean sequential and read-once access pattern, and can take advantage of the `ChecksumIndexInput` abstraction for checksum validation. This further helps clean up `IOContext` by removing the ability to set `readOnce` to `true` on an existing `IOContext`.
This commit is contained in:
parent
d54663ad76
commit
96c0c3082a
|
@ -20,19 +20,16 @@ import static org.apache.lucene.util.fst.FST.readMetadata;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.index.CorruptIndexException;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.ChecksumIndexInput;
|
||||
import org.apache.lucene.util.Accountable;
|
||||
import org.apache.lucene.util.Accountables;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.fst.BytesRefFSTEnum;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||
|
@ -46,53 +43,61 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase {
|
|||
|
||||
private final PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton();
|
||||
|
||||
final HashMap<String, FieldIndexData> fields = new HashMap<>();
|
||||
final HashMap<String, FST<Long>> fields = new HashMap<>();
|
||||
|
||||
public VariableGapTermsIndexReader(SegmentReadState state) throws IOException {
|
||||
String fileName =
|
||||
String metaFileName =
|
||||
IndexFileNames.segmentFileName(
|
||||
state.segmentInfo.name,
|
||||
state.segmentSuffix,
|
||||
VariableGapTermsIndexWriter.TERMS_META_EXTENSION);
|
||||
String indexFileName =
|
||||
IndexFileNames.segmentFileName(
|
||||
state.segmentInfo.name,
|
||||
state.segmentSuffix,
|
||||
VariableGapTermsIndexWriter.TERMS_INDEX_EXTENSION);
|
||||
final IndexInput in = state.directory.openInput(fileName, IOContext.READONCE);
|
||||
boolean success = false;
|
||||
|
||||
try {
|
||||
try (ChecksumIndexInput metaIn = state.directory.openChecksumInput(metaFileName);
|
||||
ChecksumIndexInput indexIn = state.directory.openChecksumInput(indexFileName)) {
|
||||
|
||||
CodecUtil.checkIndexHeader(
|
||||
in,
|
||||
VariableGapTermsIndexWriter.CODEC_NAME,
|
||||
VariableGapTermsIndexWriter.VERSION_START,
|
||||
VariableGapTermsIndexWriter.VERSION_CURRENT,
|
||||
state.segmentInfo.getId(),
|
||||
state.segmentSuffix);
|
||||
Throwable priorE = null;
|
||||
try {
|
||||
CodecUtil.checkIndexHeader(
|
||||
metaIn,
|
||||
VariableGapTermsIndexWriter.META_CODEC_NAME,
|
||||
VariableGapTermsIndexWriter.VERSION_START,
|
||||
VariableGapTermsIndexWriter.VERSION_CURRENT,
|
||||
state.segmentInfo.getId(),
|
||||
state.segmentSuffix);
|
||||
|
||||
CodecUtil.checksumEntireFile(in);
|
||||
CodecUtil.checkIndexHeader(
|
||||
indexIn,
|
||||
VariableGapTermsIndexWriter.CODEC_NAME,
|
||||
VariableGapTermsIndexWriter.VERSION_START,
|
||||
VariableGapTermsIndexWriter.VERSION_CURRENT,
|
||||
state.segmentInfo.getId(),
|
||||
state.segmentSuffix);
|
||||
|
||||
seekDir(in);
|
||||
|
||||
// Read directory
|
||||
final int numFields = in.readVInt();
|
||||
if (numFields < 0) {
|
||||
throw new CorruptIndexException("invalid numFields: " + numFields, in);
|
||||
}
|
||||
|
||||
for (int i = 0; i < numFields; i++) {
|
||||
final int field = in.readVInt();
|
||||
final long indexStart = in.readVLong();
|
||||
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
|
||||
FieldIndexData previous =
|
||||
fields.put(fieldInfo.name, new FieldIndexData(in, fieldInfo, indexStart));
|
||||
if (previous != null) {
|
||||
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, in);
|
||||
// Read directory
|
||||
for (int field = metaIn.readInt(); field != -1; field = metaIn.readInt()) {
|
||||
final long indexStart = metaIn.readVLong();
|
||||
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
|
||||
if (indexIn.getFilePointer() != indexStart) {
|
||||
throw new CorruptIndexException(
|
||||
"Gap in FST, expected position " + indexIn.getFilePointer() + ", got " + indexStart,
|
||||
metaIn);
|
||||
}
|
||||
FST<Long> fst = new FST<>(readMetadata(metaIn, fstOutputs), indexIn);
|
||||
FST<Long> previous = fields.put(fieldInfo.name, fst);
|
||||
if (previous != null) {
|
||||
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, metaIn);
|
||||
}
|
||||
}
|
||||
}
|
||||
success = true;
|
||||
} finally {
|
||||
if (success) {
|
||||
IOUtils.close(in);
|
||||
} else {
|
||||
IOUtils.closeWhileHandlingException(in);
|
||||
} catch (Throwable t) {
|
||||
priorE = t;
|
||||
} finally {
|
||||
CodecUtil.checkFooter(metaIn, priorE);
|
||||
CodecUtil.checkFooter(indexIn, priorE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -150,68 +155,26 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase {
|
|||
return false;
|
||||
}
|
||||
|
||||
private final class FieldIndexData implements Accountable {
|
||||
private final FST<Long> fst;
|
||||
|
||||
public FieldIndexData(IndexInput in, FieldInfo fieldInfo, long indexStart) throws IOException {
|
||||
IndexInput clone = in.clone();
|
||||
clone.seek(indexStart);
|
||||
fst = new FST<>(readMetadata(clone, fstOutputs), clone);
|
||||
clone.close();
|
||||
|
||||
/*
|
||||
final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
|
||||
Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
|
||||
Util.toDot(fst, w, false, false);
|
||||
System.out.println("FST INDEX: SAVED to " + dotFileName);
|
||||
w.close();
|
||||
*/
|
||||
}
|
||||
|
||||
@Override
|
||||
public long ramBytesUsed() {
|
||||
return fst == null ? 0 : fst.ramBytesUsed();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<Accountable> getChildResources() {
|
||||
if (fst == null) {
|
||||
return Collections.emptyList();
|
||||
} else {
|
||||
return Collections.singletonList(Accountables.namedAccountable("index data", fst));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "VarGapTermIndex";
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public FieldIndexEnum getFieldEnum(FieldInfo fieldInfo) {
|
||||
final FieldIndexData fieldData = fields.get(fieldInfo.name);
|
||||
if (fieldData.fst == null) {
|
||||
final FST<Long> fieldData = fields.get(fieldInfo.name);
|
||||
if (fieldData == null) {
|
||||
return null;
|
||||
} else {
|
||||
return new IndexEnum(fieldData.fst);
|
||||
return new IndexEnum(fieldData);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {}
|
||||
|
||||
private void seekDir(IndexInput input) throws IOException {
|
||||
input.seek(input.length() - CodecUtil.footerLength() - 8);
|
||||
long dirOffset = input.readLong();
|
||||
input.seek(dirOffset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long ramBytesUsed() {
|
||||
long sizeInBytes = 0;
|
||||
for (FieldIndexData entry : fields.values()) {
|
||||
sizeInBytes += entry.ramBytesUsed();
|
||||
for (FST<Long> entry : fields.values()) {
|
||||
if (entry != null) {
|
||||
sizeInBytes += entry.ramBytesUsed();
|
||||
}
|
||||
}
|
||||
return sizeInBytes;
|
||||
}
|
||||
|
|
|
@ -17,8 +17,6 @@
|
|||
package org.apache.lucene.codecs.blockterms;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.TermStats;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
|
@ -44,16 +42,19 @@ import org.apache.lucene.util.fst.Util;
|
|||
* @lucene.experimental
|
||||
*/
|
||||
public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
||||
protected IndexOutput metaOut;
|
||||
protected IndexOutput out;
|
||||
|
||||
/** Extension of terms index file */
|
||||
static final String TERMS_INDEX_EXTENSION = "tiv";
|
||||
|
||||
static final String CODEC_NAME = "VariableGapTermsIndex";
|
||||
static final int VERSION_START = 3;
|
||||
static final int VERSION_CURRENT = VERSION_START;
|
||||
/** Extension of terms meta file */
|
||||
static final String TERMS_META_EXTENSION = "tmv";
|
||||
|
||||
private final List<FSTFieldWriter> fields = new ArrayList<>();
|
||||
static final String META_CODEC_NAME = "VariableGapTermsMeta";
|
||||
static final String CODEC_NAME = "VariableGapTermsIndex";
|
||||
static final int VERSION_START = 4;
|
||||
static final int VERSION_CURRENT = VERSION_START;
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
private final FieldInfos fieldInfos; // unread
|
||||
|
@ -176,20 +177,32 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
|||
|
||||
public VariableGapTermsIndexWriter(SegmentWriteState state, IndexTermSelector policy)
|
||||
throws IOException {
|
||||
fieldInfos = state.fieldInfos;
|
||||
this.policy = policy;
|
||||
|
||||
final String metaFileName =
|
||||
IndexFileNames.segmentFileName(
|
||||
state.segmentInfo.name, state.segmentSuffix, TERMS_META_EXTENSION);
|
||||
final String indexFileName =
|
||||
IndexFileNames.segmentFileName(
|
||||
state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
|
||||
out = state.directory.createOutput(indexFileName, state.context);
|
||||
|
||||
boolean success = false;
|
||||
try {
|
||||
fieldInfos = state.fieldInfos;
|
||||
this.policy = policy;
|
||||
metaOut = state.directory.createOutput(metaFileName, state.context);
|
||||
out = state.directory.createOutput(indexFileName, state.context);
|
||||
CodecUtil.writeIndexHeader(
|
||||
metaOut,
|
||||
META_CODEC_NAME,
|
||||
VERSION_CURRENT,
|
||||
state.segmentInfo.getId(),
|
||||
state.segmentSuffix);
|
||||
CodecUtil.writeIndexHeader(
|
||||
out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(out);
|
||||
IOUtils.closeWhileHandlingException(this);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -198,9 +211,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
|||
public FieldWriter addField(FieldInfo field, long termsFilePointer) throws IOException {
|
||||
//// System.out.println("VGW: field=" + field.name);
|
||||
policy.newField(field);
|
||||
FSTFieldWriter writer = new FSTFieldWriter(field, termsFilePointer);
|
||||
fields.add(writer);
|
||||
return writer;
|
||||
return new FSTFieldWriter(field, termsFilePointer);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -230,7 +241,6 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
|||
|
||||
final FieldInfo fieldInfo;
|
||||
FST<Long> fst;
|
||||
final long indexStart;
|
||||
|
||||
private final BytesRefBuilder lastTerm = new BytesRefBuilder();
|
||||
private boolean first = true;
|
||||
|
@ -239,7 +249,6 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
|||
this.fieldInfo = fieldInfo;
|
||||
fstOutputs = PositiveIntOutputs.getSingleton();
|
||||
fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, fstOutputs).build();
|
||||
indexStart = out.getFilePointer();
|
||||
//// System.out.println("VGW: field=" + fieldInfo.name);
|
||||
|
||||
// Always put empty string in
|
||||
|
@ -285,44 +294,30 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
|
|||
public void finish(long termsFilePointer) throws IOException {
|
||||
fst = FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader());
|
||||
if (fst != null) {
|
||||
fst.save(out, out);
|
||||
metaOut.writeInt(fieldInfo.number);
|
||||
metaOut.writeVLong(out.getFilePointer());
|
||||
fst.save(metaOut, out);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
if (out != null) {
|
||||
try {
|
||||
final long dirStart = out.getFilePointer();
|
||||
final int fieldCount = fields.size();
|
||||
|
||||
int nonNullFieldCount = 0;
|
||||
for (int i = 0; i < fieldCount; i++) {
|
||||
FSTFieldWriter field = fields.get(i);
|
||||
if (field.fst != null) {
|
||||
nonNullFieldCount++;
|
||||
}
|
||||
}
|
||||
|
||||
out.writeVInt(nonNullFieldCount);
|
||||
for (int i = 0; i < fieldCount; i++) {
|
||||
FSTFieldWriter field = fields.get(i);
|
||||
if (field.fst != null) {
|
||||
out.writeVInt(field.fieldInfo.number);
|
||||
out.writeVLong(field.indexStart);
|
||||
}
|
||||
}
|
||||
writeTrailer(dirStart);
|
||||
try {
|
||||
if (metaOut != null) {
|
||||
metaOut.writeInt(-1);
|
||||
CodecUtil.writeFooter(metaOut);
|
||||
}
|
||||
if (out != null) {
|
||||
CodecUtil.writeFooter(out);
|
||||
}
|
||||
} finally {
|
||||
try {
|
||||
IOUtils.close(out, metaOut);
|
||||
} finally {
|
||||
out.close();
|
||||
out = null;
|
||||
metaOut = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void writeTrailer(long dirStart) throws IOException {
|
||||
out.writeLong(dirStart);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue