mirror of https://github.com/apache/lucene.git
Use `ReadAdvice#RANDOM` when appropriate. (#13222)
This switches the following files to `ReadAdvice.RANDOM`: - Stored fields data file. - Term vectors data file. - HNSW graph. - Temporary file storing vectors at merge time that we use to construct the merged HNSW graph. - Vector data files, including quantized data files. I hesitated to use `ReadAdvice.RANDOM` on terms, since they have a random access pattern when running term queries, but a more sequential access pattern when running multi-term queries. I erred on the conservative side and did not switch them to `ReadAdvice.RANDOM` for now. For simplicity, I'm only touching the current codec, not previous codecs. There are also some known issues: - These files will keep using a `RANDOM` `ReadAdvice` at merge time; we may need some way for merge instances to get an updated `IOContext`. We have the same problem with `IOContext#LOAD` today. - With quantized vectors, raw vectors don't have a random access pattern, but it was challenging to give raw vectors a sequential access pattern when there are quantized vectors and a random access pattern otherwise, so they assume a random access pattern all the time.
This commit is contained in:
parent
6104c86abc
commit
4ea2bae119
|
@ -58,6 +58,7 @@ import org.apache.lucene.store.DataInput;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.IOContext;
|
import org.apache.lucene.store.IOContext;
|
||||||
import org.apache.lucene.store.IndexInput;
|
import org.apache.lucene.store.IndexInput;
|
||||||
|
import org.apache.lucene.store.ReadAdvice;
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.BitUtil;
|
import org.apache.lucene.util.BitUtil;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
@ -128,7 +129,7 @@ public final class Lucene90CompressingStoredFieldsReader extends StoredFieldsRea
|
||||||
ChecksumIndexInput metaIn = null;
|
ChecksumIndexInput metaIn = null;
|
||||||
try {
|
try {
|
||||||
// Open the data file
|
// Open the data file
|
||||||
fieldsStream = d.openInput(fieldsStreamFN, context);
|
fieldsStream = d.openInput(fieldsStreamFN, context.withReadAdvice(ReadAdvice.RANDOM));
|
||||||
version =
|
version =
|
||||||
CodecUtil.checkIndexHeader(
|
CodecUtil.checkIndexHeader(
|
||||||
fieldsStream, formatName, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
|
fieldsStream, formatName, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
|
||||||
|
|
|
@ -59,6 +59,7 @@ import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.IOContext;
|
import org.apache.lucene.store.IOContext;
|
||||||
import org.apache.lucene.store.IndexInput;
|
import org.apache.lucene.store.IndexInput;
|
||||||
import org.apache.lucene.store.RandomAccessInput;
|
import org.apache.lucene.store.RandomAccessInput;
|
||||||
|
import org.apache.lucene.store.ReadAdvice;
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
@ -134,7 +135,7 @@ public final class Lucene90CompressingTermVectorsReader extends TermVectorsReade
|
||||||
// Open the data file
|
// Open the data file
|
||||||
final String vectorsStreamFN =
|
final String vectorsStreamFN =
|
||||||
IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_EXTENSION);
|
IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_EXTENSION);
|
||||||
vectorsStream = d.openInput(vectorsStreamFN, context);
|
vectorsStream = d.openInput(vectorsStreamFN, context.withReadAdvice(ReadAdvice.RANDOM));
|
||||||
version =
|
version =
|
||||||
CodecUtil.checkIndexHeader(
|
CodecUtil.checkIndexHeader(
|
||||||
vectorsStream, formatName, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
|
vectorsStream, formatName, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
|
||||||
|
|
|
@ -38,7 +38,9 @@ import org.apache.lucene.index.SegmentReadState;
|
||||||
import org.apache.lucene.index.VectorEncoding;
|
import org.apache.lucene.index.VectorEncoding;
|
||||||
import org.apache.lucene.index.VectorSimilarityFunction;
|
import org.apache.lucene.index.VectorSimilarityFunction;
|
||||||
import org.apache.lucene.store.ChecksumIndexInput;
|
import org.apache.lucene.store.ChecksumIndexInput;
|
||||||
|
import org.apache.lucene.store.IOContext;
|
||||||
import org.apache.lucene.store.IndexInput;
|
import org.apache.lucene.store.IndexInput;
|
||||||
|
import org.apache.lucene.store.ReadAdvice;
|
||||||
import org.apache.lucene.util.Accountable;
|
import org.apache.lucene.util.Accountable;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.RamUsageEstimator;
|
import org.apache.lucene.util.RamUsageEstimator;
|
||||||
|
@ -66,7 +68,10 @@ public final class Lucene99FlatVectorsReader extends FlatVectorsReader {
|
||||||
state,
|
state,
|
||||||
versionMeta,
|
versionMeta,
|
||||||
Lucene99FlatVectorsFormat.VECTOR_DATA_EXTENSION,
|
Lucene99FlatVectorsFormat.VECTOR_DATA_EXTENSION,
|
||||||
Lucene99FlatVectorsFormat.VECTOR_DATA_CODEC_NAME);
|
Lucene99FlatVectorsFormat.VECTOR_DATA_CODEC_NAME,
|
||||||
|
// Flat formats are used to randomly access vectors from their node ID that is stored
|
||||||
|
// in the HNSW graph.
|
||||||
|
state.context.withReadAdvice(ReadAdvice.RANDOM));
|
||||||
success = true;
|
success = true;
|
||||||
} finally {
|
} finally {
|
||||||
if (success == false) {
|
if (success == false) {
|
||||||
|
@ -102,11 +107,15 @@ public final class Lucene99FlatVectorsReader extends FlatVectorsReader {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static IndexInput openDataInput(
|
private static IndexInput openDataInput(
|
||||||
SegmentReadState state, int versionMeta, String fileExtension, String codecName)
|
SegmentReadState state,
|
||||||
|
int versionMeta,
|
||||||
|
String fileExtension,
|
||||||
|
String codecName,
|
||||||
|
IOContext context)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
String fileName =
|
String fileName =
|
||||||
IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, fileExtension);
|
IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, fileExtension);
|
||||||
IndexInput in = state.directory.openInput(fileName, state.context);
|
IndexInput in = state.directory.openInput(fileName, context);
|
||||||
boolean success = false;
|
boolean success = false;
|
||||||
try {
|
try {
|
||||||
int versionVectorData =
|
int versionVectorData =
|
||||||
|
|
|
@ -44,8 +44,10 @@ import org.apache.lucene.index.SegmentWriteState;
|
||||||
import org.apache.lucene.index.Sorter;
|
import org.apache.lucene.index.Sorter;
|
||||||
import org.apache.lucene.index.VectorEncoding;
|
import org.apache.lucene.index.VectorEncoding;
|
||||||
import org.apache.lucene.search.DocIdSetIterator;
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
|
import org.apache.lucene.store.IOContext;
|
||||||
import org.apache.lucene.store.IndexInput;
|
import org.apache.lucene.store.IndexInput;
|
||||||
import org.apache.lucene.store.IndexOutput;
|
import org.apache.lucene.store.IndexOutput;
|
||||||
|
import org.apache.lucene.store.ReadAdvice;
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.RamUsageEstimator;
|
import org.apache.lucene.util.RamUsageEstimator;
|
||||||
|
@ -283,10 +285,13 @@ public final class Lucene99FlatVectorsWriter extends FlatVectorsWriter {
|
||||||
CodecUtil.writeFooter(tempVectorData);
|
CodecUtil.writeFooter(tempVectorData);
|
||||||
IOUtils.close(tempVectorData);
|
IOUtils.close(tempVectorData);
|
||||||
|
|
||||||
// copy the temporary file vectors to the actual data file
|
// This temp file will be accessed in a random-access fashion to construct the HNSW graph.
|
||||||
|
// Note: don't use the context from the state, which is a flush/merge context, not expecting
|
||||||
|
// to perform random reads.
|
||||||
vectorDataInput =
|
vectorDataInput =
|
||||||
segmentWriteState.directory.openInput(
|
segmentWriteState.directory.openInput(
|
||||||
tempVectorData.getName(), segmentWriteState.context);
|
tempVectorData.getName(), IOContext.DEFAULT.withReadAdvice(ReadAdvice.RANDOM));
|
||||||
|
// copy the temporary file vectors to the actual data file
|
||||||
vectorData.copyBytes(vectorDataInput, vectorDataInput.length() - CodecUtil.footerLength());
|
vectorData.copyBytes(vectorDataInput, vectorDataInput.length() - CodecUtil.footerLength());
|
||||||
CodecUtil.retrieveChecksum(vectorDataInput);
|
CodecUtil.retrieveChecksum(vectorDataInput);
|
||||||
long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
|
long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
|
||||||
|
|
|
@ -40,8 +40,10 @@ import org.apache.lucene.index.VectorSimilarityFunction;
|
||||||
import org.apache.lucene.search.KnnCollector;
|
import org.apache.lucene.search.KnnCollector;
|
||||||
import org.apache.lucene.store.ChecksumIndexInput;
|
import org.apache.lucene.store.ChecksumIndexInput;
|
||||||
import org.apache.lucene.store.DataInput;
|
import org.apache.lucene.store.DataInput;
|
||||||
|
import org.apache.lucene.store.IOContext;
|
||||||
import org.apache.lucene.store.IndexInput;
|
import org.apache.lucene.store.IndexInput;
|
||||||
import org.apache.lucene.store.RandomAccessInput;
|
import org.apache.lucene.store.RandomAccessInput;
|
||||||
|
import org.apache.lucene.store.ReadAdvice;
|
||||||
import org.apache.lucene.util.Accountable;
|
import org.apache.lucene.util.Accountable;
|
||||||
import org.apache.lucene.util.Bits;
|
import org.apache.lucene.util.Bits;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
@ -102,7 +104,8 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
|
||||||
state,
|
state,
|
||||||
versionMeta,
|
versionMeta,
|
||||||
Lucene99HnswVectorsFormat.VECTOR_INDEX_EXTENSION,
|
Lucene99HnswVectorsFormat.VECTOR_INDEX_EXTENSION,
|
||||||
Lucene99HnswVectorsFormat.VECTOR_INDEX_CODEC_NAME);
|
Lucene99HnswVectorsFormat.VECTOR_INDEX_CODEC_NAME,
|
||||||
|
state.context.withReadAdvice(ReadAdvice.RANDOM));
|
||||||
success = true;
|
success = true;
|
||||||
} finally {
|
} finally {
|
||||||
if (success == false) {
|
if (success == false) {
|
||||||
|
@ -112,11 +115,15 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
|
||||||
}
|
}
|
||||||
|
|
||||||
private static IndexInput openDataInput(
|
private static IndexInput openDataInput(
|
||||||
SegmentReadState state, int versionMeta, String fileExtension, String codecName)
|
SegmentReadState state,
|
||||||
|
int versionMeta,
|
||||||
|
String fileExtension,
|
||||||
|
String codecName,
|
||||||
|
IOContext context)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
String fileName =
|
String fileName =
|
||||||
IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, fileExtension);
|
IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, fileExtension);
|
||||||
IndexInput in = state.directory.openInput(fileName, state.context);
|
IndexInput in = state.directory.openInput(fileName, context);
|
||||||
boolean success = false;
|
boolean success = false;
|
||||||
try {
|
try {
|
||||||
int versionVectorData =
|
int versionVectorData =
|
||||||
|
|
|
@ -36,7 +36,9 @@ import org.apache.lucene.index.SegmentReadState;
|
||||||
import org.apache.lucene.index.VectorEncoding;
|
import org.apache.lucene.index.VectorEncoding;
|
||||||
import org.apache.lucene.index.VectorSimilarityFunction;
|
import org.apache.lucene.index.VectorSimilarityFunction;
|
||||||
import org.apache.lucene.store.ChecksumIndexInput;
|
import org.apache.lucene.store.ChecksumIndexInput;
|
||||||
|
import org.apache.lucene.store.IOContext;
|
||||||
import org.apache.lucene.store.IndexInput;
|
import org.apache.lucene.store.IndexInput;
|
||||||
|
import org.apache.lucene.store.ReadAdvice;
|
||||||
import org.apache.lucene.util.Accountable;
|
import org.apache.lucene.util.Accountable;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.RamUsageEstimator;
|
import org.apache.lucene.util.RamUsageEstimator;
|
||||||
|
@ -93,7 +95,10 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
|
||||||
state,
|
state,
|
||||||
versionMeta,
|
versionMeta,
|
||||||
Lucene99ScalarQuantizedVectorsFormat.VECTOR_DATA_EXTENSION,
|
Lucene99ScalarQuantizedVectorsFormat.VECTOR_DATA_EXTENSION,
|
||||||
Lucene99ScalarQuantizedVectorsFormat.VECTOR_DATA_CODEC_NAME);
|
Lucene99ScalarQuantizedVectorsFormat.VECTOR_DATA_CODEC_NAME,
|
||||||
|
// Quantized vectors are accessed randomly from their node ID stored in the HNSW
|
||||||
|
// graph.
|
||||||
|
state.context.withReadAdvice(ReadAdvice.RANDOM));
|
||||||
success = true;
|
success = true;
|
||||||
} finally {
|
} finally {
|
||||||
if (success == false) {
|
if (success == false) {
|
||||||
|
@ -166,11 +171,15 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
|
||||||
}
|
}
|
||||||
|
|
||||||
private static IndexInput openDataInput(
|
private static IndexInput openDataInput(
|
||||||
SegmentReadState state, int versionMeta, String fileExtension, String codecName)
|
SegmentReadState state,
|
||||||
|
int versionMeta,
|
||||||
|
String fileExtension,
|
||||||
|
String codecName,
|
||||||
|
IOContext context)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
String fileName =
|
String fileName =
|
||||||
IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, fileExtension);
|
IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, fileExtension);
|
||||||
IndexInput in = state.directory.openInput(fileName, state.context);
|
IndexInput in = state.directory.openInput(fileName, context);
|
||||||
boolean success = false;
|
boolean success = false;
|
||||||
try {
|
try {
|
||||||
int versionVectorData =
|
int versionVectorData =
|
||||||
|
|
Loading…
Reference in New Issue