Use `ReadAdvice#RANDOM` when appropriate. (#13222)

This switches the following files to `ReadAdvice.RANDOM`:
 - Stored fields data file.
 - Term vectors data file.
 - HNSW graph.
 - Temporary file storing vectors at merge time that we use to construct the
   merged HNSW graph.
 - Vector data files, including quantized data files.

I hesitated using `ReadAdvice.RANDOM` on terms, since they have a random access
pattern when running term queries, but a more sequential access pattern when
running multi-term queries. I erred on the conservative side and did not switch
them to `ReadAdvice.RANDOM` for now.

For simplicity, I'm only touching the current codec, not previous codecs. There
are also some known issues:
 - These files will keep using a `RANDOM` `ReadAdvice` at merge time. We need
   some way for merge instances to get an updated `IOContext`? We have the same
   problem with `IOContext#LOAD` today.
 - With quantized vectors, raw vectors don't have a random access pattern, but
   it was challenging to give raw vectors a sequential access pattern when
   quantized vectors are present and a random access pattern otherwise. So they
   assume a random access pattern all the time.
This commit is contained in:
Adrien Grand 2024-04-04 18:31:13 +02:00 committed by GitHub
parent 6104c86abc
commit 4ea2bae119
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 45 additions and 13 deletions

View File

@@ -58,6 +58,7 @@ import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.ReadAdvice;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
@@ -128,7 +129,7 @@ public final class Lucene90CompressingStoredFieldsReader extends StoredFieldsRea
ChecksumIndexInput metaIn = null;
try {
// Open the data file
fieldsStream = d.openInput(fieldsStreamFN, context);
fieldsStream = d.openInput(fieldsStreamFN, context.withReadAdvice(ReadAdvice.RANDOM));
version =
CodecUtil.checkIndexHeader(
fieldsStream, formatName, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);

View File

@@ -59,6 +59,7 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.RandomAccessInput;
import org.apache.lucene.store.ReadAdvice;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
@@ -134,7 +135,7 @@ public final class Lucene90CompressingTermVectorsReader extends TermVectorsReade
// Open the data file
final String vectorsStreamFN =
IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_EXTENSION);
vectorsStream = d.openInput(vectorsStreamFN, context);
vectorsStream = d.openInput(vectorsStreamFN, context.withReadAdvice(ReadAdvice.RANDOM));
version =
CodecUtil.checkIndexHeader(
vectorsStream, formatName, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);

View File

@@ -38,7 +38,9 @@ import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorEncoding;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.ReadAdvice;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
@@ -66,7 +68,10 @@ public final class Lucene99FlatVectorsReader extends FlatVectorsReader {
state,
versionMeta,
Lucene99FlatVectorsFormat.VECTOR_DATA_EXTENSION,
Lucene99FlatVectorsFormat.VECTOR_DATA_CODEC_NAME);
Lucene99FlatVectorsFormat.VECTOR_DATA_CODEC_NAME,
// Flat formats are used to randomly access vectors from their node ID that is stored
// in the HNSW graph.
state.context.withReadAdvice(ReadAdvice.RANDOM));
success = true;
} finally {
if (success == false) {
@@ -102,11 +107,15 @@ public final class Lucene99FlatVectorsReader extends FlatVectorsReader {
}
private static IndexInput openDataInput(
SegmentReadState state, int versionMeta, String fileExtension, String codecName)
SegmentReadState state,
int versionMeta,
String fileExtension,
String codecName,
IOContext context)
throws IOException {
String fileName =
IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, fileExtension);
IndexInput in = state.directory.openInput(fileName, state.context);
IndexInput in = state.directory.openInput(fileName, context);
boolean success = false;
try {
int versionVectorData =

View File

@@ -44,8 +44,10 @@ import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Sorter;
import org.apache.lucene.index.VectorEncoding;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.ReadAdvice;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
@@ -283,10 +285,13 @@ public final class Lucene99FlatVectorsWriter extends FlatVectorsWriter {
CodecUtil.writeFooter(tempVectorData);
IOUtils.close(tempVectorData);
// copy the temporary file vectors to the actual data file
// This temp file will be accessed in a random-access fashion to construct the HNSW graph.
// Note: don't use the context from the state, which is a flush/merge context, not expecting
// to perform random reads.
vectorDataInput =
segmentWriteState.directory.openInput(
tempVectorData.getName(), segmentWriteState.context);
tempVectorData.getName(), IOContext.DEFAULT.withReadAdvice(ReadAdvice.RANDOM));
// copy the temporary file vectors to the actual data file
vectorData.copyBytes(vectorDataInput, vectorDataInput.length() - CodecUtil.footerLength());
CodecUtil.retrieveChecksum(vectorDataInput);
long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;

View File

@@ -40,8 +40,10 @@ import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.search.KnnCollector;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.RandomAccessInput;
import org.apache.lucene.store.ReadAdvice;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.IOUtils;
@@ -102,7 +104,8 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
state,
versionMeta,
Lucene99HnswVectorsFormat.VECTOR_INDEX_EXTENSION,
Lucene99HnswVectorsFormat.VECTOR_INDEX_CODEC_NAME);
Lucene99HnswVectorsFormat.VECTOR_INDEX_CODEC_NAME,
state.context.withReadAdvice(ReadAdvice.RANDOM));
success = true;
} finally {
if (success == false) {
@@ -112,11 +115,15 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
}
private static IndexInput openDataInput(
SegmentReadState state, int versionMeta, String fileExtension, String codecName)
SegmentReadState state,
int versionMeta,
String fileExtension,
String codecName,
IOContext context)
throws IOException {
String fileName =
IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, fileExtension);
IndexInput in = state.directory.openInput(fileName, state.context);
IndexInput in = state.directory.openInput(fileName, context);
boolean success = false;
try {
int versionVectorData =

View File

@@ -36,7 +36,9 @@ import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorEncoding;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.ReadAdvice;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
@@ -93,7 +95,10 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
state,
versionMeta,
Lucene99ScalarQuantizedVectorsFormat.VECTOR_DATA_EXTENSION,
Lucene99ScalarQuantizedVectorsFormat.VECTOR_DATA_CODEC_NAME);
Lucene99ScalarQuantizedVectorsFormat.VECTOR_DATA_CODEC_NAME,
// Quantized vectors are accessed randomly from their node ID stored in the HNSW
// graph.
state.context.withReadAdvice(ReadAdvice.RANDOM));
success = true;
} finally {
if (success == false) {
@@ -166,11 +171,15 @@ public final class Lucene99ScalarQuantizedVectorsReade
}
private static IndexInput openDataInput(
SegmentReadState state, int versionMeta, String fileExtension, String codecName)
SegmentReadState state,
int versionMeta,
String fileExtension,
String codecName,
IOContext context)
throws IOException {
String fileName =
IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, fileExtension);
IndexInput in = state.directory.openInput(fileName, state.context);
IndexInput in = state.directory.openInput(fileName, context);
boolean success = false;
try {
int versionVectorData =