mirror of https://github.com/apache/lucene.git
Use `ReadAdvice#RANDOM` when appropriate. (#13222)
This switches the following files to `ReadAdvice.RANDOM`:
- Stored fields data file.
- Term vectors data file.
- HNSW graph.
- Temporary file storing vectors at merge time that we use to construct the merged HNSW graph.
- Vector data files, including quantized data files.

I hesitated to use `ReadAdvice.RANDOM` on terms, since they have a random access pattern when running term queries, but a more sequential access pattern when running multi-term queries. I erred on the conservative side and did not switch them to `ReadAdvice.RANDOM` for now. For simplicity, I'm only touching the current codec, not previous codecs.

There are also some known issues:
- These files will keep using a `RANDOM` `ReadAdvice` at merge time. We may need some way for merge instances to get an updated `IOContext`. We have the same problem with `IOContext#LOAD` today.
- With quantized vectors, raw vectors don't have a random access pattern, but it was challenging to give raw vectors a sequential access pattern when there are quantized vectors and a random access pattern otherwise. So they assume a random access pattern all the time.
This commit is contained in:
parent
6104c86abc
commit
4ea2bae119
|
@ -58,6 +58,7 @@ import org.apache.lucene.store.DataInput;
|
|||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.ReadAdvice;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BitUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
@ -128,7 +129,7 @@ public final class Lucene90CompressingStoredFieldsReader extends StoredFieldsRea
|
|||
ChecksumIndexInput metaIn = null;
|
||||
try {
|
||||
// Open the data file
|
||||
fieldsStream = d.openInput(fieldsStreamFN, context);
|
||||
fieldsStream = d.openInput(fieldsStreamFN, context.withReadAdvice(ReadAdvice.RANDOM));
|
||||
version =
|
||||
CodecUtil.checkIndexHeader(
|
||||
fieldsStream, formatName, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
|
||||
|
|
|
@ -59,6 +59,7 @@ import org.apache.lucene.store.Directory;
|
|||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.RandomAccessInput;
|
||||
import org.apache.lucene.store.ReadAdvice;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
@ -134,7 +135,7 @@ public final class Lucene90CompressingTermVectorsReader extends TermVectorsReade
|
|||
// Open the data file
|
||||
final String vectorsStreamFN =
|
||||
IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_EXTENSION);
|
||||
vectorsStream = d.openInput(vectorsStreamFN, context);
|
||||
vectorsStream = d.openInput(vectorsStreamFN, context.withReadAdvice(ReadAdvice.RANDOM));
|
||||
version =
|
||||
CodecUtil.checkIndexHeader(
|
||||
vectorsStream, formatName, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
|
||||
|
|
|
@ -38,7 +38,9 @@ import org.apache.lucene.index.SegmentReadState;
|
|||
import org.apache.lucene.index.VectorEncoding;
|
||||
import org.apache.lucene.index.VectorSimilarityFunction;
|
||||
import org.apache.lucene.store.ChecksumIndexInput;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.ReadAdvice;
|
||||
import org.apache.lucene.util.Accountable;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
@ -66,7 +68,10 @@ public final class Lucene99FlatVectorsReader extends FlatVectorsReader {
|
|||
state,
|
||||
versionMeta,
|
||||
Lucene99FlatVectorsFormat.VECTOR_DATA_EXTENSION,
|
||||
Lucene99FlatVectorsFormat.VECTOR_DATA_CODEC_NAME);
|
||||
Lucene99FlatVectorsFormat.VECTOR_DATA_CODEC_NAME,
|
||||
// Flat formats are used to randomly access vectors from their node ID that is stored
|
||||
// in the HNSW graph.
|
||||
state.context.withReadAdvice(ReadAdvice.RANDOM));
|
||||
success = true;
|
||||
} finally {
|
||||
if (success == false) {
|
||||
|
@ -102,11 +107,15 @@ public final class Lucene99FlatVectorsReader extends FlatVectorsReader {
|
|||
}
|
||||
|
||||
private static IndexInput openDataInput(
|
||||
SegmentReadState state, int versionMeta, String fileExtension, String codecName)
|
||||
SegmentReadState state,
|
||||
int versionMeta,
|
||||
String fileExtension,
|
||||
String codecName,
|
||||
IOContext context)
|
||||
throws IOException {
|
||||
String fileName =
|
||||
IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, fileExtension);
|
||||
IndexInput in = state.directory.openInput(fileName, state.context);
|
||||
IndexInput in = state.directory.openInput(fileName, context);
|
||||
boolean success = false;
|
||||
try {
|
||||
int versionVectorData =
|
||||
|
|
|
@ -44,8 +44,10 @@ import org.apache.lucene.index.SegmentWriteState;
|
|||
import org.apache.lucene.index.Sorter;
|
||||
import org.apache.lucene.index.VectorEncoding;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.ReadAdvice;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
@ -283,10 +285,13 @@ public final class Lucene99FlatVectorsWriter extends FlatVectorsWriter {
|
|||
CodecUtil.writeFooter(tempVectorData);
|
||||
IOUtils.close(tempVectorData);
|
||||
|
||||
// copy the temporary file vectors to the actual data file
|
||||
// This temp file will be accessed in a random-access fashion to construct the HNSW graph.
|
||||
// Note: don't use the context from the state, which is a flush/merge context, not expecting
|
||||
// to perform random reads.
|
||||
vectorDataInput =
|
||||
segmentWriteState.directory.openInput(
|
||||
tempVectorData.getName(), segmentWriteState.context);
|
||||
tempVectorData.getName(), IOContext.DEFAULT.withReadAdvice(ReadAdvice.RANDOM));
|
||||
// copy the temporary file vectors to the actual data file
|
||||
vectorData.copyBytes(vectorDataInput, vectorDataInput.length() - CodecUtil.footerLength());
|
||||
CodecUtil.retrieveChecksum(vectorDataInput);
|
||||
long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
|
||||
|
|
|
@ -40,8 +40,10 @@ import org.apache.lucene.index.VectorSimilarityFunction;
|
|||
import org.apache.lucene.search.KnnCollector;
|
||||
import org.apache.lucene.store.ChecksumIndexInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.RandomAccessInput;
|
||||
import org.apache.lucene.store.ReadAdvice;
|
||||
import org.apache.lucene.util.Accountable;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
@ -102,7 +104,8 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
|
|||
state,
|
||||
versionMeta,
|
||||
Lucene99HnswVectorsFormat.VECTOR_INDEX_EXTENSION,
|
||||
Lucene99HnswVectorsFormat.VECTOR_INDEX_CODEC_NAME);
|
||||
Lucene99HnswVectorsFormat.VECTOR_INDEX_CODEC_NAME,
|
||||
state.context.withReadAdvice(ReadAdvice.RANDOM));
|
||||
success = true;
|
||||
} finally {
|
||||
if (success == false) {
|
||||
|
@ -112,11 +115,15 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
|
|||
}
|
||||
|
||||
private static IndexInput openDataInput(
|
||||
SegmentReadState state, int versionMeta, String fileExtension, String codecName)
|
||||
SegmentReadState state,
|
||||
int versionMeta,
|
||||
String fileExtension,
|
||||
String codecName,
|
||||
IOContext context)
|
||||
throws IOException {
|
||||
String fileName =
|
||||
IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, fileExtension);
|
||||
IndexInput in = state.directory.openInput(fileName, state.context);
|
||||
IndexInput in = state.directory.openInput(fileName, context);
|
||||
boolean success = false;
|
||||
try {
|
||||
int versionVectorData =
|
||||
|
|
|
@ -36,7 +36,9 @@ import org.apache.lucene.index.SegmentReadState;
|
|||
import org.apache.lucene.index.VectorEncoding;
|
||||
import org.apache.lucene.index.VectorSimilarityFunction;
|
||||
import org.apache.lucene.store.ChecksumIndexInput;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.ReadAdvice;
|
||||
import org.apache.lucene.util.Accountable;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
@ -93,7 +95,10 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
|
|||
state,
|
||||
versionMeta,
|
||||
Lucene99ScalarQuantizedVectorsFormat.VECTOR_DATA_EXTENSION,
|
||||
Lucene99ScalarQuantizedVectorsFormat.VECTOR_DATA_CODEC_NAME);
|
||||
Lucene99ScalarQuantizedVectorsFormat.VECTOR_DATA_CODEC_NAME,
|
||||
// Quantized vectors are accessed randomly from their node ID stored in the HNSW
|
||||
// graph.
|
||||
state.context.withReadAdvice(ReadAdvice.RANDOM));
|
||||
success = true;
|
||||
} finally {
|
||||
if (success == false) {
|
||||
|
@ -166,11 +171,15 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
|
|||
}
|
||||
|
||||
private static IndexInput openDataInput(
|
||||
SegmentReadState state, int versionMeta, String fileExtension, String codecName)
|
||||
SegmentReadState state,
|
||||
int versionMeta,
|
||||
String fileExtension,
|
||||
String codecName,
|
||||
IOContext context)
|
||||
throws IOException {
|
||||
String fileName =
|
||||
IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, fileExtension);
|
||||
IndexInput in = state.directory.openInput(fileName, state.context);
|
||||
IndexInput in = state.directory.openInput(fileName, context);
|
||||
boolean success = false;
|
||||
try {
|
||||
int versionVectorData =
|
||||
|
|
Loading…
Reference in New Issue