From 4ea2bae1198bb3052912e08faa3a4c2cdaa67db0 Mon Sep 17 00:00:00 2001
From: Adrien Grand
Date: Thu, 4 Apr 2024 18:31:13 +0200
Subject: [PATCH] Use `ReadAdvice#RANDOM` when appropriate. (#13222)

This switches the following files to `ReadAdvice.RANDOM`:
 - Stored fields data file.
 - Term vectors data file.
 - HNSW graph.
 - Temporary file storing vectors at merge time, which we use to construct the
   merged HNSW graph.
 - Vector data files, including quantized data files.

I hesitated to use `ReadAdvice.RANDOM` on terms, since they have a random
access pattern when running term queries, but a more sequential access pattern
when running multi-term queries. I erred on the conservative side and did not
switch them to `ReadAdvice.RANDOM` for now.

For simplicity, I'm only touching the current codec, not previous codecs.

There are also some known issues:
 - These files will keep using a `RANDOM` `ReadAdvice` at merge time. We would
   need some way for merge instances to get an updated `IOContext`; we have the
   same problem with `IOContext#LOAD` today.
 - With quantized vectors, raw vectors don't have a random access pattern, but
   it was challenging to give raw vectors a sequential access pattern when
   quantized vectors are present and a random access pattern otherwise. So raw
   vectors assume a random access pattern all the time.
---
 .../Lucene90CompressingStoredFieldsReader.java   |  3 ++-
 .../Lucene90CompressingTermVectorsReader.java    |  3 ++-
 .../lucene99/Lucene99FlatVectorsReader.java      | 15 ++++++++++++---
 .../lucene99/Lucene99FlatVectorsWriter.java      |  9 +++++++--
 .../lucene99/Lucene99HnswVectorsReader.java      | 13 ++++++++++---
 .../Lucene99ScalarQuantizedVectorsReader.java    | 15 ++++++++++++---
 6 files changed, 45 insertions(+), 13 deletions(-)
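
Note: the mechanical change is the same in every file touched below — the file
is opened with an `IOContext` whose read advice has been overridden to
`ReadAdvice.RANDOM` via `IOContext#withReadAdvice`, instead of the incoming
context as-is. Here is a minimal sketch of that pattern outside of any codec,
assuming the `ReadAdvice`/`withReadAdvice` APIs this patch builds on; the
directory path, file name, and class name are placeholders, and this snippet is
not part of the diff.

    import java.nio.file.Paths;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.store.IOContext;
    import org.apache.lucene.store.IndexInput;
    import org.apache.lucene.store.ReadAdvice;

    public class ReadAdviceRandomExample {
      public static void main(String[] args) throws Exception {
        try (Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"))) {
          // Advise the Directory that reads on this file will not be sequential, so
          // implementations like MMapDirectory can skip aggressive readahead.
          IOContext randomAccess = IOContext.DEFAULT.withReadAdvice(ReadAdvice.RANDOM);
          try (IndexInput in = dir.openInput("example.dat", randomAccess)) {
            in.seek(in.length() / 2); // jump around instead of scanning front to back
            System.out.println("byte at midpoint: " + in.readByte());
          }
        }
      }
    }
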
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java
index 569ca55400b..1b5c307294b 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java
@@ -58,6 +58,7 @@ import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.ReadAdvice;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BitUtil;
 import org.apache.lucene.util.BytesRef;
@@ -128,7 +129,7 @@ public final class Lucene90CompressingStoredFieldsReader extends StoredFieldsRea
     ChecksumIndexInput metaIn = null;
     try {
       // Open the data file
-      fieldsStream = d.openInput(fieldsStreamFN, context);
+      fieldsStream = d.openInput(fieldsStreamFN, context.withReadAdvice(ReadAdvice.RANDOM));
       version =
           CodecUtil.checkIndexHeader(
               fieldsStream, formatName, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsReader.java
index 73b9eafaa4d..b06da63153c 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsReader.java
@@ -59,6 +59,7 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.store.ReadAdvice;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
@@ -134,7 +135,7 @@ public final class Lucene90CompressingTermVectorsReader extends TermVectorsReade
       // Open the data file
       final String vectorsStreamFN =
           IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_EXTENSION);
-      vectorsStream = d.openInput(vectorsStreamFN, context);
+      vectorsStream = d.openInput(vectorsStreamFN, context.withReadAdvice(ReadAdvice.RANDOM));
       version =
           CodecUtil.checkIndexHeader(
               vectorsStream, formatName, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsReader.java
index 81ea9b07023..b08d3ceeb91 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsReader.java
@@ -38,7 +38,9 @@ import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.VectorEncoding;
 import org.apache.lucene.index.VectorSimilarityFunction;
 import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.ReadAdvice;
 import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.RamUsageEstimator;
@@ -66,7 +68,10 @@ public final class Lucene99FlatVectorsReader extends FlatVectorsReader {
             state,
             versionMeta,
             Lucene99FlatVectorsFormat.VECTOR_DATA_EXTENSION,
-            Lucene99FlatVectorsFormat.VECTOR_DATA_CODEC_NAME);
+            Lucene99FlatVectorsFormat.VECTOR_DATA_CODEC_NAME,
+            // Flat formats are used to randomly access vectors from their node ID that is stored
+            // in the HNSW graph.
+            state.context.withReadAdvice(ReadAdvice.RANDOM));
       success = true;
     } finally {
       if (success == false) {
@@ -102,11 +107,15 @@ public final class Lucene99FlatVectorsReader extends FlatVectorsReader {
   }
 
   private static IndexInput openDataInput(
-      SegmentReadState state, int versionMeta, String fileExtension, String codecName)
+      SegmentReadState state,
+      int versionMeta,
+      String fileExtension,
+      String codecName,
+      IOContext context)
       throws IOException {
     String fileName =
         IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, fileExtension);
-    IndexInput in = state.directory.openInput(fileName, state.context);
+    IndexInput in = state.directory.openInput(fileName, context);
     boolean success = false;
     try {
       int versionVectorData =
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
index 9f1239a1781..0491507aaf4 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java
@@ -44,8 +44,10 @@ import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.index.Sorter;
 import org.apache.lucene.index.VectorEncoding;
 import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.ReadAdvice;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.RamUsageEstimator;
@@ -283,10 +285,13 @@ public final class Lucene99FlatVectorsWriter extends FlatVectorsWriter {
     CodecUtil.writeFooter(tempVectorData);
     IOUtils.close(tempVectorData);
 
-    // copy the temporary file vectors to the actual data file
+    // This temp file will be accessed in a random-access fashion to construct the HNSW graph.
+    // Note: don't use the context from the state, which is a flush/merge context, not expecting
+    // to perform random reads.
     vectorDataInput =
         segmentWriteState.directory.openInput(
-            tempVectorData.getName(), segmentWriteState.context);
+            tempVectorData.getName(), IOContext.DEFAULT.withReadAdvice(ReadAdvice.RANDOM));
+    // copy the temporary file vectors to the actual data file
     vectorData.copyBytes(vectorDataInput, vectorDataInput.length() - CodecUtil.footerLength());
     CodecUtil.retrieveChecksum(vectorDataInput);
     long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsReader.java
index 517ef765684..a72e42425fb 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsReader.java
@@ -40,8 +40,10 @@ import org.apache.lucene.index.VectorSimilarityFunction;
 import org.apache.lucene.search.KnnCollector;
 import org.apache.lucene.store.ChecksumIndexInput;
 import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.store.ReadAdvice;
 import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.IOUtils;
@@ -102,7 +104,8 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
             state,
             versionMeta,
             Lucene99HnswVectorsFormat.VECTOR_INDEX_EXTENSION,
-            Lucene99HnswVectorsFormat.VECTOR_INDEX_CODEC_NAME);
+            Lucene99HnswVectorsFormat.VECTOR_INDEX_CODEC_NAME,
+            state.context.withReadAdvice(ReadAdvice.RANDOM));
       success = true;
     } finally {
       if (success == false) {
@@ -112,11 +115,15 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
   }
 
   private static IndexInput openDataInput(
-      SegmentReadState state, int versionMeta, String fileExtension, String codecName)
+      SegmentReadState state,
+      int versionMeta,
+      String fileExtension,
+      String codecName,
+      IOContext context)
       throws IOException {
     String fileName =
         IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, fileExtension);
-    IndexInput in = state.directory.openInput(fileName, state.context);
+    IndexInput in = state.directory.openInput(fileName, context);
     boolean success = false;
     try {
       int versionVectorData =
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java
index 3c8b8f0c490..dfd447de24c 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsReader.java
@@ -36,7 +36,9 @@ import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.VectorEncoding;
 import org.apache.lucene.index.VectorSimilarityFunction;
 import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.ReadAdvice;
 import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.RamUsageEstimator;
@@ -93,7 +95,10 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
             state,
             versionMeta,
             Lucene99ScalarQuantizedVectorsFormat.VECTOR_DATA_EXTENSION,
-            Lucene99ScalarQuantizedVectorsFormat.VECTOR_DATA_CODEC_NAME);
+            Lucene99ScalarQuantizedVectorsFormat.VECTOR_DATA_CODEC_NAME,
+            // Quantized vectors are accessed randomly from their node ID stored in the HNSW
+            // graph.
+            state.context.withReadAdvice(ReadAdvice.RANDOM));
       success = true;
     } finally {
       if (success == false) {
@@ -166,11 +171,15 @@ public final class Lucene99ScalarQuantizedVectorsReader extends FlatVectorsReade
   }
 
   private static IndexInput openDataInput(
-      SegmentReadState state, int versionMeta, String fileExtension, String codecName)
+      SegmentReadState state,
+      int versionMeta,
+      String fileExtension,
+      String codecName,
+      IOContext context)
       throws IOException {
     String fileName =
         IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, fileExtension);
-    IndexInput in = state.directory.openInput(fileName, state.context);
+    IndexInput in = state.directory.openInput(fileName, context);
     boolean success = false;
     try {
       int versionVectorData =