From d03c8f16d9d4f87ce24d603e4a9b4a7806f9b533 Mon Sep 17 00:00:00 2001
From: Benjamin Trent
Date: Fri, 7 Jul 2023 12:29:55 -0400
Subject: [PATCH] Have byte[] vectors also trigger a timeout in ExitableDirectoryReader (#12423)

`ExitableDirectoryReader` did not wrap searching for `byte[]` vectors. Consequently timeouts were
not respected with this reader when searching with `byte[]` vectors.

This commit fixes that bug.
---
 lucene/CHANGES.txt                            |  2 +
 .../lucene/index/ExitableDirectoryReader.java | 31 ++++++++
 .../index/TestExitableDirectoryReader.java    | 78 ++++++++++++++++++-
 3 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index c65b584beb1..26b1c692479 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -157,6 +157,8 @@ Bug Fixes
 
 * GITHUB#12413: Fix HNSW graph search bug that potentially leaked unapproved docs (Ben Trent).
 
+* GITHUB#12423: Respect timeouts in ExitableDirectoryReader when searching with byte[] vectors (Ben Trent).
+
 Other
 ---------------------
 
diff --git a/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java b/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java
index a572b2258af..6c1f2c93236 100644
--- a/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/ExitableDirectoryReader.java
@@ -364,6 +364,37 @@ public class ExitableDirectoryReader extends FilterDirectoryReader {
       return in.searchNearestVectors(field, target, k, timeoutCheckingAcceptDocs, visitedLimit);
     }
 
+    @Override
+    public TopDocs searchNearestVectors(
+        String field, byte[] target, int k, Bits acceptDocs, int visitedLimit) throws IOException {
+      // when acceptDocs is null due to no doc deleted, we will instantiate a new one that would
+      // match all docs to allow timeout checking.
+      final Bits updatedAcceptDocs =
+          acceptDocs == null ? new Bits.MatchAllBits(maxDoc()) : acceptDocs;
+
+      Bits timeoutCheckingAcceptDocs =
+          new Bits() {
+            private static final int MAX_CALLS_BEFORE_QUERY_TIMEOUT_CHECK = 16;
+            private int calls;
+
+            @Override
+            public boolean get(int index) {
+              if (calls++ % MAX_CALLS_BEFORE_QUERY_TIMEOUT_CHECK == 0) {
+                checkAndThrowForSearchVectors();
+              }
+
+              return updatedAcceptDocs.get(index);
+            }
+
+            @Override
+            public int length() {
+              return updatedAcceptDocs.length();
+            }
+          };
+
+      return in.searchNearestVectors(field, target, k, timeoutCheckingAcceptDocs, visitedLimit);
+    }
+
     private void checkAndThrowForSearchVectors() {
       if (queryTimeout.shouldExit()) {
         throw new ExitingReaderException(
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestExitableDirectoryReader.java b/lucene/core/src/test/org/apache/lucene/index/TestExitableDirectoryReader.java
index 4a6365b5309..5db5dbbc72c 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestExitableDirectoryReader.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestExitableDirectoryReader.java
@@ -25,6 +25,7 @@ import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.IntPoint;
+import org.apache.lucene.document.KnnByteVectorField;
 import org.apache.lucene.document.KnnFloatVectorField;
 import org.apache.lucene.document.NumericDocValuesField;
 import org.apache.lucene.document.SortedDocValuesField;
@@ -404,7 +405,7 @@ public class TestExitableDirectoryReader extends LuceneTestCase {
     directory.close();
   }
 
-  public void testVectorValues() throws IOException {
+  public void testFloatVectorValues() throws IOException {
     Directory directory = newDirectory();
     IndexWriter writer =
         new IndexWriter(directory, newIndexWriterConfig(new MockAnalyzer(random())));
@@ -485,6 +486,81 @@ public class TestExitableDirectoryReader extends LuceneTestCase {
     directory.close();
   }
 
+  public void testByteVectorValues() throws IOException {
+    Directory directory = newDirectory();
+    IndexWriter writer =
+        new IndexWriter(directory, newIndexWriterConfig(new MockAnalyzer(random())));
+
+    int numDoc = atLeast(20);
+    int deletedDoc = atMost(5);
+    int dimension = atLeast(3);
+
+    for (int i = 0; i < numDoc; i++) {
+      Document doc = new Document();
+      byte[] value = new byte[dimension];
+      random().nextBytes(value);
+      doc.add(new KnnByteVectorField("vector", value, VectorSimilarityFunction.COSINE));
+      doc.add(new StringField("id", Integer.toString(i), Field.Store.YES));
+      writer.addDocument(doc);
+    }
+
+    writer.forceMerge(1);
+    writer.commit();
+
+    for (int i = 0; i < deletedDoc; i++) {
+      writer.deleteDocuments(new Term("id", Integer.toString(i)));
+    }
+
+    writer.close();
+
+    QueryTimeout queryTimeout;
+    if (random().nextBoolean()) {
+      queryTimeout = immediateQueryTimeout();
+    } else {
+      queryTimeout = infiniteQueryTimeout();
+    }
+
+    DirectoryReader directoryReader = DirectoryReader.open(directory);
+    DirectoryReader exitableDirectoryReader = directoryReader;
+    exitableDirectoryReader = new ExitableDirectoryReader(directoryReader, queryTimeout);
+    IndexReader reader = new TestReader(getOnlyLeafReader(exitableDirectoryReader));
+
+    LeafReaderContext context = reader.leaves().get(0);
+    LeafReader leaf = context.reader();
+
+    if (queryTimeout.shouldExit()) {
+      expectThrows(
+          ExitingReaderException.class,
+          () -> {
+            DocIdSetIterator iter = leaf.getByteVectorValues("vector");
+            scanAndRetrieve(leaf, iter);
+          });
+
+      expectThrows(
+          ExitingReaderException.class,
+          () ->
+              leaf.searchNearestVectors(
+                  "vector",
+                  TestVectorUtil.randomVectorBytes(dimension),
+                  5,
+                  leaf.getLiveDocs(),
+                  Integer.MAX_VALUE));
+    } else {
+      DocIdSetIterator iter = leaf.getByteVectorValues("vector");
+      scanAndRetrieve(leaf, iter);
+
+      leaf.searchNearestVectors(
+          "vector",
+          TestVectorUtil.randomVectorBytes(dimension),
+          5,
+          leaf.getLiveDocs(),
+          Integer.MAX_VALUE);
+    }
+
+    reader.close();
+    directory.close();
+  }
+
   private static void scanAndRetrieve(LeafReader leaf, DocIdSetIterator iter) throws IOException {
     for (iter.nextDoc();
         iter.docID() != DocIdSetIterator.NO_MORE_DOCS && iter.docID() < leaf.maxDoc(); ) {