mirror of https://github.com/apache/lucene.git
LUCENE-10411: Add NN vectors support to ExitableDirectoryReader (#833)
This commit is contained in:
parent
a06460a538
commit
96036bca9f
|
@ -141,6 +141,9 @@ Optimizations
|
||||||
* LUCENE-8836: Speed up calls to TermsEnum#lookupOrd on doc values terms enums
|
* LUCENE-8836: Speed up calls to TermsEnum#lookupOrd on doc values terms enums
|
||||||
and sequences of increasing ords. (Bruno Roustant, Adrien Grand)
|
and sequences of increasing ords. (Bruno Roustant, Adrien Grand)
|
||||||
|
|
||||||
|
* LUCENE-10411: Add nearest neighbors vectors support to ExitableDirectoryReader.
|
||||||
|
(Zach Chen, Adrien Grand, Julie Tibshirani, Tomoko Uchida)
|
||||||
|
|
||||||
* LUCENE-10542: FieldSource exists implementations can avoid value retrieval (Kevin Risden)
|
* LUCENE-10542: FieldSource exists implementations can avoid value retrieval (Kevin Risden)
|
||||||
|
|
||||||
* LUCENE-10534: MinFloatFunction / MaxFloatFunction exists check can be slow (Kevin Risden)
|
* LUCENE-10534: MinFloatFunction / MaxFloatFunction exists check can be slow (Kevin Risden)
|
||||||
|
|
|
@ -20,6 +20,8 @@ import java.io.IOException;
|
||||||
import org.apache.lucene.index.FilterLeafReader.FilterTerms;
|
import org.apache.lucene.index.FilterLeafReader.FilterTerms;
|
||||||
import org.apache.lucene.index.FilterLeafReader.FilterTermsEnum;
|
import org.apache.lucene.index.FilterLeafReader.FilterTermsEnum;
|
||||||
import org.apache.lucene.search.DocIdSetIterator;
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
|
import org.apache.lucene.search.TopDocs;
|
||||||
|
import org.apache.lucene.util.Bits;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||||
|
|
||||||
|
@ -323,6 +325,62 @@ public class ExitableDirectoryReader extends FilterDirectoryReader {
|
||||||
: sortedSetDocValues;
|
: sortedSetDocValues;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public VectorValues getVectorValues(String field) throws IOException {
|
||||||
|
final VectorValues vectorValues = in.getVectorValues(field);
|
||||||
|
if (vectorValues == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return (queryTimeout.isTimeoutEnabled())
|
||||||
|
? new ExitableVectorValues(vectorValues)
|
||||||
|
: vectorValues;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TopDocs searchNearestVectors(
|
||||||
|
String field, float[] target, int k, Bits acceptDocs, int visitedLimit) throws IOException {
|
||||||
|
|
||||||
|
// when acceptDocs is null due to no doc deleted, we will instantiate a new one that would
|
||||||
|
// match all docs to allow timeout checking.
|
||||||
|
final Bits updatedAcceptDocs =
|
||||||
|
acceptDocs == null ? new Bits.MatchAllBits(maxDoc()) : acceptDocs;
|
||||||
|
|
||||||
|
Bits timeoutCheckingAcceptDocs =
|
||||||
|
new Bits() {
|
||||||
|
private static final int MAX_CALLS_BEFORE_QUERY_TIMEOUT_CHECK = 10;
|
||||||
|
private int calls;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean get(int index) {
|
||||||
|
if (calls++ % MAX_CALLS_BEFORE_QUERY_TIMEOUT_CHECK == 0) {
|
||||||
|
checkAndThrowForSearchVectors();
|
||||||
|
}
|
||||||
|
|
||||||
|
return updatedAcceptDocs.get(index);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int length() {
|
||||||
|
return updatedAcceptDocs.length();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
return in.searchNearestVectors(field, target, k, timeoutCheckingAcceptDocs, visitedLimit);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void checkAndThrowForSearchVectors() {
|
||||||
|
if (queryTimeout.shouldExit()) {
|
||||||
|
throw new ExitingReaderException(
|
||||||
|
"The request took too long to search nearest vectors. Timeout: "
|
||||||
|
+ queryTimeout.toString()
|
||||||
|
+ ", Reader="
|
||||||
|
+ in);
|
||||||
|
} else if (Thread.interrupted()) {
|
||||||
|
throw new ExitingReaderException(
|
||||||
|
"Interrupted while searching nearest vectors. Reader=" + in);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Throws {@link ExitingReaderException} if {@link QueryTimeout#shouldExit()} returns true, or
|
* Throws {@link ExitingReaderException} if {@link QueryTimeout#shouldExit()} returns true, or
|
||||||
* if {@link Thread#interrupted()} returns true.
|
* if {@link Thread#interrupted()} returns true.
|
||||||
|
@ -338,7 +396,65 @@ public class ExitableDirectoryReader extends FilterDirectoryReader {
|
||||||
+ in);
|
+ in);
|
||||||
} else if (Thread.interrupted()) {
|
} else if (Thread.interrupted()) {
|
||||||
throw new ExitingReaderException(
|
throw new ExitingReaderException(
|
||||||
"Interrupted while iterating over point values. PointValues=" + in);
|
"Interrupted while iterating over doc values. DocValues=" + in);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class ExitableVectorValues extends FilterVectorValues {
|
||||||
|
private int docToCheck;
|
||||||
|
|
||||||
|
public ExitableVectorValues(VectorValues vectorValues) {
|
||||||
|
super(vectorValues);
|
||||||
|
docToCheck = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int advance(int target) throws IOException {
|
||||||
|
final int advance = super.advance(target);
|
||||||
|
if (advance >= docToCheck) {
|
||||||
|
checkAndThrow();
|
||||||
|
docToCheck = advance + DOCS_BETWEEN_TIMEOUT_CHECK;
|
||||||
|
}
|
||||||
|
return advance;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int nextDoc() throws IOException {
|
||||||
|
final int nextDoc = super.nextDoc();
|
||||||
|
if (nextDoc >= docToCheck) {
|
||||||
|
checkAndThrow();
|
||||||
|
docToCheck = nextDoc + DOCS_BETWEEN_TIMEOUT_CHECK;
|
||||||
|
}
|
||||||
|
return nextDoc;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public float[] vectorValue() throws IOException {
|
||||||
|
checkAndThrow();
|
||||||
|
return in.vectorValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BytesRef binaryValue() throws IOException {
|
||||||
|
checkAndThrow();
|
||||||
|
return in.binaryValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Throws {@link ExitingReaderException} if {@link QueryTimeout#shouldExit()} returns true, or
|
||||||
|
* if {@link Thread#interrupted()} returns true.
|
||||||
|
*/
|
||||||
|
private void checkAndThrow() {
|
||||||
|
if (queryTimeout.shouldExit()) {
|
||||||
|
throw new ExitingReaderException(
|
||||||
|
"The request took too long to iterate over vector values. Timeout: "
|
||||||
|
+ queryTimeout.toString()
|
||||||
|
+ ", VectorValues="
|
||||||
|
+ in);
|
||||||
|
} else if (Thread.interrupted()) {
|
||||||
|
throw new ExitingReaderException(
|
||||||
|
"Interrupted while iterating over vector values. VectorValues=" + in);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,75 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.index;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Objects;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
|
||||||
|
/** Delegates all methods to a wrapped {@link VectorValues}. */
|
||||||
|
public abstract class FilterVectorValues extends VectorValues {
|
||||||
|
|
||||||
|
/** Wrapped values */
|
||||||
|
protected final VectorValues in;
|
||||||
|
|
||||||
|
/** Sole constructor */
|
||||||
|
protected FilterVectorValues(VectorValues in) {
|
||||||
|
Objects.requireNonNull(in);
|
||||||
|
this.in = in;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int docID() {
|
||||||
|
return in.docID();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int nextDoc() throws IOException {
|
||||||
|
return in.nextDoc();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int advance(int target) throws IOException {
|
||||||
|
return in.advance(target);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long cost() {
|
||||||
|
return in.cost();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int dimension() {
|
||||||
|
return in.dimension();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int size() {
|
||||||
|
return in.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public float[] vectorValue() throws IOException {
|
||||||
|
return in.vectorValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BytesRef binaryValue() throws IOException {
|
||||||
|
return in.binaryValue();
|
||||||
|
}
|
||||||
|
}
|
|
@ -16,6 +16,8 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.index;
|
package org.apache.lucene.index;
|
||||||
|
|
||||||
|
import static com.carrotsearch.randomizedtesting.RandomizedTest.atMost;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import org.apache.lucene.document.*;
|
import org.apache.lucene.document.*;
|
||||||
|
@ -428,6 +430,101 @@ public class TestExitableDirectoryReader extends LuceneTestCase {
|
||||||
directory.close();
|
directory.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testVectorValues() throws IOException {
|
||||||
|
Directory directory = newDirectory();
|
||||||
|
IndexWriter writer =
|
||||||
|
new IndexWriter(directory, newIndexWriterConfig(new MockAnalyzer(random())));
|
||||||
|
|
||||||
|
int numDoc = atLeast(20);
|
||||||
|
int deletedDoc = atMost(5);
|
||||||
|
int dimension = atLeast(3);
|
||||||
|
|
||||||
|
for (int i = 0; i < numDoc; i++) {
|
||||||
|
Document doc = new Document();
|
||||||
|
|
||||||
|
float[] value = new float[dimension];
|
||||||
|
for (int j = 0; j < dimension; j++) {
|
||||||
|
value[j] = random().nextFloat();
|
||||||
|
}
|
||||||
|
FieldType fieldType =
|
||||||
|
KnnVectorField.createFieldType(dimension, VectorSimilarityFunction.COSINE);
|
||||||
|
doc.add(new KnnVectorField("vector", value, fieldType));
|
||||||
|
|
||||||
|
doc.add(new StringField("id", Integer.toString(i), Field.Store.YES));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
writer.forceMerge(1);
|
||||||
|
writer.commit();
|
||||||
|
|
||||||
|
for (int i = 0; i < deletedDoc; i++) {
|
||||||
|
writer.deleteDocuments(new Term("id", Integer.toString(i)));
|
||||||
|
}
|
||||||
|
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
QueryTimeout queryTimeout;
|
||||||
|
if (random().nextBoolean()) {
|
||||||
|
if (random().nextBoolean()) {
|
||||||
|
queryTimeout = immediateQueryTimeout();
|
||||||
|
} else {
|
||||||
|
queryTimeout = infiniteQueryTimeout();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
queryTimeout = disabledQueryTimeout();
|
||||||
|
}
|
||||||
|
|
||||||
|
DirectoryReader directoryReader = DirectoryReader.open(directory);
|
||||||
|
DirectoryReader exitableDirectoryReader =
|
||||||
|
new ExitableDirectoryReader(directoryReader, queryTimeout);
|
||||||
|
IndexReader reader = new TestReader(getOnlyLeafReader(exitableDirectoryReader));
|
||||||
|
|
||||||
|
LeafReaderContext context = reader.leaves().get(0);
|
||||||
|
LeafReader leaf = context.reader();
|
||||||
|
|
||||||
|
if (queryTimeout.shouldExit()) {
|
||||||
|
expectThrows(
|
||||||
|
ExitingReaderException.class,
|
||||||
|
() -> {
|
||||||
|
DocIdSetIterator iter = leaf.getVectorValues("vector");
|
||||||
|
scanAndRetrieve(leaf, iter);
|
||||||
|
});
|
||||||
|
|
||||||
|
expectThrows(
|
||||||
|
ExitingReaderException.class,
|
||||||
|
() ->
|
||||||
|
leaf.searchNearestVectors(
|
||||||
|
"vector", new float[dimension], 5, leaf.getLiveDocs(), Integer.MAX_VALUE));
|
||||||
|
} else {
|
||||||
|
DocIdSetIterator iter = leaf.getVectorValues("vector");
|
||||||
|
scanAndRetrieve(leaf, iter);
|
||||||
|
|
||||||
|
leaf.searchNearestVectors(
|
||||||
|
"vector", new float[dimension], 5, leaf.getLiveDocs(), Integer.MAX_VALUE);
|
||||||
|
}
|
||||||
|
|
||||||
|
reader.close();
|
||||||
|
directory.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void scanAndRetrieve(LeafReader leaf, DocIdSetIterator iter) throws IOException {
|
||||||
|
for (iter.nextDoc();
|
||||||
|
iter.docID() != DocIdSetIterator.NO_MORE_DOCS && iter.docID() < leaf.maxDoc(); ) {
|
||||||
|
final int nextDocId = iter.docID() + 1;
|
||||||
|
if (random().nextBoolean() && nextDocId < leaf.maxDoc()) {
|
||||||
|
iter.advance(nextDocId);
|
||||||
|
} else {
|
||||||
|
iter.nextDoc();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (random().nextBoolean()
|
||||||
|
&& iter.docID() != DocIdSetIterator.NO_MORE_DOCS
|
||||||
|
&& iter instanceof VectorValues) {
|
||||||
|
((VectorValues) iter).vectorValue();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private static void scan(LeafReader leaf, DocValuesIterator iter) throws IOException {
|
private static void scan(LeafReader leaf, DocValuesIterator iter) throws IOException {
|
||||||
for (iter.nextDoc();
|
for (iter.nextDoc();
|
||||||
iter.docID() != DocIdSetIterator.NO_MORE_DOCS && iter.docID() < leaf.maxDoc(); ) {
|
iter.docID() != DocIdSetIterator.NO_MORE_DOCS && iter.docID() < leaf.maxDoc(); ) {
|
||||||
|
|
Loading…
Reference in New Issue