LUCENE-9583: extract separate RandomAccessVectorValues interface (#2037)

This commit is contained in:
Michael Sokolov 2020-11-09 10:46:16 -05:00 committed by GitHub
parent be19432b75
commit 8be0cea544
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 151 additions and 91 deletions

View File

@ -27,6 +27,8 @@ import org.apache.lucene.codecs.VectorReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.RandomAccessVectorValues;
import org.apache.lucene.index.RandomAccessVectorValuesProducer;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.TopDocs;
@ -158,7 +160,7 @@ public class SimpleTextVectorReader extends VectorReader {
}
}
private static class SimpleTextVectorValues extends VectorValues implements VectorValues.RandomAccess {
private static class SimpleTextVectorValues extends VectorValues implements RandomAccessVectorValues, RandomAccessVectorValuesProducer {
private final BytesRefBuilder scratch = new BytesRefBuilder();
private final FieldEntry entry;
@ -205,7 +207,7 @@ public class SimpleTextVectorReader extends VectorReader {
}
@Override
public RandomAccess randomAccess() {
public RandomAccessVectorValues randomAccess() {
return this;
}
@ -236,8 +238,8 @@ public class SimpleTextVectorReader extends VectorReader {
}
private void readAllVectors() throws IOException {
for (int i = 0; i < values.length; i++) {
readVector(values[i]);
for (float[] value : values) {
readVector(value);
}
}

View File

@ -26,6 +26,8 @@ import java.util.List;
import org.apache.lucene.index.DocIDMerger;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.RandomAccessVectorValues;
import org.apache.lucene.index.RandomAccessVectorValuesProducer;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.BytesRef;
@ -135,7 +137,7 @@ public abstract class VectorWriter implements Closeable {
* View over multiple VectorValues supporting iterator-style access via DocIDMerger. Maintains a reverse ordinal
* mapping for documents having values in order to support random access by dense ordinal.
*/
private static class VectorValuesMerger extends VectorValues {
private static class VectorValuesMerger extends VectorValues implements RandomAccessVectorValuesProducer {
private final List<VectorValuesSub> subs;
private final DocIDMerger<VectorValuesSub> docIdMerger;
private final int[] ordBase;
@ -198,7 +200,7 @@ public abstract class VectorWriter implements Closeable {
}
@Override
public RandomAccess randomAccess() {
public RandomAccessVectorValues randomAccess() {
return new MergerRandomAccess();
}
@ -227,14 +229,23 @@ public abstract class VectorWriter implements Closeable {
return subs.get(0).values.searchStrategy();
}
class MergerRandomAccess implements VectorValues.RandomAccess {
@Override
public TopDocs search(float[] target, int k, int fanout) throws IOException {
throw new UnsupportedOperationException();
}
private final List<RandomAccess> raSubs;
class MergerRandomAccess implements RandomAccessVectorValues {
private final List<RandomAccessVectorValues> raSubs;
MergerRandomAccess() {
raSubs = new ArrayList<>(subs.size());
for (VectorValuesSub sub : subs) {
raSubs.add(sub.values.randomAccess());
if (sub.values instanceof RandomAccessVectorValuesProducer) {
raSubs.add(((RandomAccessVectorValuesProducer) sub.values).randomAccess());
} else {
throw new IllegalStateException("Cannot merge VectorValues without support for random access");
}
}
}
@ -273,11 +284,6 @@ public abstract class VectorWriter implements Closeable {
throw new UnsupportedOperationException();
}
@Override
public TopDocs search(float[] target, int k, int fanout) throws IOException {
throw new UnsupportedOperationException();
}
}
}
}

View File

@ -29,6 +29,8 @@ import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.RandomAccessVectorValues;
import org.apache.lucene.index.RandomAccessVectorValuesProducer;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.TopDocs;
@ -196,7 +198,7 @@ public final class Lucene90VectorReader extends VectorReader {
}
/** Read the vector values from the index input. This supports both iterated and random access. */
private final static class OffHeapVectorValues extends VectorValues {
private final class OffHeapVectorValues extends VectorValues implements RandomAccessVectorValuesProducer {
final FieldEntry fieldEntry;
final IndexInput dataIn;
@ -250,6 +252,11 @@ public final class Lucene90VectorReader extends VectorReader {
return binaryValue;
}
@Override
public TopDocs search(float[] target, int k, int fanout) {
throw new UnsupportedOperationException();
}
@Override
public int docID() {
return doc;
@ -277,12 +284,12 @@ public final class Lucene90VectorReader extends VectorReader {
}
@Override
public RandomAccess randomAccess() {
public RandomAccessVectorValues randomAccess() {
return new OffHeapRandomAccess(dataIn.clone());
}
class OffHeapRandomAccess implements VectorValues.RandomAccess {
class OffHeapRandomAccess implements RandomAccessVectorValues {
final IndexInput dataIn;
@ -336,10 +343,6 @@ public final class Lucene90VectorReader extends VectorReader {
dataIn.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize);
}
@Override
public TopDocs search(float[] vector, int topK, int fanout) throws IOException {
throw new UnsupportedOperationException();
}
}
}
}

View File

@ -0,0 +1,60 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.util.BytesRef;
/**
* Provides random access to vectors by dense ordinal. Valid ordinals are &ge; 0 and &lt; {@link #size()}.
* Instances are obtained from a {@link RandomAccessVectorValuesProducer}.
*
* @lucene.experimental
*/
public interface RandomAccessVectorValues {
/**
* Return the number of vector values; this is also the exclusive upper bound on the ordinals
* accepted by {@link #vectorValue(int)} and {@link #binaryValue(int)}.
* @return the number of vector values
*/
int size();
/**
* Return the dimension of the returned vector values
* @return the dimension of the vectors
*/
int dimension();
/**
* Return the search strategy used to compare these vectors
* @return the search strategy of these vectors
*/
VectorValues.SearchStrategy searchStrategy();
/**
* Return the vector value indexed at the given ordinal. The returned floating point array may
* be shared and overwritten by subsequent calls to this method and {@link #binaryValue(int)};
* callers that need to retain the values must copy them.
* @param targetOrd a valid ordinal, &ge; 0 and &lt; {@link #size()}.
* @return the vector value at the given ordinal
* @throws IOException if an I/O error occurs while retrieving the vector
*/
float[] vectorValue(int targetOrd) throws IOException;
/**
* Return the vector indexed at the given ordinal value as an array of bytes in a BytesRef;
* these are the bytes corresponding to the float array. The returned bytes may be shared and overwritten
* by subsequent calls to this method and {@link #vectorValue(int)}; callers that need to retain
* the bytes must copy them.
* @param targetOrd a valid ordinal, &ge; 0 and &lt; {@link #size()}.
* @return the bytes of the vector at the given ordinal
* @throws IOException if an I/O error occurs while retrieving the vector
*/
BytesRef binaryValue(int targetOrd) throws IOException;
}

View File

@ -0,0 +1,32 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
/**
* Something (generally a {@link VectorValues}) that provides a {@link RandomAccessVectorValues}.
*
* @lucene.experimental
*/
public interface RandomAccessVectorValuesProducer {
/**
* Return a random access interface over this producer's vectors. When the producer is itself an
* iterator (such as a {@link VectorValues}), calling methods on the returned
* {@link RandomAccessVectorValues} will have no effect on the progress of the iteration or the values
* returned by the iterator. Successive calls will retrieve independent copies that do not overwrite
* each other's returned values.
* @return a random access view over the vectors
*/
RandomAccessVectorValues randomAccess();
}

View File

@ -75,68 +75,23 @@ public abstract class VectorValues extends DocIdSetIterator {
}
/**
* Return a random access interface over this iterator's vectors. Calling the RandomAccess methods will
* have no effect on the progress of the iteration or the values returned by this iterator. Successive calls
* will retrieve independent copies that do not overwrite each others' returned values.
* Return the k nearest neighbor documents as determined by comparison of their vector values
* for this field, to the given vector, by the field's search strategy. If the search strategy is
* reversed, lower values indicate nearer vectors, otherwise higher scores indicate nearer
* vectors. Unlike relevance scores, vector scores may be negative.
* @param target the vector-valued query
* @param k the number of docs to return
* @param fanout control the accuracy/speed tradeoff - larger values give better recall at higher cost
* @return the k nearest neighbor documents, along with their (searchStrategy-specific) scores.
*/
public abstract RandomAccess randomAccess();
/**
* Provides random access to vectors by dense ordinal.
*
* @lucene.experimental
*/
public interface RandomAccess {
/**
* Return the number of vector values
*/
int size();
/**
* Return the dimension of the returned vector values
*/
int dimension();
/**
* Return the search strategy used to compare these vectors
*/
SearchStrategy searchStrategy();
/**
* Return the vector value indexed at the given ordinal. The provided floating point array may
* be shared and overwritten by subsequent calls to this method and {@link #binaryValue(int)}.
* @param targetOrd a valid ordinal, &ge; 0 and &lt; {@link #size()}.
*/
float[] vectorValue(int targetOrd) throws IOException;
/**
* Return the vector indexed at the given ordinal value as an array of bytes in a BytesRef;
* these are the bytes corresponding to the float array. The provided bytes may be shared and overwritten
* by subsequent calls to this method and {@link #vectorValue(int)}.
* @param targetOrd a valid ordinal, &ge; 0 and &lt; {@link #size()}.
*/
BytesRef binaryValue(int targetOrd) throws IOException;
/**
* Return the k nearest neighbor documents as determined by comparison of their vector values
* for this field, to the given vector, by the field's search strategy. If the search strategy is
* reversed, lower values indicate nearer vectors, otherwise higher scores indicate nearer
* vectors. Unlike relevance scores, vector scores may be negative.
* @param target the vector-valued query
* @param k the number of docs to return
* @param fanout control the accuracy/speed tradeoff - larger values give better recall at higher cost
* @return the k nearest neighbor documents, along with their (searchStrategy-specific) scores.
*/
TopDocs search(float[] target, int k, int fanout) throws IOException;
}
public abstract TopDocs search(float[] target, int k, int fanout) throws IOException;
/**
* Search strategy. This is a label describing the method used during indexing and searching of the vectors in order to
* determine the nearest neighbors.
*/
public enum SearchStrategy {
/** No search strategy is provided. Note: {@link VectorValues.RandomAccess#search(float[], int, int)}
/** No search strategy is provided. Note: {@link VectorValues#search(float[], int, int)}
* is not supported for fields specifying this strategy. */
NONE,
@ -174,7 +129,7 @@ public abstract class VectorValues extends DocIdSetIterator {
}
@Override
public RandomAccess randomAccess() {
public TopDocs search(float[] target, int k, int fanout) {
throw new UnsupportedOperationException();
}

View File

@ -98,17 +98,17 @@ class VectorValuesWriter {
}
}
static class SortingVectorValues extends VectorValues {
static class SortingVectorValues extends VectorValues implements RandomAccessVectorValuesProducer {
private final VectorValues delegate;
private final VectorValues.RandomAccess randomAccess;
private final RandomAccessVectorValues randomAccess;
private final int[] docIdOffsets;
private final int[] ordMap;
private int docId = -1;
SortingVectorValues(VectorValues delegate, Sorter.DocMap sortMap) throws IOException {
this.delegate = delegate;
randomAccess = delegate.randomAccess();
randomAccess = ((RandomAccessVectorValuesProducer) delegate).randomAccess();
docIdOffsets = new int[sortMap.size()];
int offset = 1; // 0 means no vector for this (field, document)
@ -181,10 +181,16 @@ class VectorValuesWriter {
return size();
}
@Override
public RandomAccess randomAccess() {
RandomAccess ra = delegate.randomAccess();
return new RandomAccess() {
public TopDocs search(float[] target, int k, int fanout) {
throw new UnsupportedOperationException();
}
@Override
public RandomAccessVectorValues randomAccess() {
return new RandomAccessVectorValues() {
@Override
public int size() {
@ -203,7 +209,7 @@ class VectorValuesWriter {
@Override
public float[] vectorValue(int targetOrd) throws IOException {
return ra.vectorValue(ordMap[targetOrd]);
return randomAccess.vectorValue(ordMap[targetOrd]);
}
@Override
@ -211,15 +217,11 @@ class VectorValuesWriter {
throw new UnsupportedOperationException();
}
@Override
public TopDocs search(float[] target, int k, int fanout) {
throw new UnsupportedOperationException();
}
};
}
}
private static class BufferedVectorValues extends VectorValues implements VectorValues.RandomAccess {
private static class BufferedVectorValues extends VectorValues implements RandomAccessVectorValues, RandomAccessVectorValuesProducer {
final DocsWithFieldSet docsWithField;
@ -249,7 +251,7 @@ class VectorValuesWriter {
}
@Override
public RandomAccess randomAccess() {
public RandomAccessVectorValues randomAccess() {
return this;
}

View File

@ -593,7 +593,7 @@ public class TestVectorValues extends LuceneTestCase {
assertEquals(4f, vectorValues.vectorValue()[0], 0);
assertEquals(NO_MORE_DOCS, vectorValues.nextDoc());
VectorValues.RandomAccess ra = vectorValues.randomAccess();
RandomAccessVectorValues ra = ((RandomAccessVectorValuesProducer) vectorValues).randomAccess();
assertEquals(1f, ra.vectorValue(0)[0], 0);
assertEquals(2f, ra.vectorValue(1)[0], 0);
assertEquals(4f, ra.vectorValue(2)[0], 0);