LUCENE-9583: extract separate RandomAccessVectorValues interface (#2037)

This commit is contained in:
Michael Sokolov 2020-11-09 10:46:16 -05:00 committed by GitHub
parent be19432b75
commit 8be0cea544
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 151 additions and 91 deletions

View File

@ -27,6 +27,8 @@ import org.apache.lucene.codecs.VectorReader;
import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.RandomAccessVectorValues;
import org.apache.lucene.index.RandomAccessVectorValuesProducer;
import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorValues; import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopDocs;
@ -158,7 +160,7 @@ public class SimpleTextVectorReader extends VectorReader {
} }
} }
private static class SimpleTextVectorValues extends VectorValues implements VectorValues.RandomAccess { private static class SimpleTextVectorValues extends VectorValues implements RandomAccessVectorValues, RandomAccessVectorValuesProducer {
private final BytesRefBuilder scratch = new BytesRefBuilder(); private final BytesRefBuilder scratch = new BytesRefBuilder();
private final FieldEntry entry; private final FieldEntry entry;
@ -205,7 +207,7 @@ public class SimpleTextVectorReader extends VectorReader {
} }
@Override @Override
public RandomAccess randomAccess() { public RandomAccessVectorValues randomAccess() {
return this; return this;
} }
@ -236,8 +238,8 @@ public class SimpleTextVectorReader extends VectorReader {
} }
private void readAllVectors() throws IOException { private void readAllVectors() throws IOException {
for (int i = 0; i < values.length; i++) { for (float[] value : values) {
readVector(values[i]); readVector(value);
} }
} }

View File

@ -26,6 +26,8 @@ import java.util.List;
import org.apache.lucene.index.DocIDMerger; import org.apache.lucene.index.DocIDMerger;
import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.MergeState; import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.RandomAccessVectorValues;
import org.apache.lucene.index.RandomAccessVectorValuesProducer;
import org.apache.lucene.index.VectorValues; import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
@ -135,7 +137,7 @@ public abstract class VectorWriter implements Closeable {
* View over multiple VectorValues supporting iterator-style access via DocIdMerger. Maintains a reverse ordinal * View over multiple VectorValues supporting iterator-style access via DocIdMerger. Maintains a reverse ordinal
* mapping for documents having values in order to support random access by dense ordinal. * mapping for documents having values in order to support random access by dense ordinal.
*/ */
private static class VectorValuesMerger extends VectorValues { private static class VectorValuesMerger extends VectorValues implements RandomAccessVectorValuesProducer {
private final List<VectorValuesSub> subs; private final List<VectorValuesSub> subs;
private final DocIDMerger<VectorValuesSub> docIdMerger; private final DocIDMerger<VectorValuesSub> docIdMerger;
private final int[] ordBase; private final int[] ordBase;
@ -198,7 +200,7 @@ public abstract class VectorWriter implements Closeable {
} }
@Override @Override
public RandomAccess randomAccess() { public RandomAccessVectorValues randomAccess() {
return new MergerRandomAccess(); return new MergerRandomAccess();
} }
@ -227,14 +229,23 @@ public abstract class VectorWriter implements Closeable {
return subs.get(0).values.searchStrategy(); return subs.get(0).values.searchStrategy();
} }
class MergerRandomAccess implements VectorValues.RandomAccess { @Override
public TopDocs search(float[] target, int k, int fanout) throws IOException {
throw new UnsupportedOperationException();
}
private final List<RandomAccess> raSubs; class MergerRandomAccess implements RandomAccessVectorValues {
private final List<RandomAccessVectorValues> raSubs;
MergerRandomAccess() { MergerRandomAccess() {
raSubs = new ArrayList<>(subs.size()); raSubs = new ArrayList<>(subs.size());
for (VectorValuesSub sub : subs) { for (VectorValuesSub sub : subs) {
raSubs.add(sub.values.randomAccess()); if (sub.values instanceof RandomAccessVectorValuesProducer) {
raSubs.add(((RandomAccessVectorValuesProducer) sub.values).randomAccess());
} else {
throw new IllegalStateException("Cannot merge VectorValues without support for random access");
}
} }
} }
@ -273,11 +284,6 @@ public abstract class VectorWriter implements Closeable {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@Override
public TopDocs search(float[] target, int k, int fanout) throws IOException {
throw new UnsupportedOperationException();
}
} }
} }
} }

View File

@ -29,6 +29,8 @@ import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.RandomAccessVectorValues;
import org.apache.lucene.index.RandomAccessVectorValuesProducer;
import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorValues; import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopDocs;
@ -196,7 +198,7 @@ public final class Lucene90VectorReader extends VectorReader {
} }
/** Read the vector values from the index input. This supports both iterated and random access. */ /** Read the vector values from the index input. This supports both iterated and random access. */
private final static class OffHeapVectorValues extends VectorValues { private final class OffHeapVectorValues extends VectorValues implements RandomAccessVectorValuesProducer {
final FieldEntry fieldEntry; final FieldEntry fieldEntry;
final IndexInput dataIn; final IndexInput dataIn;
@ -250,6 +252,11 @@ public final class Lucene90VectorReader extends VectorReader {
return binaryValue; return binaryValue;
} }
@Override
public TopDocs search(float[] target, int k, int fanout) {
throw new UnsupportedOperationException();
}
@Override @Override
public int docID() { public int docID() {
return doc; return doc;
@ -277,12 +284,12 @@ public final class Lucene90VectorReader extends VectorReader {
} }
@Override @Override
public RandomAccess randomAccess() { public RandomAccessVectorValues randomAccess() {
return new OffHeapRandomAccess(dataIn.clone()); return new OffHeapRandomAccess(dataIn.clone());
} }
class OffHeapRandomAccess implements VectorValues.RandomAccess { class OffHeapRandomAccess implements RandomAccessVectorValues {
final IndexInput dataIn; final IndexInput dataIn;
@ -336,10 +343,6 @@ public final class Lucene90VectorReader extends VectorReader {
dataIn.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize); dataIn.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize);
} }
@Override
public TopDocs search(float[] vector, int topK, int fanout) throws IOException {
throw new UnsupportedOperationException();
}
} }
} }
} }

View File

@ -0,0 +1,60 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.util.BytesRef;
/**
 * Provides random access to vectors by dense ordinal.
 *
 * <p>This interface declares read-only accessors only: the number of vectors, their dimension, the
 * search strategy used to compare them, and lookup of a single vector by ordinal, either as a
 * float array or as the bytes corresponding to that array.
 *
 * @lucene.experimental
 */
public interface RandomAccessVectorValues {
/**
 * Return the number of vector values.
 *
 * @return the number of vectors; valid ordinals are &ge; 0 and &lt; this value
 */
int size();
/**
 * Return the dimension of the returned vector values.
 *
 * @return the dimension of each vector
 */
int dimension();
/**
 * Return the search strategy used to compare these vectors.
 *
 * @return the {@link VectorValues.SearchStrategy} for these vectors
 */
VectorValues.SearchStrategy searchStrategy();
/**
 * Return the vector value indexed at the given ordinal. The returned floating point array may
 * be shared and overwritten by subsequent calls to this method and {@link #binaryValue(int)}.
 *
 * @param targetOrd a valid ordinal, &ge; 0 and &lt; {@link #size()}.
 * @return the vector at {@code targetOrd}; possibly a shared array, see above
 * @throws IOException declared so that implementations may perform I/O to fetch the vector
 */
float[] vectorValue(int targetOrd) throws IOException;
/**
 * Return the vector indexed at the given ordinal value as an array of bytes in a BytesRef;
 * these are the bytes corresponding to the float array. The returned bytes may be shared and overwritten
 * by subsequent calls to this method and {@link #vectorValue(int)}.
 *
 * @param targetOrd a valid ordinal, &ge; 0 and &lt; {@link #size()}.
 * @return the bytes of the vector at {@code targetOrd}; possibly shared, see above
 * @throws IOException declared so that implementations may perform I/O to fetch the vector
 */
BytesRef binaryValue(int targetOrd) throws IOException;
}

View File

@ -0,0 +1,32 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
/**
 * Something (generally a {@link VectorValues}) that provides a {@link RandomAccessVectorValues}.
 *
 * @lucene.experimental
 */
public interface RandomAccessVectorValuesProducer {
/**
 * Return a random access interface over this producer's vectors. When the producer is also an
 * iterator (such as a {@link VectorValues}), calling the {@link RandomAccessVectorValues} methods will
 * have no effect on the progress of the iteration or the values returned by the iterator. Successive calls
 * will retrieve independent copies that do not overwrite each others' returned values.
 *
 * @return a {@link RandomAccessVectorValues} view of the vectors
 */
RandomAccessVectorValues randomAccess();
}

View File

@ -74,50 +74,6 @@ public abstract class VectorValues extends DocIdSetIterator {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
/**
* Return a random access interface over this iterator's vectors. Calling the RandomAccess methods will
* have no effect on the progress of the iteration or the values returned by this iterator. Successive calls
* will retrieve independent copies that do not overwrite each others' returned values.
*/
public abstract RandomAccess randomAccess();
/**
* Provides random access to vectors by dense ordinal.
*
* @lucene.experimental
*/
public interface RandomAccess {
/**
* Return the number of vector values
*/
int size();
/**
* Return the dimension of the returned vector values
*/
int dimension();
/**
* Return the search strategy used to compare these vectors
*/
SearchStrategy searchStrategy();
/**
* Return the vector value indexed at the given ordinal. The provided floating point array may
* be shared and overwritten by subsequent calls to this method and {@link #binaryValue(int)}.
* @param targetOrd a valid ordinal, &ge; 0 and &lt; {@link #size()}.
*/
float[] vectorValue(int targetOrd) throws IOException;
/**
* Return the vector indexed at the given ordinal value as an array of bytes in a BytesRef;
* these are the bytes corresponding to the float array. The provided bytes may be shared and overwritten
* by subsequent calls to this method and {@link #vectorValue(int)}.
* @param targetOrd a valid ordinal, &ge; 0 and &lt; {@link #size()}.
*/
BytesRef binaryValue(int targetOrd) throws IOException;
/** /**
* Return the k nearest neighbor documents as determined by comparison of their vector values * Return the k nearest neighbor documents as determined by comparison of their vector values
* for this field, to the given vector, by the field's search strategy. If the search strategy is * for this field, to the given vector, by the field's search strategy. If the search strategy is
@ -128,15 +84,14 @@ public abstract class VectorValues extends DocIdSetIterator {
* @param fanout control the accuracy/speed tradeoff - larger values give better recall at higher cost * @param fanout control the accuracy/speed tradeoff - larger values give better recall at higher cost
* @return the k nearest neighbor documents, along with their (searchStrategy-specific) scores. * @return the k nearest neighbor documents, along with their (searchStrategy-specific) scores.
*/ */
TopDocs search(float[] target, int k, int fanout) throws IOException; public abstract TopDocs search(float[] target, int k, int fanout) throws IOException;
}
/** /**
* Search strategy. This is a label describing the method used during indexing and searching of the vectors in order to * Search strategy. This is a label describing the method used during indexing and searching of the vectors in order to
* determine the nearest neighbors. * determine the nearest neighbors.
*/ */
public enum SearchStrategy { public enum SearchStrategy {
/** No search strategy is provided. Note: {@link VectorValues.RandomAccess#search(float[], int, int)} /** No search strategy is provided. Note: {@link VectorValues#search(float[], int, int)}
* is not supported for fields specifying this strategy. */ * is not supported for fields specifying this strategy. */
NONE, NONE,
@ -174,7 +129,7 @@ public abstract class VectorValues extends DocIdSetIterator {
} }
@Override @Override
public RandomAccess randomAccess() { public TopDocs search(float[] target, int k, int fanout) {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }

View File

@ -98,17 +98,17 @@ class VectorValuesWriter {
} }
} }
static class SortingVectorValues extends VectorValues { static class SortingVectorValues extends VectorValues implements RandomAccessVectorValuesProducer {
private final VectorValues delegate; private final VectorValues delegate;
private final VectorValues.RandomAccess randomAccess; private final RandomAccessVectorValues randomAccess;
private final int[] docIdOffsets; private final int[] docIdOffsets;
private final int[] ordMap; private final int[] ordMap;
private int docId = -1; private int docId = -1;
SortingVectorValues(VectorValues delegate, Sorter.DocMap sortMap) throws IOException { SortingVectorValues(VectorValues delegate, Sorter.DocMap sortMap) throws IOException {
this.delegate = delegate; this.delegate = delegate;
randomAccess = delegate.randomAccess(); randomAccess = ((RandomAccessVectorValuesProducer) delegate).randomAccess();
docIdOffsets = new int[sortMap.size()]; docIdOffsets = new int[sortMap.size()];
int offset = 1; // 0 means no vector for this (field, document) int offset = 1; // 0 means no vector for this (field, document)
@ -181,10 +181,16 @@ class VectorValuesWriter {
return size(); return size();
} }
@Override @Override
public RandomAccess randomAccess() { public TopDocs search(float[] target, int k, int fanout) {
RandomAccess ra = delegate.randomAccess(); throw new UnsupportedOperationException();
return new RandomAccess() { }
@Override
public RandomAccessVectorValues randomAccess() {
return new RandomAccessVectorValues() {
@Override @Override
public int size() { public int size() {
@ -203,7 +209,7 @@ class VectorValuesWriter {
@Override @Override
public float[] vectorValue(int targetOrd) throws IOException { public float[] vectorValue(int targetOrd) throws IOException {
return ra.vectorValue(ordMap[targetOrd]); return randomAccess.vectorValue(ordMap[targetOrd]);
} }
@Override @Override
@ -211,15 +217,11 @@ class VectorValuesWriter {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@Override
public TopDocs search(float[] target, int k, int fanout) {
throw new UnsupportedOperationException();
}
}; };
} }
} }
private static class BufferedVectorValues extends VectorValues implements VectorValues.RandomAccess { private static class BufferedVectorValues extends VectorValues implements RandomAccessVectorValues, RandomAccessVectorValuesProducer {
final DocsWithFieldSet docsWithField; final DocsWithFieldSet docsWithField;
@ -249,7 +251,7 @@ class VectorValuesWriter {
} }
@Override @Override
public RandomAccess randomAccess() { public RandomAccessVectorValues randomAccess() {
return this; return this;
} }

View File

@ -593,7 +593,7 @@ public class TestVectorValues extends LuceneTestCase {
assertEquals(4f, vectorValues.vectorValue()[0], 0); assertEquals(4f, vectorValues.vectorValue()[0], 0);
assertEquals(NO_MORE_DOCS, vectorValues.nextDoc()); assertEquals(NO_MORE_DOCS, vectorValues.nextDoc());
VectorValues.RandomAccess ra = vectorValues.randomAccess(); RandomAccessVectorValues ra = ((RandomAccessVectorValuesProducer) vectorValues).randomAccess();
assertEquals(1f, ra.vectorValue(0)[0], 0); assertEquals(1f, ra.vectorValue(0)[0], 0);
assertEquals(2f, ra.vectorValue(1)[0], 0); assertEquals(2f, ra.vectorValue(1)[0], 0);
assertEquals(4f, ra.vectorValue(2)[0], 0); assertEquals(4f, ra.vectorValue(2)[0], 0);