mirror of https://github.com/apache/lucene.git
LUCENE-9573 Add Vectors to TestBackwardsCompatibility (#616)
This patch adds KNN vectors for testing backward-compatible indices: it adds a KnnVectorField to documents when creating a new backward-compatible index; adds KNN vector search and checks of vector values to the testing of search over backward-compatible indices; and adds tests for KNN vector search when changing backward-compatible indices (merging them and adding new documents to them).
This commit is contained in:
parent
df12e2b195
commit
8dfdb261e7
|
@ -16,6 +16,9 @@
|
|||
*/
|
||||
package org.apache.lucene.backward_index;
|
||||
|
||||
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
|
||||
import static org.apache.lucene.util.Version.LUCENE_10_0_0;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
@ -50,6 +53,7 @@ import org.apache.lucene.document.FieldType;
|
|||
import org.apache.lucene.document.FloatDocValuesField;
|
||||
import org.apache.lucene.document.FloatPoint;
|
||||
import org.apache.lucene.document.IntPoint;
|
||||
import org.apache.lucene.document.KnnVectorField;
|
||||
import org.apache.lucene.document.LongPoint;
|
||||
import org.apache.lucene.document.NumericDocValuesField;
|
||||
import org.apache.lucene.document.SortedDocValuesField;
|
||||
|
@ -90,10 +94,13 @@ import org.apache.lucene.index.StandardDirectoryReader;
|
|||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.VectorSimilarityFunction;
|
||||
import org.apache.lucene.index.VectorValues;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.DocValuesFieldExistsQuery;
|
||||
import org.apache.lucene.search.FieldDoc;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.KnnVectorQuery;
|
||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||
import org.apache.lucene.search.NormsFieldExistsQuery;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
|
@ -153,6 +160,16 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
|||
// Then move those 2 zip files to your trunk checkout and add them
|
||||
// to the oldNames array.
|
||||
|
||||
private static final int DOCS_COUNT = 35;
|
||||
private static final int DELETED_ID = 7;
|
||||
|
||||
// change version to 9 when we update 9 backward indices to include KNN field
|
||||
private static final int KNN_VECTOR_MIN_SUPPORTED_VERSION = LUCENE_10_0_0.major;
|
||||
private static final String KNN_VECTOR_FIELD = "knn_field";
|
||||
private static final FieldType KNN_VECTOR_FIELD_TYPE =
|
||||
KnnVectorField.createFieldType(3, VectorSimilarityFunction.COSINE);
|
||||
private static final float[] KNN_VECTOR = {0.2f, -0.1f, 0.1f};
|
||||
|
||||
public void testCreateCFS() throws IOException {
|
||||
createIndex("index.cfs", true, false);
|
||||
}
|
||||
|
@ -1037,10 +1054,11 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
public void testIndexOldIndexNoAdds() throws IOException {
|
||||
public void testIndexOldIndexNoAdds() throws Exception {
|
||||
for (String name : oldNames) {
|
||||
Directory dir = newDirectory(oldIndexDirs.get(name));
|
||||
changeIndexNoAdds(random(), dir);
|
||||
Version version = Version.parse(name.substring(0, name.indexOf('-')));
|
||||
changeIndexNoAdds(random(), dir, version);
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
|
@ -1080,7 +1098,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
|||
final Bits liveDocs = MultiBits.getLiveDocs(reader);
|
||||
assertNotNull(liveDocs);
|
||||
|
||||
for (int i = 0; i < 35; i++) {
|
||||
for (int i = 0; i < DOCS_COUNT; i++) {
|
||||
if (liveDocs.get(i)) {
|
||||
Document d = reader.document(i);
|
||||
List<IndexableField> fields = d.getFields();
|
||||
|
@ -1110,8 +1128,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
|||
Terms tfv = tfvFields.terms("utf8");
|
||||
assertNotNull("docID=" + i + " index=" + oldName, tfv);
|
||||
} else {
|
||||
// Only ID 7 is deleted
|
||||
assertEquals(7, i);
|
||||
assertEquals(DELETED_ID, i);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1137,7 +1154,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
|||
SortedNumericDocValues dvSortedNumeric =
|
||||
MultiDocValues.getSortedNumericValues(reader, "dvSortedNumeric");
|
||||
|
||||
for (int i = 0; i < 35; i++) {
|
||||
for (int i = 0; i < DOCS_COUNT; i++) {
|
||||
int id = Integer.parseInt(reader.document(i).get("id"));
|
||||
assertEquals(i, dvByte.nextDoc());
|
||||
assertEquals(id, dvByte.longValue());
|
||||
|
@ -1278,9 +1295,51 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
|||
34,
|
||||
searcher.getIndexReader());
|
||||
|
||||
// test vector values and KNN search
|
||||
if (minIndexMajorVersion >= KNN_VECTOR_MIN_SUPPORTED_VERSION) {
|
||||
// test vector values
|
||||
int cnt = 0;
|
||||
for (LeafReaderContext ctx : reader.leaves()) {
|
||||
VectorValues values = ctx.reader().getVectorValues(KNN_VECTOR_FIELD);
|
||||
if (values != null) {
|
||||
assertEquals(KNN_VECTOR_FIELD_TYPE.vectorDimension(), values.dimension());
|
||||
for (int doc = values.nextDoc(); doc != NO_MORE_DOCS; doc = values.nextDoc()) {
|
||||
float[] expectedVector = {KNN_VECTOR[0], KNN_VECTOR[1], KNN_VECTOR[2] + 0.1f * cnt};
|
||||
assertArrayEquals(
|
||||
"vectors do not match for doc=" + cnt, expectedVector, values.vectorValue(), 0);
|
||||
cnt++;
|
||||
}
|
||||
}
|
||||
}
|
||||
assertEquals(DOCS_COUNT, cnt);
|
||||
|
||||
// test KNN search
|
||||
ScoreDoc[] scoreDocs = assertKNNSearch(searcher, KNN_VECTOR, 10, 10, "0");
|
||||
for (int i = 0; i < scoreDocs.length; i++) {
|
||||
int id = Integer.parseInt(reader.document(scoreDocs[i].doc).get("id"));
|
||||
int expectedId = i < DELETED_ID ? i : i + 1;
|
||||
assertEquals(expectedId, id);
|
||||
}
|
||||
}
|
||||
|
||||
reader.close();
|
||||
}
|
||||
|
||||
private static ScoreDoc[] assertKNNSearch(
|
||||
IndexSearcher searcher,
|
||||
float[] queryVector,
|
||||
int k,
|
||||
int expectedHitsCount,
|
||||
String expectedFirstDocId)
|
||||
throws IOException {
|
||||
ScoreDoc[] hits =
|
||||
searcher.search(new KnnVectorQuery(KNN_VECTOR_FIELD, queryVector, k), k).scoreDocs;
|
||||
assertEquals("wrong number of hits", expectedHitsCount, hits.length);
|
||||
Document d = searcher.doc(hits[0].doc);
|
||||
assertEquals("wrong first document", expectedFirstDocId, d.get("id"));
|
||||
return hits;
|
||||
}
|
||||
|
||||
public void changeIndexWithAdds(Random random, Directory dir, Version nameVersion)
|
||||
throws IOException {
|
||||
SegmentInfos infos = SegmentInfos.readLatestCommit(dir);
|
||||
|
@ -1296,7 +1355,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
|||
.setMergePolicy(newLogMergePolicy()));
|
||||
// add 10 docs
|
||||
for (int i = 0; i < 10; i++) {
|
||||
addDoc(writer, 35 + i);
|
||||
addDoc(writer, DOCS_COUNT + i);
|
||||
}
|
||||
|
||||
// make sure writer sees right total -- writer seems not to know about deletes in .del?
|
||||
|
@ -1304,13 +1363,25 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
|||
assertEquals("wrong doc count", expected, writer.getDocStats().numDocs);
|
||||
writer.close();
|
||||
|
||||
// make sure searching sees right # hits
|
||||
// make sure searching sees right # hits for term search
|
||||
IndexReader reader = DirectoryReader.open(dir);
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
ScoreDoc[] hits = searcher.search(new TermQuery(new Term("content", "aaa")), 1000).scoreDocs;
|
||||
Document d = searcher.getIndexReader().document(hits[0].doc);
|
||||
assertEquals("wrong first document", "0", d.get("id"));
|
||||
doTestHits(hits, 44, searcher.getIndexReader());
|
||||
|
||||
if (nameVersion.major >= KNN_VECTOR_MIN_SUPPORTED_VERSION) {
|
||||
// make sure KNN search sees all hits (graph may not be used if k is big)
|
||||
assertKNNSearch(searcher, KNN_VECTOR, 1000, 44, "0");
|
||||
// make sure KNN search using HNSW graph sees newly added docs
|
||||
assertKNNSearch(
|
||||
searcher,
|
||||
new float[] {KNN_VECTOR[0], KNN_VECTOR[1], KNN_VECTOR[2] + 0.1f * 44},
|
||||
10,
|
||||
10,
|
||||
"44");
|
||||
}
|
||||
reader.close();
|
||||
|
||||
// fully merge
|
||||
|
@ -1325,22 +1396,43 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
|||
|
||||
reader = DirectoryReader.open(dir);
|
||||
searcher = newSearcher(reader);
|
||||
// make sure searching sees right # hits for term search
|
||||
hits = searcher.search(new TermQuery(new Term("content", "aaa")), 1000).scoreDocs;
|
||||
assertEquals("wrong number of hits", 44, hits.length);
|
||||
d = searcher.doc(hits[0].doc);
|
||||
doTestHits(hits, 44, searcher.getIndexReader());
|
||||
assertEquals("wrong first document", "0", d.get("id"));
|
||||
|
||||
if (nameVersion.major >= KNN_VECTOR_MIN_SUPPORTED_VERSION) {
|
||||
// make sure KNN search sees all hits
|
||||
assertKNNSearch(searcher, KNN_VECTOR, 1000, 44, "0");
|
||||
// make sure KNN search using HNSW graph sees newly added docs
|
||||
assertKNNSearch(
|
||||
searcher,
|
||||
new float[] {KNN_VECTOR[0], KNN_VECTOR[1], KNN_VECTOR[2] + 0.1f * 44},
|
||||
10,
|
||||
10,
|
||||
"44");
|
||||
}
|
||||
reader.close();
|
||||
}
|
||||
|
||||
public void changeIndexNoAdds(Random random, Directory dir) throws IOException {
|
||||
// make sure searching sees right # hits
|
||||
public void changeIndexNoAdds(Random random, Directory dir, Version nameVersion)
|
||||
throws IOException {
|
||||
// make sure searching sees right # hits for term search
|
||||
DirectoryReader reader = DirectoryReader.open(dir);
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
ScoreDoc[] hits = searcher.search(new TermQuery(new Term("content", "aaa")), 1000).scoreDocs;
|
||||
assertEquals("wrong number of hits", 34, hits.length);
|
||||
Document d = searcher.doc(hits[0].doc);
|
||||
assertEquals("wrong first document", "0", d.get("id"));
|
||||
|
||||
if (nameVersion.major >= KNN_VECTOR_MIN_SUPPORTED_VERSION) {
|
||||
// make sure KNN search sees all hits
|
||||
assertKNNSearch(searcher, KNN_VECTOR, 1000, 34, "0");
|
||||
// make sure KNN search using HNSW graph retrieves correct results
|
||||
assertKNNSearch(searcher, KNN_VECTOR, 10, 10, "0");
|
||||
}
|
||||
reader.close();
|
||||
|
||||
// fully merge
|
||||
|
@ -1352,9 +1444,17 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
|||
|
||||
reader = DirectoryReader.open(dir);
|
||||
searcher = newSearcher(reader);
|
||||
// make sure searching sees right # hits for term search
|
||||
hits = searcher.search(new TermQuery(new Term("content", "aaa")), 1000).scoreDocs;
|
||||
assertEquals("wrong number of hits", 34, hits.length);
|
||||
doTestHits(hits, 34, searcher.getIndexReader());
|
||||
// make sure searching sees right # hits for KNN search
|
||||
if (nameVersion.major >= KNN_VECTOR_MIN_SUPPORTED_VERSION) {
|
||||
// make sure KNN search sees all hits
|
||||
assertKNNSearch(searcher, KNN_VECTOR, 1000, 34, "0");
|
||||
// make sure KNN search using HNSW graph retrieves correct results
|
||||
assertKNNSearch(searcher, KNN_VECTOR, 10, 10, "0");
|
||||
}
|
||||
reader.close();
|
||||
}
|
||||
|
||||
|
@ -1372,10 +1472,10 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
|||
.setMergePolicy(NoMergePolicy.INSTANCE);
|
||||
IndexWriter writer = new IndexWriter(dir, conf);
|
||||
|
||||
for (int i = 0; i < 35; i++) {
|
||||
for (int i = 0; i < DOCS_COUNT; i++) {
|
||||
addDoc(writer, i);
|
||||
}
|
||||
assertEquals("wrong doc count", 35, writer.getDocStats().maxDoc);
|
||||
assertEquals("wrong doc count", DOCS_COUNT, writer.getDocStats().maxDoc);
|
||||
if (fullyMerged) {
|
||||
writer.forceMerge(1);
|
||||
}
|
||||
|
@ -1399,7 +1499,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
|||
.setMaxBufferedDocs(10)
|
||||
.setMergePolicy(NoMergePolicy.INSTANCE);
|
||||
writer = new IndexWriter(dir, conf);
|
||||
Term searchTerm = new Term("id", "7");
|
||||
Term searchTerm = new Term("id", String.valueOf(DELETED_ID));
|
||||
writer.deleteDocuments(searchTerm);
|
||||
writer.close();
|
||||
}
|
||||
|
@ -1469,6 +1569,10 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
|||
customType4.setStoreTermVectorOffsets(true);
|
||||
customType4.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
|
||||
doc.add(new Field("content6", "here is more content with aaa aaa aaa", customType4));
|
||||
|
||||
float[] vector = {KNN_VECTOR[0], KNN_VECTOR[1], KNN_VECTOR[2] + 0.1f * id};
|
||||
doc.add(new KnnVectorField(KNN_VECTOR_FIELD, vector, KNN_VECTOR_FIELD_TYPE));
|
||||
|
||||
// TODO:
|
||||
// index different norms types via similarity (we use a random one currently?!)
|
||||
// remove any analyzer randomness, explicitly add payloads for certain fields.
|
||||
|
@ -1514,7 +1618,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
|||
|
||||
// should be found exactly
|
||||
assertEquals(TermsEnum.SeekStatus.FOUND, terms.seekCeil(aaaTerm));
|
||||
assertEquals(35, countDocs(TestUtil.docs(random(), terms, null, PostingsEnum.NONE)));
|
||||
assertEquals(DOCS_COUNT, countDocs(TestUtil.docs(random(), terms, null, PostingsEnum.NONE)));
|
||||
assertNull(terms.next());
|
||||
|
||||
// should hit end of field
|
||||
|
@ -1524,11 +1628,11 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
|||
// should seek to aaa
|
||||
assertEquals(TermsEnum.SeekStatus.NOT_FOUND, terms.seekCeil(new BytesRef("a")));
|
||||
assertTrue(terms.term().bytesEquals(aaaTerm));
|
||||
assertEquals(35, countDocs(TestUtil.docs(random(), terms, null, PostingsEnum.NONE)));
|
||||
assertEquals(DOCS_COUNT, countDocs(TestUtil.docs(random(), terms, null, PostingsEnum.NONE)));
|
||||
assertNull(terms.next());
|
||||
|
||||
assertEquals(TermsEnum.SeekStatus.FOUND, terms.seekCeil(aaaTerm));
|
||||
assertEquals(35, countDocs(TestUtil.docs(random(), terms, null, PostingsEnum.NONE)));
|
||||
assertEquals(DOCS_COUNT, countDocs(TestUtil.docs(random(), terms, null, PostingsEnum.NONE)));
|
||||
assertNull(terms.next());
|
||||
|
||||
r.close();
|
||||
|
|
Loading…
Reference in New Issue