mirror of https://github.com/apache/lucene.git

Refactor and javadoc update for KNN vector writer classes (#13548)

parent 3304b60c9c
commit ceb4539609
@@ -258,7 +258,8 @@ New Features

 Improvements
 ---------------------
-(No changes)
+
+* GITHUB#13548: Refactor and javadoc update for KNN vector writer classes. (Patrick Zhai)

 Optimizations
 ---------------------
@@ -20,14 +20,18 @@ package org.apache.lucene.codecs;
 import java.io.Closeable;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
+import java.util.Objects;
 import org.apache.lucene.index.ByteVectorValues;
 import org.apache.lucene.index.DocIDMerger;
+import org.apache.lucene.index.DocsWithFieldSet;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FloatVectorValues;
 import org.apache.lucene.index.MergeState;
 import org.apache.lucene.index.Sorter;
 import org.apache.lucene.index.VectorEncoding;
+import org.apache.lucene.internal.hppc.IntIntHashMap;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.VectorScorer;
 import org.apache.lucene.util.Accountable;
@@ -139,6 +143,60 @@ public abstract class KnnVectorsWriter implements Accountable, Closeable {
     }
   }

+  /**
+   * Given old doc ids and an id mapping, maps old ordinals to new ordinals. Note: this method
+   * returns nothing; the output is written to the parameters.
+   *
+   * @param oldDocIds the old or current document ordinals. Must not be null.
+   * @param sortMap the document sorting map for how to make the new ordinals. Must not be null.
+   * @param old2NewOrd int[] maps from old ord to new ord
+   * @param new2OldOrd int[] maps from new ord to old ord
+   * @param newDocsWithField set of new doc ids which have the value
+   */
+  public static void mapOldOrdToNewOrd(
+      DocsWithFieldSet oldDocIds,
+      Sorter.DocMap sortMap,
+      int[] old2NewOrd,
+      int[] new2OldOrd,
+      DocsWithFieldSet newDocsWithField)
+      throws IOException {
+    // TODO: a similar function exists in IncrementalHnswGraphMerger#getNewOrdMapping
+    // maybe we can do a further refactoring
+    Objects.requireNonNull(oldDocIds);
+    Objects.requireNonNull(sortMap);
+    assert (old2NewOrd != null || new2OldOrd != null || newDocsWithField != null);
+    assert (old2NewOrd == null || old2NewOrd.length == oldDocIds.cardinality());
+    assert (new2OldOrd == null || new2OldOrd.length == oldDocIds.cardinality());
+    IntIntHashMap newIdToOldOrd = new IntIntHashMap();
+    DocIdSetIterator iterator = oldDocIds.iterator();
+    int[] newDocIds = new int[oldDocIds.cardinality()];
+    int oldOrd = 0;
+    for (int oldDocId = iterator.nextDoc();
+        oldDocId != DocIdSetIterator.NO_MORE_DOCS;
+        oldDocId = iterator.nextDoc()) {
+      int newId = sortMap.oldToNew(oldDocId);
+      newIdToOldOrd.put(newId, oldOrd);
+      newDocIds[oldOrd] = newId;
+      oldOrd++;
+    }
+
+    Arrays.sort(newDocIds);
+    int newOrd = 0;
+    for (int newDocId : newDocIds) {
+      int currOldOrd = newIdToOldOrd.get(newDocId);
+      if (old2NewOrd != null) {
+        old2NewOrd[currOldOrd] = newOrd;
+      }
+      if (new2OldOrd != null) {
+        new2OldOrd[newOrd] = currOldOrd;
+      }
+      if (newDocsWithField != null) {
+        newDocsWithField.add(newDocId);
+      }
+      newOrd++;
+    }
+  }
+
   /** View over multiple vector values supporting iterator-style access via DocIdMerger. */
   public static final class MergedVectorValues {
     private MergedVectorValues() {}
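The new helper's contract is easiest to see on a toy input. Below is a minimal, dependency-free sketch of the same two-pass algorithm; plain arrays and java.util.HashMap stand in for Lucene's Sorter.DocMap, DocsWithFieldSet, and IntIntHashMap, and every name in it is illustrative rather than taken from the patch.

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

public class OrdRemapSketch {
  public static void main(String[] args) {
    // Docs 0, 3, 5 carry vectors; vector ordinal = position in this array.
    int[] oldDocIds = {0, 3, 5};
    // Toy sort map over a 10-doc segment: oldToNew[oldDocId] = newDocId.
    int[] oldToNew = {4, 9, 1, 2, 7, 0, 3, 5, 6, 8};

    // Pass 1: record each old ordinal's new doc id (mirrors the IntIntHashMap pass).
    Map<Integer, Integer> newIdToOldOrd = new HashMap<>();
    int[] newDocIds = new int[oldDocIds.length];
    for (int oldOrd = 0; oldOrd < oldDocIds.length; oldOrd++) {
      int newId = oldToNew[oldDocIds[oldOrd]];
      newIdToOldOrd.put(newId, oldOrd);
      newDocIds[oldOrd] = newId;
    }

    // Pass 2: ordinals follow doc id order, so sorting the new doc ids
    // enumerates vectors in new ordinal order.
    Arrays.sort(newDocIds);
    int[] old2NewOrd = new int[oldDocIds.length];
    int[] new2OldOrd = new int[oldDocIds.length];
    for (int newOrd = 0; newOrd < newDocIds.length; newOrd++) {
      int oldOrd = newIdToOldOrd.get(newDocIds[newOrd]);
      old2NewOrd[oldOrd] = newOrd;
      new2OldOrd[newOrd] = oldOrd;
    }

    // Docs 0, 3, 5 land on new ids 4, 2, 0: the sort order reverses.
    System.out.println(Arrays.toString(old2NewOrd)); // [2, 1, 0]
    System.out.println(Arrays.toString(new2OldOrd)); // [2, 1, 0]
  }
}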
@@ -40,7 +40,7 @@ public class OrdToDocDISIReaderConfiguration {
  * <p>Within outputMeta the format is as follows:
  *
  * <ul>
- *   <li><b>[int8]</b> if equals to -2, empty - no vectory values. If equals to -1, dense – all
+ *   <li><b>[int8]</b> if equals to -2, empty - no vector values. If equals to -1, dense – all
  *       documents have values for a field. If equals to 0, sparse – some documents missing
  *       values.
  *   <li>DocIds were encoded by {@link IndexedDISI#writeBitSet(DocIdSetIterator, IndexOutput,
@@ -56,8 +56,8 @@ import org.apache.lucene.store.IndexOutput;
  *   <li><b>[vlong]</b> length of this field's vectors, in bytes
  *   <li><b>[vint]</b> dimension of this field's vectors
  *   <li><b>[int]</b> the number of documents having values for this field
- *   <li><b>[int8]</b> if equals to -1, dense – all documents have values for a field. If equals to
- *       0, sparse – some documents missing values.
+ *   <li><b>[int8]</b> if equals to -2, empty - no vector values. If equals to -1, dense – all
+ *       documents have values for a field. If equals to 0, sparse – some documents missing values.
  *   <li>DocIds were encoded by {@link IndexedDISI#writeBitSet(DocIdSetIterator, IndexOutput, byte)}
  *   <li>OrdToDoc was encoded by {@link org.apache.lucene.util.packed.DirectMonotonicWriter}, note
  *       that only in sparse case
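Both this hunk and the previous one document the same three-valued int8 flag. As a reading aid, here is a hedged sketch of the convention as described — an illustrative helper, not code from the patch:

// Illustrative only: the int8 flag documented above, derived from value counts.
static byte denseSparseFlag(int docsWithValue, int maxDoc) {
  if (docsWithValue == 0) {
    return -2; // empty: no vector values for this field
  } else if (docsWithValue == maxDoc) {
    return -1; // dense: every document has a value; no doc id set needed
  } else {
    return 0; // sparse: doc ids must be encoded explicitly (IndexedDISI bit set)
  }
}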
@@ -44,7 +44,6 @@ import org.apache.lucene.index.MergeState;
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.index.Sorter;
 import org.apache.lucene.index.VectorEncoding;
-import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
@@ -191,27 +190,10 @@ public final class Lucene99FlatVectorsWriter extends FlatVectorsWriter {

   private void writeSortingField(FieldWriter<?> fieldData, int maxDoc, Sorter.DocMap sortMap)
       throws IOException {
-    final int[] docIdOffsets = new int[sortMap.size()];
-    int offset = 1; // 0 means no vector for this (field, document)
-    DocIdSetIterator iterator = fieldData.docsWithField.iterator();
-    for (int docID = iterator.nextDoc();
-        docID != DocIdSetIterator.NO_MORE_DOCS;
-        docID = iterator.nextDoc()) {
-      int newDocID = sortMap.oldToNew(docID);
-      docIdOffsets[newDocID] = offset++;
-    }
+    final int[] ordMap = new int[fieldData.docsWithField.cardinality()]; // new ord to old ord
+
     DocsWithFieldSet newDocsWithField = new DocsWithFieldSet();
-    final int[] ordMap = new int[offset - 1]; // new ord to old ord
-    int ord = 0;
-    int doc = 0;
-    for (int docIdOffset : docIdOffsets) {
-      if (docIdOffset != 0) {
-        ordMap[ord] = docIdOffset - 1;
-        newDocsWithField.add(doc);
-        ord++;
-      }
-      doc++;
-    }
+    mapOldOrdToNewOrd(fieldData.docsWithField, sortMap, null, ordMap, newDocsWithField);

     // write vector values
     long vectorDataOffset =
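Design note: the hunk above is the first of three writers (flat, HNSW, scalar-quantized) whose hand-rolled doc-id-offset loops collapse into a call to the shared mapOldOrdToNewOrd helper. Each call site passes null for the outputs it does not need: this flat writer and the scalar-quantized writer request only the new-to-old ordMap plus the remapped doc id set, while the HNSW writer (below) requests both ordinal maps and no doc id set.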
@@ -24,14 +24,11 @@ import org.apache.lucene.codecs.KnnVectorsReader;
 import org.apache.lucene.codecs.KnnVectorsWriter;
 import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil;
 import org.apache.lucene.codecs.hnsw.FlatVectorsFormat;
-import org.apache.lucene.codecs.lucene90.IndexedDISI;
 import org.apache.lucene.index.MergePolicy;
 import org.apache.lucene.index.MergeScheduler;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
-import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.TaskExecutor;
-import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.hnsw.HnswGraph;

 /**
@@ -69,11 +66,6 @@ import org.apache.lucene.util.hnsw.HnswGraph;
  *   <li><b>[vlong]</b> length of this field's index data, in bytes
  *   <li><b>[vint]</b> dimension of this field's vectors
  *   <li><b>[int]</b> the number of documents having values for this field
- *   <li><b>[int8]</b> if equals to -1, dense – all documents have values for a field. If equals to
- *       0, sparse – some documents missing values.
- *   <li>DocIds were encoded by {@link IndexedDISI#writeBitSet(DocIdSetIterator, IndexOutput, byte)}
- *   <li>OrdToDoc was encoded by {@link org.apache.lucene.util.packed.DirectMonotonicWriter}, note
- *       that only in sparse case
  *   <li><b>[vint]</b> the maximum number of connections (neighbours) that each node can have
  *   <li><b>[vint]</b> number of levels in the graph
  *   <li>Graph nodes by level. For each level
@@ -196,29 +196,10 @@ public final class Lucene99HnswVectorsWriter extends KnnVectorsWriter {

   private void writeSortingField(FieldWriter<?> fieldData, Sorter.DocMap sortMap)
       throws IOException {
-    final int[] docIdOffsets = new int[sortMap.size()];
-    int offset = 1; // 0 means no vector for this (field, document)
-    DocIdSetIterator iterator = fieldData.docsWithField.iterator();
-    for (int docID = iterator.nextDoc();
-        docID != DocIdSetIterator.NO_MORE_DOCS;
-        docID = iterator.nextDoc()) {
-      int newDocID = sortMap.oldToNew(docID);
-      docIdOffsets[newDocID] = offset++;
-    }
-    DocsWithFieldSet newDocsWithField = new DocsWithFieldSet();
-    final int[] ordMap = new int[offset - 1]; // new ord to old ord
-    final int[] oldOrdMap = new int[offset - 1]; // old ord to new ord
-    int ord = 0;
-    int doc = 0;
-    for (int docIdOffset : docIdOffsets) {
-      if (docIdOffset != 0) {
-        ordMap[ord] = docIdOffset - 1;
-        oldOrdMap[docIdOffset - 1] = ord;
-        newDocsWithField.add(doc);
-        ord++;
-      }
-      doc++;
-    }
+    final int[] ordMap = new int[fieldData.docsWithField.cardinality()]; // new ord to old ord
+    final int[] oldOrdMap = new int[fieldData.docsWithField.cardinality()]; // old ord to new ord
+
+    mapOldOrdToNewOrd(fieldData.docsWithField, sortMap, oldOrdMap, ordMap, null);
     // write graph
     long vectorIndexOffset = vectorIndex.getFilePointer();
     OnHeapHnswGraph graph = fieldData.getGraph();
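The HNSW writer is the one call site that needs the mapping in both directions: ordMap (new ord to old ord) to emit data in sorted order, and oldOrdMap (old ord to new ord), which the graph-writing code below the hunk presumably uses to renumber neighbor lists that still hold old ordinals. newDocsWithField is null here since this method only writes the graph; the remapped doc id set is produced where the flat vector data is written.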
@@ -399,27 +399,10 @@ public final class Lucene99ScalarQuantizedVectorsWriter extends FlatVectorsWrite

   private void writeSortingField(FieldWriter fieldData, int maxDoc, Sorter.DocMap sortMap)
       throws IOException {
-    final int[] docIdOffsets = new int[sortMap.size()];
-    int offset = 1; // 0 means no vector for this (field, document)
-    DocIdSetIterator iterator = fieldData.docsWithField.iterator();
-    for (int docID = iterator.nextDoc();
-        docID != DocIdSetIterator.NO_MORE_DOCS;
-        docID = iterator.nextDoc()) {
-      int newDocID = sortMap.oldToNew(docID);
-      docIdOffsets[newDocID] = offset++;
-    }
+    final int[] ordMap = new int[fieldData.docsWithField.cardinality()]; // new ord to old ord
+
     DocsWithFieldSet newDocsWithField = new DocsWithFieldSet();
-    final int[] ordMap = new int[offset - 1]; // new ord to old ord
-    int ord = 0;
-    int doc = 0;
-    for (int docIdOffset : docIdOffsets) {
-      if (docIdOffset != 0) {
-        ordMap[ord] = docIdOffset - 1;
-        newDocsWithField.add(doc);
-        ord++;
-      }
-      doc++;
-    }
+    mapOldOrdToNewOrd(fieldData.docsWithField, sortMap, null, ordMap, newDocsWithField);

     // write vector values
     long vectorDataOffset = quantizedVectorData.alignFilePointer(Float.BYTES);
@@ -356,7 +356,7 @@ final class FieldUpdatesBuffer {
     }
   }

-  BytesRef nextTerm() throws IOException {
+  private BytesRef nextTerm() throws IOException {
     if (lookAheadTermIterator != null) {
       if (bufferedUpdate.termValue == null) {
         lookAheadTermIterator.next();