LUCENE-10408: Write doc IDs of KNN vectors as ints rather than vints. (#708)

Since doc IDs with a vector are loaded as an int[] in memory, this changes the
on-disk format of vectors to align with the in-memory representation by using
ints instead of vints to represent doc IDs. This might make vectors a bit
larger on disk, but also a bit faster to open.

I made the same change to how we encode nodes on levels for the same reason.
This commit is contained in:
Adrien Grand 2022-02-24 13:36:10 +01:00 committed by GitHub
parent 550d1305db
commit 44d7d962ae
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 6 additions and 6 deletions

View File

@ -69,13 +69,13 @@ import org.apache.lucene.util.hnsw.HnswGraph;
* <li><b>[int]</b> the number of documents having values for this field * <li><b>[int]</b> the number of documents having values for this field
* <li><b>[int8]</b> if equal to -1, dense: all documents have values for the field. If equal to * <li><b>[int8]</b> if equal to -1, dense: all documents have values for the field. If equal to
* 0, sparse: some documents are missing values. * 0, sparse: some documents are missing values.
* <li><b>array[vint]</b> for sparse case, the docids of documents having vectors, in order * <li><b>array[int]</b> for sparse case, the docids of documents having vectors, in order
* <li><b>[int]</b> the maximum number of connections (neighbours) that each node can have * <li><b>[int]</b> the maximum number of connections (neighbours) that each node can have
* <li><b>[int]</b> number of levels in the graph * <li><b>[int]</b> number of levels in the graph
* <li>Graph nodes by level. For each level * <li>Graph nodes by level. For each level
* <ul> * <ul>
* <li><b>[int]</b> the number of nodes on this level * <li><b>[int]</b> the number of nodes on this level
* <li><b>array[vint]</b> for levels greater than 0 list of nodes on this level, stored as * <li><b>array[int]</b> for levels greater than 0 list of nodes on this level, stored as
* the level 0th nodes ordinals. * the level 0th nodes ordinals.
* </ul> * </ul>
* </ul> * </ul>

View File

@ -347,7 +347,7 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
// as not all docs have vector values, fill a mapping from dense vector ordinals to docIds // as not all docs have vector values, fill a mapping from dense vector ordinals to docIds
ordToDoc = new int[size]; ordToDoc = new int[size];
for (int i = 0; i < size; i++) { for (int i = 0; i < size; i++) {
int doc = input.readVInt(); int doc = input.readInt();
ordToDoc[i] = doc; ordToDoc[i] = doc;
} }
} }
@ -366,7 +366,7 @@ public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
} else { } else {
nodesByLevel[level] = new int[numNodesOnLevel]; nodesByLevel[level] = new int[numNodesOnLevel];
for (int i = 0; i < numNodesOnLevel; i++) { for (int i = 0; i < numNodesOnLevel; i++) {
nodesByLevel[level][i] = input.readVInt(); nodesByLevel[level][i] = input.readInt();
} }
} }
} }

View File

@ -213,7 +213,7 @@ public final class Lucene91HnswVectorsWriter extends KnnVectorsWriter {
meta.writeByte((byte) 0); // sparse marker, some documents don't have vector values meta.writeByte((byte) 0); // sparse marker, some documents don't have vector values
DocIdSetIterator iter = docsWithField.iterator(); DocIdSetIterator iter = docsWithField.iterator();
for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) { for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) {
meta.writeVInt(doc); meta.writeInt(doc);
} }
} }
@ -229,7 +229,7 @@ public final class Lucene91HnswVectorsWriter extends KnnVectorsWriter {
if (level > 0) { if (level > 0) {
while (nodesOnLevel.hasNext()) { while (nodesOnLevel.hasNext()) {
int node = nodesOnLevel.nextInt(); int node = nodesOnLevel.nextInt();
meta.writeVInt(node); // list of nodes on a level meta.writeInt(node); // list of nodes on a level
} }
} }
} }