diff --git a/lucene/backward-codecs/src/java/module-info.java b/lucene/backward-codecs/src/java/module-info.java index 5dfe4efb4ab..ae4bd84fa62 100644 --- a/lucene/backward-codecs/src/java/module-info.java +++ b/lucene/backward-codecs/src/java/module-info.java @@ -29,6 +29,7 @@ module org.apache.lucene.backward_codecs { exports org.apache.lucene.backward_codecs.lucene84; exports org.apache.lucene.backward_codecs.lucene86; exports org.apache.lucene.backward_codecs.lucene87; + exports org.apache.lucene.backward_codecs.lucene90; exports org.apache.lucene.backward_codecs.packed; exports org.apache.lucene.backward_codecs.store; @@ -37,9 +38,12 @@ module org.apache.lucene.backward_codecs { provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat, org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat; + provides org.apache.lucene.codecs.KnnVectorsFormat with + org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat; provides org.apache.lucene.codecs.Codec with org.apache.lucene.backward_codecs.lucene80.Lucene80Codec, org.apache.lucene.backward_codecs.lucene84.Lucene84Codec, org.apache.lucene.backward_codecs.lucene86.Lucene86Codec, - org.apache.lucene.backward_codecs.lucene87.Lucene87Codec; + org.apache.lucene.backward_codecs.lucene87.Lucene87Codec, + org.apache.lucene.backward_codecs.lucene90.Lucene90Codec; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90Codec.java similarity index 90% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90Codec.java index 97b8c1b7f34..40ba06606bd 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java +++ 
b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90Codec.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene90; +package org.apache.lucene.backward_codecs.lucene90; import java.util.Objects; import org.apache.lucene.codecs.Codec; @@ -30,6 +30,16 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat; +import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; +import org.apache.lucene.codecs.lucene90.Lucene90FieldInfosFormat; +import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat; +import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraph.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraph.java new file mode 100644 index 00000000000..d8d28eca16b --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraph.java @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.backward_codecs.lucene90; + +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.SplittableRandom; +import org.apache.lucene.index.KnnGraphValues; +import org.apache.lucene.index.RandomAccessVectorValues; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.SparseFixedBitSet; +import org.apache.lucene.util.hnsw.BoundsChecker; +import org.apache.lucene.util.hnsw.NeighborArray; +import org.apache.lucene.util.hnsw.NeighborQueue; + +/** + * Navigable Small-world graph. Provides efficient approximate nearest neighbor search for high + * dimensional vectors. See Approximate nearest + * neighbor algorithm based on navigable small world graphs [2014] and this paper [2018] for details. + * + *
The nomenclature is a bit different here from what's used in those papers: + * + *
numSeed
is the equivalent of m
in the 2014 paper; it controls the
+ * number of random entry points to sample.
+ * beamWidth
in {@link Lucene90HnswGraphBuilder} has the same meaning as
+ * efConst
in the 2018 paper. It is the number of nearest neighbor candidates to track
+ * while searching the graph for each newly inserted node.
+ * maxConn
has the same meaning as M
in the later paper; it controls
+ * how many of the efConst
neighbors are connected to the new node
+ * Note: The graph may be searched by multiple threads concurrently, but updates are not + * thread-safe. Also note: there is no notion of deletions. Document searching built on top of this + * must do its own deletion-filtering. + * + *
Graph building logic is preserved here only for tests.
+ */
+public final class Lucene90HnswGraph extends KnnGraphValues {
+
+ private final int maxConn;
+
+ // Each entry lists the top maxConn neighbors of a node. The nodes correspond to vectors added to
+ // HnswBuilder, and the
+ // node values are the ordinals of those vectors.
+ private final List This class is preserved here only for tests.
+ */
+public final class Lucene90HnswGraphBuilder {
+
+ /** Default random seed for level generation */
+ private static final long DEFAULT_RAND_SEED = System.currentTimeMillis();
+ /** A name for the HNSW component for the info-stream */
+ public static final String HNSW_COMPONENT = "HNSW";
+
+ /** Random seed for level generation; public to expose for testing */
+ public static long randSeed = DEFAULT_RAND_SEED;
+
+ private final int maxConn;
+ private final int beamWidth;
+ private final NeighborArray scratch;
+
+ private final VectorSimilarityFunction similarityFunction;
+ private final RandomAccessVectorValues vectorValues;
+ private final SplittableRandom random;
+ private final BoundsChecker bound;
+ final Lucene90HnswGraph hnsw;
+
+ private InfoStream infoStream = InfoStream.getDefault();
+
+ // we need two sources of vectors in order to perform diversity check comparisons without
+ // colliding
+ private RandomAccessVectorValues buildVectors;
+
+ /**
+ * Reads all the vectors from a VectorValues, builds a graph connecting them by their dense
+ * ordinals, using the given hyperparameter settings, and returns the resulting graph.
+ *
+ * @param vectors the vectors whose relations are represented by the graph - must provide a
+ * different view over those vectors than the one used to add via addGraphNode.
+ * @param maxConn the number of connections to make when adding a new graph node; roughly speaking
+ * the graph fanout.
+ * @param beamWidth the size of the beam search to use when finding nearest neighbors.
+ * @param seed the seed for a random number generator used during graph construction. Provide this
+ * to ensure repeatable construction.
+ */
+ public Lucene90HnswGraphBuilder(
+ RandomAccessVectorValuesProducer vectors,
+ VectorSimilarityFunction similarityFunction,
+ int maxConn,
+ int beamWidth,
+ long seed) {
+ vectorValues = vectors.randomAccess();
+ buildVectors = vectors.randomAccess();
+ this.similarityFunction = Objects.requireNonNull(similarityFunction);
+ if (maxConn <= 0) {
+ throw new IllegalArgumentException("maxConn must be positive");
+ }
+ if (beamWidth <= 0) {
+ throw new IllegalArgumentException("beamWidth must be positive");
+ }
+ this.maxConn = maxConn;
+ this.beamWidth = beamWidth;
+ this.hnsw = new Lucene90HnswGraph(maxConn);
+ bound = BoundsChecker.create(similarityFunction.reversed);
+ random = new SplittableRandom(seed);
+ scratch = new NeighborArray(Math.max(beamWidth, maxConn + 1));
+ }
+
+ /**
+ * Reads all the vectors from two copies of a random access VectorValues. Providing two copies
+ * enables efficient retrieval without extra data copying, while avoiding collision of the
+ * returned values.
+ *
+ * @param vectors the vectors for which to build a nearest neighbors graph. Must be an independent
+ * accessor for the vectors
+ */
+ public Lucene90HnswGraph build(RandomAccessVectorValues vectors) throws IOException {
+ if (vectors == vectorValues) {
+ throw new IllegalArgumentException(
+ "Vectors to build must be independent of the source of vectors provided to HnswGraphBuilder()");
+ }
+ if (infoStream.isEnabled(HNSW_COMPONENT)) {
+ infoStream.message(HNSW_COMPONENT, "build graph from " + vectors.size() + " vectors");
+ }
+ long start = System.nanoTime(), t = start;
+ // start at node 1! node 0 is added implicitly, in the constructor
+ for (int node = 1; node < vectors.size(); node++) {
+ addGraphNode(vectors.vectorValue(node));
+ if (node % 10000 == 0) {
+ if (infoStream.isEnabled(HNSW_COMPONENT)) {
+ long now = System.nanoTime();
+ infoStream.message(
+ HNSW_COMPONENT,
+ String.format(
+ Locale.ROOT,
+ "built %d in %d/%d ms",
+ node,
+ ((now - t) / 1_000_000),
+ ((now - start) / 1_000_000)));
+ t = now;
+ }
+ }
+ }
+ return hnsw;
+ }
+
+ /** Set info-stream to output debugging information */
+ public void setInfoStream(InfoStream infoStream) {
+ this.infoStream = infoStream;
+ }
+
+ /** Inserts a doc with vector value to the graph */
+ void addGraphNode(float[] value) throws IOException {
+ // We pass 'null' for acceptOrds because there are no deletions while building the graph
+ NeighborQueue candidates =
+ Lucene90HnswGraph.search(
+ value, beamWidth, beamWidth, vectorValues, similarityFunction, hnsw, null, random);
+
+ int node = hnsw.addNode();
+
+ /* connect neighbors to the new node, using a diversity heuristic that chooses successive
+ * nearest neighbors that are closer to the new node than they are to the previously-selected
+ * neighbors
+ */
+ addDiverseNeighbors(node, candidates);
+ }
+
+ /* TODO: we are not maintaining nodes in strict score order; the forward links
+ * are added in sorted order, but the reverse implicit ones are not. Diversity heuristic should
+ * work better if we keep the neighbor arrays sorted. Possibly we should switch back to a heap?
+ * But first we should just see if sorting makes a significant difference.
+ */
+ private void addDiverseNeighbors(int node, NeighborQueue candidates) throws IOException {
+ /* For each of the beamWidth nearest candidates (going from best to worst), select it only if it
+ * is closer to target than it is to any of the already-selected neighbors (ie selected in this method,
+ * since the node is new and has no prior neighbors).
+ */
+ NeighborArray neighbors = hnsw.getNeighbors(node);
+ assert neighbors.size() == 0; // new node
+ popToScratch(candidates);
+ selectDiverse(neighbors, scratch);
+
+ // Link the selected nodes to the new node, and the new node to the selected nodes (again
+ // applying diversity heuristic)
+ int size = neighbors.size();
+ for (int i = 0; i < size; i++) {
+ int nbr = neighbors.node()[i];
+ NeighborArray nbrNbr = hnsw.getNeighbors(nbr);
+ nbrNbr.add(node, neighbors.score()[i]);
+ if (nbrNbr.size() > maxConn) {
+ diversityUpdate(nbrNbr);
+ }
+ }
+ }
+
+ private void selectDiverse(NeighborArray neighbors, NeighborArray candidates) throws IOException {
+ // Select the best maxConn neighbors of the new node, applying the diversity heuristic
+ for (int i = candidates.size() - 1; neighbors.size() < maxConn && i >= 0; i--) {
+ // compare each neighbor (in distance order) against the closer neighbors selected so far,
+ // only adding it if it is closer to the target than to any of the other selected neighbors
+ int cNode = candidates.node()[i];
+ float cScore = candidates.score()[i];
+ assert cNode < hnsw.size();
+ if (diversityCheck(vectorValues.vectorValue(cNode), cScore, neighbors, buildVectors)) {
+ neighbors.add(cNode, cScore);
+ }
+ }
+ }
+
+ private void popToScratch(NeighborQueue candidates) {
+ scratch.clear();
+ int candidateCount = candidates.size();
+ // extract all the Neighbors from the queue into an array; these will now be
+ // sorted from worst to best
+ for (int i = 0; i < candidateCount; i++) {
+ float score = candidates.topScore();
+ scratch.add(candidates.pop(), score);
+ }
+ }
+
+ /**
+ * @param candidate the vector of a new candidate neighbor of a node n
+ * @param score the score of the new candidate and node n, to be compared with scores of the
+ * candidate and n's neighbors
+ * @param neighbors the neighbors selected so far
+ * @param vectorValues source of values used for making comparisons between candidate and existing
+ * neighbors
+ * @return whether the candidate is diverse given the existing neighbors
+ */
+ private boolean diversityCheck(
+ float[] candidate,
+ float score,
+ NeighborArray neighbors,
+ RandomAccessVectorValues vectorValues)
+ throws IOException {
+ bound.set(score);
+ for (int i = 0; i < neighbors.size(); i++) {
+ float diversityCheck =
+ similarityFunction.compare(candidate, vectorValues.vectorValue(neighbors.node()[i]));
+ if (bound.check(diversityCheck) == false) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ private void diversityUpdate(NeighborArray neighbors) throws IOException {
+ assert neighbors.size() == maxConn + 1;
+ int replacePoint = findNonDiverse(neighbors);
+ if (replacePoint == -1) {
+ // none found; check score against worst existing neighbor
+ bound.set(neighbors.score()[0]);
+ if (bound.check(neighbors.score()[maxConn])) {
+ // drop the new neighbor; it is not competitive and there were no diversity failures
+ neighbors.removeLast();
+ return;
+ } else {
+ replacePoint = 0;
+ }
+ }
+ neighbors.node()[replacePoint] = neighbors.node()[maxConn];
+ neighbors.score()[replacePoint] = neighbors.score()[maxConn];
+ neighbors.removeLast();
+ }
+
+ // scan neighbors looking for diversity violations
+ private int findNonDiverse(NeighborArray neighbors) throws IOException {
+ for (int i = neighbors.size() - 1; i >= 0; i--) {
+ // check each neighbor against its better-scoring neighbors. If it fails diversity check with
+ // them, drop it
+ int nbrNode = neighbors.node()[i];
+ bound.set(neighbors.score()[i]);
+ float[] nbrVector = vectorValues.vectorValue(nbrNode);
+ for (int j = maxConn; j > i; j--) {
+ float diversityCheck =
+ similarityFunction.compare(nbrVector, buildVectors.vectorValue(neighbors.node()[j]));
+ if (bound.check(diversityCheck) == false) {
+ // node j is too similar to node i given its score relative to the base node
+ // replace it with the new node, which is at [maxConn]
+ return i;
+ }
+ }
+ }
+ return -1;
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsFormat.java
similarity index 88%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsFormat.java
rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsFormat.java
index 90875d9e3f5..c622d9cef6f 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsFormat.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsFormat.java
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene90;
+package org.apache.lucene.backward_codecs.lucene90;
import java.io.IOException;
import org.apache.lucene.codecs.KnnVectorsFormat;
@@ -65,7 +65,7 @@ import org.apache.lucene.util.hnsw.HnswGraph;
*
* @lucene.experimental
*/
-public final class Lucene90HnswVectorsFormat extends KnnVectorsFormat {
+public class Lucene90HnswVectorsFormat extends KnnVectorsFormat {
static final String META_CODEC_NAME = "Lucene90HnswVectorsFormatMeta";
static final String VECTOR_DATA_CODEC_NAME = "Lucene90HnswVectorsFormatData";
@@ -77,26 +77,33 @@ public final class Lucene90HnswVectorsFormat extends KnnVectorsFormat {
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
+ /** Default number of maximum connections per node */
public static final int DEFAULT_MAX_CONN = 16;
+ /**
+ * Default size of the queue maintained while searching, and the number of random
+ * entry points to sample during graph construction.
+ */
public static final int DEFAULT_BEAM_WIDTH = 100;
/**
* Controls how many of the nearest neighbor candidates are connected to the new node. Defaults to
* {@link Lucene90HnswVectorsFormat#DEFAULT_MAX_CONN}. See {@link HnswGraph} for more details.
*/
- private final int maxConn;
+ final int maxConn;
/**
* The number of candidate neighbors to track while searching the graph for each newly inserted
* node. Defaults to to {@link Lucene90HnswVectorsFormat#DEFAULT_BEAM_WIDTH}. See {@link
* HnswGraph} for details.
*/
- private final int beamWidth;
+ final int beamWidth;
+ /** A constructor for vectors format with default parameters */
public Lucene90HnswVectorsFormat() {
this(DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH);
}
+ /** A constructor for vectors format */
public Lucene90HnswVectorsFormat(int maxConn, int beamWidth) {
super("Lucene90HnswVectorsFormat");
this.maxConn = maxConn;
@@ -105,7 +112,7 @@ public final class Lucene90HnswVectorsFormat extends KnnVectorsFormat {
@Override
public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException {
- return new Lucene90HnswVectorsWriter(state, maxConn, beamWidth);
+ throw new UnsupportedOperationException("Old codecs may only be used for reading");
}
@Override
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java
similarity index 96%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsReader.java
rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java
index bb62ab9dd95..7669a8d38ff 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsReader.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene90;
+package org.apache.lucene.backward_codecs.lucene90;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
@@ -47,7 +47,6 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.lucene.util.hnsw.HnswGraph;
import org.apache.lucene.util.hnsw.NeighborQueue;
/**
@@ -244,7 +243,7 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
// use a seed that is fixed for the index so we get reproducible results for the same query
final SplittableRandom random = new SplittableRandom(checksumSeed);
NeighborQueue results =
- HnswGraph.search(
+ Lucene90HnswGraph.search(
target,
k,
k,
@@ -291,6 +290,7 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
};
}
+ /** Get knn graph values; used for testing */
public KnnGraphValues getGraphValues(String field) throws IOException {
FieldInfo info = fieldInfos.fieldInfo(field);
if (info == null) {
@@ -480,7 +480,7 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
}
@Override
- public void seek(int targetOrd) throws IOException {
+ public void seek(int level, int targetOrd) throws IOException {
// unsafe; no bounds checking
dataIn.seek(entry.ordOffsets[targetOrd]);
arcCount = dataIn.readInt();
@@ -502,5 +502,20 @@ public final class Lucene90HnswVectorsReader extends KnnVectorsReader {
arc += dataIn.readVInt();
return arc;
}
+
+ @Override
+ public int numLevels() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public int entryNode() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public NodesIterator getNodesOnLevel(int level) {
+ throw new UnsupportedOperationException();
+ }
}
}
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/package-info.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/package-info.java
new file mode 100644
index 00000000000..5ad2dbcc82c
--- /dev/null
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/package-info.java
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** Lucene 9.0 file format. */
+package org.apache.lucene.backward_codecs.lucene90;
diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
index c9a6322f0b1..72e05ab3198 100644
--- a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
+++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
@@ -17,3 +17,4 @@ org.apache.lucene.backward_codecs.lucene80.Lucene80Codec
org.apache.lucene.backward_codecs.lucene84.Lucene84Codec
org.apache.lucene.backward_codecs.lucene86.Lucene86Codec
org.apache.lucene.backward_codecs.lucene87.Lucene87Codec
+org.apache.lucene.backward_codecs.lucene90.Lucene90Codec
diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat
new file mode 100644
index 00000000000..17d89f3be7f
--- /dev/null
+++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java
similarity index 96%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsWriter.java
rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java
index 0fbd1be23dd..d76e5efb635 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsWriter.java
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsWriter.java
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene90;
+package org.apache.lucene.backward_codecs.lucene90;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
@@ -35,8 +35,6 @@ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.hnsw.HnswGraph;
-import org.apache.lucene.util.hnsw.HnswGraphBuilder;
import org.apache.lucene.util.hnsw.NeighborArray;
/**
@@ -235,11 +233,15 @@ public final class Lucene90HnswVectorsWriter extends KnnVectorsWriter {
int maxConn,
int beamWidth)
throws IOException {
- HnswGraphBuilder hnswGraphBuilder =
- new HnswGraphBuilder(
- vectorValues, similarityFunction, maxConn, beamWidth, HnswGraphBuilder.randSeed);
+ Lucene90HnswGraphBuilder hnswGraphBuilder =
+ new Lucene90HnswGraphBuilder(
+ vectorValues,
+ similarityFunction,
+ maxConn,
+ beamWidth,
+ Lucene90HnswGraphBuilder.randSeed);
hnswGraphBuilder.setInfoStream(segmentWriteState.infoStream);
- HnswGraph graph = hnswGraphBuilder.build(vectorValues.randomAccess());
+ Lucene90HnswGraph graph = hnswGraphBuilder.build(vectorValues.randomAccess());
for (int ord = 0; ord < offsets.length; ord++) {
// write graph
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90RWHnswVectorsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90RWHnswVectorsFormat.java
new file mode 100644
index 00000000000..dd720c18319
--- /dev/null
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90RWHnswVectorsFormat.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.backward_codecs.lucene90;
+
+import java.io.IOException;
+import org.apache.lucene.codecs.KnnVectorsWriter;
+import org.apache.lucene.index.SegmentWriteState;
+
+public class Lucene90RWHnswVectorsFormat extends Lucene90HnswVectorsFormat {
+
+ public Lucene90RWHnswVectorsFormat(int maxConn, int beamWidth) {
+ super(maxConn, beamWidth);
+ }
+
+ @Override
+ public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException {
+ return new Lucene90HnswVectorsWriter(state, maxConn, beamWidth);
+ }
+
+ @Override
+ public String toString() {
+ return "Lucene90RWHnswVectorsFormat(name = Lucene90RWHnswVectorsFormat, maxConn = "
+ + maxConn
+ + ", beamWidth="
+ + beamWidth
+ + ")";
+ }
+}
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90HnswVectorsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90HnswVectorsFormat.java
similarity index 82%
rename from lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90HnswVectorsFormat.java
rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90HnswVectorsFormat.java
index 7baca8ca540..8fe6dc24624 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90HnswVectorsFormat.java
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90HnswVectorsFormat.java
@@ -14,11 +14,11 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene90;
+package org.apache.lucene.backward_codecs.lucene90;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween;
-import static org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_BEAM_WIDTH;
-import static org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_MAX_CONN;
+import static org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_BEAM_WIDTH;
+import static org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat.DEFAULT_MAX_CONN;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
@@ -38,11 +38,11 @@ public class TestLucene90HnswVectorsFormat extends BaseKnnVectorsFormatTestCase
new Lucene90Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
- return new Lucene90HnswVectorsFormat(maxConn, beamWidth);
+ return new Lucene90RWHnswVectorsFormat(maxConn, beamWidth);
}
};
String expectedString =
- "Lucene90HnswVectorsFormat(name = Lucene90HnswVectorsFormat, maxConn = "
+ "Lucene90RWHnswVectorsFormat(name = Lucene90RWHnswVectorsFormat, maxConn = "
+ maxConn
+ ", beamWidth="
+ beamWidth
diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
index d7ae013f381..1f76e829676 100644
--- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
+++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
@@ -27,7 +27,7 @@ import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.lucene90.Lucene90Codec;
+import org.apache.lucene.codecs.lucene91.Lucene91Codec;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.IndexDeletionPolicy;
@@ -152,7 +152,7 @@ public class CreateIndexTask extends PerfTask {
try {
final PostingsFormat postingsFormatChosen = PostingsFormat.forName(postingsFormat);
iwConf.setCodec(
- new Lucene90Codec() {
+ new Lucene91Codec() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
return postingsFormatChosen;
diff --git a/lucene/core/src/java/module-info.java b/lucene/core/src/java/module-info.java
index 02fc3ff0821..f4139e7eaa3 100644
--- a/lucene/core/src/java/module-info.java
+++ b/lucene/core/src/java/module-info.java
@@ -27,6 +27,7 @@ module org.apache.lucene.core {
exports org.apache.lucene.analysis.tokenattributes;
exports org.apache.lucene.codecs;
exports org.apache.lucene.codecs.compressing;
+ exports org.apache.lucene.codecs.lucene91;
exports org.apache.lucene.codecs.lucene90;
exports org.apache.lucene.codecs.lucene90.blocktree;
exports org.apache.lucene.codecs.lucene90.compressing;
@@ -59,11 +60,11 @@ module org.apache.lucene.core {
provides org.apache.lucene.analysis.TokenizerFactory with
org.apache.lucene.analysis.standard.StandardTokenizerFactory;
provides org.apache.lucene.codecs.Codec with
- org.apache.lucene.codecs.lucene90.Lucene90Codec;
+ org.apache.lucene.codecs.lucene91.Lucene91Codec;
provides org.apache.lucene.codecs.DocValuesFormat with
org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
provides org.apache.lucene.codecs.KnnVectorsFormat with
- org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat;
+ org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsFormat;
provides org.apache.lucene.codecs.PostingsFormat with
org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat;
provides org.apache.lucene.index.SortFieldProvider with
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java
index 69ecde2e26c..176cc57cfb5 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java
@@ -55,7 +55,7 @@ public abstract class Codec implements NamedSPILoader.NamedSPI {
return LOADER;
}
- static Codec defaultCodec = LOADER.lookup("Lucene90");
+ static Codec defaultCodec = LOADER.lookup("Lucene91");
}
private final String name;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsFormat.java
index 4b58f2dc6c0..69b1a426535 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsFormat.java
@@ -85,7 +85,7 @@ public abstract class KnnVectorsFormat implements NamedSPILoader.NamedSPI {
@Override
public KnnVectorsWriter fieldsWriter(SegmentWriteState state) {
throw new UnsupportedOperationException(
- "Attempt to write EMPTY VectorValues: maybe you forgot to use codec=Lucene90");
+ "Attempt to write EMPTY VectorValues: maybe you forgot to use codec=Lucene91");
}
@Override
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java
index d8c6828263f..e8fdeb41080 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java
@@ -15,405 +15,5 @@
* limitations under the License.
*/
-/**
- * Lucene 9.0 file format.
- *
- * This document defines the index file formats used in this version of Lucene. If you are using
- * a different version of Lucene, please consult the copy of This document attempts to provide a high-level definition of the Apache Lucene file formats.
- * The fundamental concepts in Lucene are index, document, field and term.
- *
- * An index contains a sequence of documents.
- *
- * The same sequence of bytes in two different fields is considered a different term. Thus terms
- * are represented as a pair: the string naming the field, and the bytes within the field.
- *
- * Lucene's index stores terms and statistics about those terms in order to make term-based
- * search more efficient. Lucene's terms index falls into the family of indexes known as an
- * inverted index. This is because it can list, for a term, the documents that contain it.
- * This is the inverse of the natural relationship, in which documents list terms.
- *
- * In Lucene, fields may be stored, in which case their text is stored in the index
- * literally, in a non-inverted manner. Fields that are inverted are called indexed. A field
- * may be both stored and indexed.
- *
- * The text of a field may be tokenized into terms to be indexed, or the text of a field
- * may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
- * useful for certain identifier fields to be indexed literally.
- *
- * See the {@link org.apache.lucene.document.Field Field} java docs for more information on
- * Fields.
- *
- * Lucene indexes may be composed of multiple sub-indexes, or segments. Each segment is a
- * fully independent index, which could be searched separately. Indexes evolve by:
- *
- * Searches may involve multiple segments and/or multiple indexes, each index potentially
- * composed of a set of segments.
- *
- * Internally, Lucene refers to documents by an integer document number. The first
- * document added to an index is numbered zero, and each subsequent document added gets a number one
- * greater than the previous.
- *
- * Note that a document's number may change, so caution should be taken when storing these
- * numbers outside of Lucene. In particular, numbers may change in the following situations:
- *
- * The numbers stored in each segment are unique only within the segment, and must be
- * converted before they can be used in a larger context. The standard technique is to
- * allocate each segment a range of values, based on the range of numbers used in that
- * segment. To convert a document number from a segment to an external value, the segment's
- * base document number is added. To convert an external value back to a
- * segment-specific value, the segment is identified by the range that the external value is
- * in, and the segment's base value is subtracted. For example two five document segments
- * might be combined, so that the first segment has a base value of zero, and the second of
- * five. Document three from the second segment would have an external value of eight.
- * When documents are deleted, gaps are created in the numbering. These are eventually
- * removed as the index evolves through merging. Deleted documents are dropped when segments
- * are merged. A freshly-merged segment thus has no gaps in its numbering.
- * Each segment index maintains the following:
- *
- * Details on each of these are provided in their linked pages. All files belonging to a segment have the same name with varying extensions. The extensions
- * correspond to the different file formats described below. When using the Compound File format
- * (default for small segments) these files (except for the Segment info file, the Lock file, and
- * Deleted documents file) are collapsed into a single .cfs file (see below for details)
- *
- * Typically, all segments in an index are stored in a single directory, although this is not
- * required.
- *
- * File names are never re-used. That is, when any file is saved to the Directory it is given a
- * never before used filename. This is achieved using a simple generations approach. For example,
- * the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
- * integer represented in alpha-numeric (base 36) form. The following table summarizes the names and extensions of the files in Lucene:
- *
- * Compatibility notes are provided in this document, describing how file formats have changed
- * from prior versions:
- *
- * Lucene uses a Java <code>int</code> to refer to document numbers.
+ * <p>If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}.
+ *
+ * @see org.apache.lucene.codecs.lucene91 package documentation for file format details.
+ * @lucene.experimental
+ */
+public class Lucene91Codec extends Codec {
+
+ /** Configuration option for the codec. */
+ public enum Mode {
+ /** Trade compression ratio for retrieval speed. */
+ BEST_SPEED(Lucene90StoredFieldsFormat.Mode.BEST_SPEED),
+ /** Trade retrieval speed for compression ratio. */
+ BEST_COMPRESSION(Lucene90StoredFieldsFormat.Mode.BEST_COMPRESSION);
+
+ private final Lucene90StoredFieldsFormat.Mode storedMode;
+
+ private Mode(Lucene90StoredFieldsFormat.Mode storedMode) {
+ this.storedMode = Objects.requireNonNull(storedMode);
+ }
+ }
+
+ private final TermVectorsFormat vectorsFormat = new Lucene90TermVectorsFormat();
+ private final FieldInfosFormat fieldInfosFormat = new Lucene90FieldInfosFormat();
+ private final SegmentInfoFormat segmentInfosFormat = new Lucene90SegmentInfoFormat();
+ private final LiveDocsFormat liveDocsFormat = new Lucene90LiveDocsFormat();
+ private final CompoundFormat compoundFormat = new Lucene90CompoundFormat();
+ private final NormsFormat normsFormat = new Lucene90NormsFormat();
+
+ private final PostingsFormat defaultPostingsFormat;
+ private final PostingsFormat postingsFormat =
+ new PerFieldPostingsFormat() {
+ @Override
+ public PostingsFormat getPostingsFormatForField(String field) {
+ return Lucene91Codec.this.getPostingsFormatForField(field);
+ }
+ };
+
+ private final DocValuesFormat defaultDVFormat;
+ private final DocValuesFormat docValuesFormat =
+ new PerFieldDocValuesFormat() {
+ @Override
+ public DocValuesFormat getDocValuesFormatForField(String field) {
+ return Lucene91Codec.this.getDocValuesFormatForField(field);
+ }
+ };
+
+ private final KnnVectorsFormat defaultKnnVectorsFormat;
+ private final KnnVectorsFormat knnVectorsFormat =
+ new PerFieldKnnVectorsFormat() {
+ @Override
+ public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
+ return Lucene91Codec.this.getKnnVectorsFormatForField(field);
+ }
+ };
+
+ private final StoredFieldsFormat storedFieldsFormat;
+
+ /** Instantiates a new codec. */
+ public Lucene91Codec() {
+ this(Mode.BEST_SPEED);
+ }
+
+ /**
+ * Instantiates a new codec, specifying the stored fields compression mode to use.
+ *
+ * @param mode stored fields compression mode to use for newly flushed/merged segments.
+ */
+ public Lucene91Codec(Mode mode) {
+ super("Lucene91");
+ this.storedFieldsFormat =
+ new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode);
+ this.defaultPostingsFormat = new Lucene90PostingsFormat();
+ this.defaultDVFormat = new Lucene90DocValuesFormat();
+ this.defaultKnnVectorsFormat = new Lucene91HnswVectorsFormat();
+ }
+
+ @Override
+ public final StoredFieldsFormat storedFieldsFormat() {
+ return storedFieldsFormat;
+ }
+
+ @Override
+ public final TermVectorsFormat termVectorsFormat() {
+ return vectorsFormat;
+ }
+
+ @Override
+ public final PostingsFormat postingsFormat() {
+ return postingsFormat;
+ }
+
+ @Override
+ public final FieldInfosFormat fieldInfosFormat() {
+ return fieldInfosFormat;
+ }
+
+ @Override
+ public final SegmentInfoFormat segmentInfoFormat() {
+ return segmentInfosFormat;
+ }
+
+ @Override
+ public final LiveDocsFormat liveDocsFormat() {
+ return liveDocsFormat;
+ }
+
+ @Override
+ public final CompoundFormat compoundFormat() {
+ return compoundFormat;
+ }
+
+ @Override
+ public final PointsFormat pointsFormat() {
+ return new Lucene90PointsFormat();
+ }
+
+ @Override
+ public final KnnVectorsFormat knnVectorsFormat() {
+ return knnVectorsFormat;
+ }
+
+ /**
+ * Returns the postings format that should be used for writing new segments of <code>field</code>.
+ *
+ * <p>The default implementation always returns "Lucene90".
+ *
+ * WARNING: if you subclass, you are responsible for index backwards compatibility:
+ * future versions of Lucene are only guaranteed to be able to read the default implementation.
+ */
+ public PostingsFormat getPostingsFormatForField(String field) {
+ return defaultPostingsFormat;
+ }
+
+ /**
+ * Returns the docvalues format that should be used for writing new segments of <code>field</code>.
+ *
+ * <p>The default implementation always returns "Lucene90".
+ *
+ * WARNING: if you subclass, you are responsible for index backwards compatibility:
+ * future version of Lucene are only guaranteed to be able to read the default implementation.
+ */
+ public DocValuesFormat getDocValuesFormatForField(String field) {
+ return defaultDVFormat;
+ }
+
+ /**
+ * Returns the vectors format that should be used for writing new segments of <code>field</code>.
+ *
+ * <p>The default implementation always returns "Lucene91".
+ *
+ * WARNING: if you subclass, you are responsible for index backwards compatibility:
+ * future version of Lucene are only guaranteed to be able to read the default implementation.
+ */
+ public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
+ return defaultKnnVectorsFormat;
+ }
+
+ @Override
+ public final DocValuesFormat docValuesFormat() {
+ return docValuesFormat;
+ }
+
+ @Override
+ public final NormsFormat normsFormat() {
+ return normsFormat;
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene91/Lucene91HnswVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene91/Lucene91HnswVectorsFormat.java
new file mode 100644
index 00000000000..2e3fc6c298f
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene91/Lucene91HnswVectorsFormat.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.codecs.lucene91;
+
+import java.io.IOException;
+import org.apache.lucene.codecs.KnnVectorsFormat;
+import org.apache.lucene.codecs.KnnVectorsReader;
+import org.apache.lucene.codecs.KnnVectorsWriter;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.hnsw.HnswGraph;
+
+/**
+ * Lucene 9.1 vector format, which encodes numeric vector values and an optional associated graph
+ * connecting the documents having values. The graph is used to power HNSW search. The format
+ * consists of three files:
+ *
+ * This file stores all the floating-point vector data ordered by field, document ordinal, and
+ * vector dimension. The floats are stored in little-endian byte order.
+ *
+ * Stores graphs connecting the documents for each field organized as a list of nodes' neighbours
+ * as following:
+ *
+ * For each field:
+ *
+ * This document defines the index file formats used in this version of Lucene. If you are using
+ * a different version of Lucene, please consult the copy of This document attempts to provide a high-level definition of the Apache Lucene file formats.
+ * The fundamental concepts in Lucene are index, document, field and term.
+ *
+ * An index contains a sequence of documents.
+ *
+ * The same sequence of bytes in two different fields is considered a different term. Thus terms
+ * are represented as a pair: the string naming the field, and the bytes within the field.
+ *
+ * Lucene's index stores terms and statistics about those terms in order to make term-based
+ * search more efficient. Lucene's terms index falls into the family of indexes known as an
+ * inverted index. This is because it can list, for a term, the documents that contain it.
+ * This is the inverse of the natural relationship, in which documents list terms.
+ *
+ * In Lucene, fields may be stored, in which case their text is stored in the index
+ * literally, in a non-inverted manner. Fields that are inverted are called indexed. A field
+ * may be both stored and indexed.
+ *
+ * The text of a field may be tokenized into terms to be indexed, or the text of a field
+ * may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is
+ * useful for certain identifier fields to be indexed literally.
+ *
+ * See the {@link org.apache.lucene.document.Field Field} java docs for more information on
+ * Fields.
+ *
+ * Lucene indexes may be composed of multiple sub-indexes, or segments. Each segment is a
+ * fully independent index, which could be searched separately. Indexes evolve by:
+ *
+ * Searches may involve multiple segments and/or multiple indexes, each index potentially
+ * composed of a set of segments.
+ *
+ * Internally, Lucene refers to documents by an integer document number. The first
+ * document added to an index is numbered zero, and each subsequent document added gets a number one
+ * greater than the previous.
+ *
+ * Note that a document's number may change, so caution should be taken when storing these
+ * numbers outside of Lucene. In particular, numbers may change in the following situations:
+ *
+ * The numbers stored in each segment are unique only within the segment, and must be
+ * converted before they can be used in a larger context. The standard technique is to
+ * allocate each segment a range of values, based on the range of numbers used in that
+ * segment. To convert a document number from a segment to an external value, the segment's
+ * base document number is added. To convert an external value back to a
+ * segment-specific value, the segment is identified by the range that the external value is
+ * in, and the segment's base value is subtracted. For example two five document segments
+ * might be combined, so that the first segment has a base value of zero, and the second of
+ * five. Document three from the second segment would have an external value of eight.
+ * When documents are deleted, gaps are created in the numbering. These are eventually
+ * removed as the index evolves through merging. Deleted documents are dropped when segments
+ * are merged. A freshly-merged segment thus has no gaps in its numbering.
+ * Each segment index maintains the following:
+ *
+ * Details on each of these are provided in their linked pages. All files belonging to a segment have the same name with varying extensions. The extensions
+ * correspond to the different file formats described below. When using the Compound File format
+ * (default for small segments) these files (except for the Segment info file, the Lock file, and
+ * Deleted documents file) are collapsed into a single .cfs file (see below for details)
+ *
+ * Typically, all segments in an index are stored in a single directory, although this is not
+ * required.
+ *
+ * File names are never re-used. That is, when any file is saved to the Directory it is given a
+ * never before used filename. This is achieved using a simple generations approach. For example,
+ * the first segments file is segments_1, then segments_2, etc. The generation is a sequential long
+ * integer represented in alpha-numeric (base 36) form. The following table summarizes the names and extensions of the files in Lucene:
+ *
+ * Compatibility notes are provided in this document, describing how file formats have changed
+ * from prior versions:
+ *
+ * Lucene uses a Java The nomenclature is a bit different here from what's used in those papers:
+ * The nomenclature is a bit different here from what's used in the paper:
*
* Note: The graph may be searched by multiple threads concurrently, but updates are not
@@ -56,75 +55,120 @@ import org.apache.lucene.util.SparseFixedBitSet;
public final class HnswGraph extends KnnGraphValues {
private final int maxConn;
+ private int numLevels; // the current number of levels in the graph
+ private int entryNode; // the current graph entry node on the top level
- // Each entry lists the top maxConn neighbors of a node. The nodes correspond to vectors added to
- // HnswBuilder, and the
- // node values are the ordinals of those vectors.
- private final ListApache Lucene - Index File Formats
- *
- *
- *
- *
- *
- *
- *
- *
- * Introduction
- *
- * docs/
that was distributed
- * with the version you are using.
- *
- * Definitions
- *
- *
- *
- *
- * Inverted Indexing
- *
- * Types of Fields
- *
- * Segments
- *
- *
- *
- *
- * Document Numbers
- *
- *
- *
- *
- * Index Structure Overview
- *
- *
- *
- *
- * File Naming
- *
- * Summary of File Extensions
- *
- *
- *
- *
- *
- *
- * Name
- * Extension
- * Brief Description
- *
- *
- * {@link org.apache.lucene.index.SegmentInfos Segments File}
- * segments_N
- * Stores information about a commit point
- *
- *
- * Lock File
- * write.lock
- * The Write lock prevents multiple IndexWriters from writing to the same
- * file.
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment Info}
- * .si
- * Stores metadata about a segment
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}
- * .cfs, .cfe
- * An optional "virtual" file consisting of all the other index files for
- * systems that frequently run out of file handles.
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90FieldInfosFormat Fields}
- * .fnm
- * Stores information about the fields
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}
- * .fdx
- * Contains pointers to field data
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}
- * .fdt
- * The stored fields for documents
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Dictionary}
- * .tim
- * The term dictionary, stores term info
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Index}
- * .tip
- * The index into the Term Dictionary
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Frequencies}
- * .doc
- * Contains the list of docs which contain each term along with frequency
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Positions}
- * .pos
- * Stores position information about where a term occurs in the index
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Payloads}
- * .pay
- * Stores additional per-position metadata information such as character offsets and user payloads
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}
- * .nvd, .nvm
- * Encodes length and boost factors for docs and fields
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}
- * .dvd, .dvm
- * Encodes additional scoring factors or other per-document information.
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}
- * .tvx
- * Stores offset into the document data file
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}
- * .tvd
- * Contains term vector data.
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}
- * .liv
- * Info about what documents are live
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}
- * .dii, .dim
- * Holds indexed points
- *
- *
- * {@link org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat Vector values}
- * .vec, .vem
- * Holds indexed vectors;
- * .vec
files contain the raw vector data, and
- * .vem
the vector metadataLock File
- *
- * The write lock, which is stored in the index directory by default, is named "write.lock". If the
- * lock directory is different from the index directory then the write lock will be named
- * "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
- * directory. When this file is present, a writer is currently modifying the index (adding or
- * removing documents). This lock file ensures that only one writer is modifying the index at a
- * time.
- *
- * History
- *
- *
- *
- *
- *
- *
- * Limitations
- *
- * int
to refer to document numbers, and the index file format
- * uses an Int32
on-disk to store document numbers. This is a limitation of both the
- * index file format and the current implementation. Eventually these should be replaced with either
- * UInt64
values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
- * VInt} values which have no limit. field
.
+ *
+ * field
+ * .
+ *
+ * field
+ *
+ * .vec (vector data) file
+ *
+ * .vex (vector index)
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ * .vem (vector metadata) file
+ *
+ *
+ *
+ *
+ * @lucene.experimental
+ */
+public final class Lucene91HnswVectorsFormat extends KnnVectorsFormat {
+
+ static final String META_CODEC_NAME = "Lucene91HnswVectorsFormatMeta";
+ static final String VECTOR_DATA_CODEC_NAME = "Lucene91HnswVectorsFormatData";
+ static final String VECTOR_INDEX_CODEC_NAME = "Lucene91HnswVectorsFormatIndex";
+ static final String META_EXTENSION = "vem";
+ static final String VECTOR_DATA_EXTENSION = "vec";
+ static final String VECTOR_INDEX_EXTENSION = "vex";
+
+ static final int VERSION_START = 0;
+ static final int VERSION_CURRENT = VERSION_START;
+
+ /** Default number of maximum connections per node */
+ public static final int DEFAULT_MAX_CONN = 16;
+ /**
+ * Default number of the size of the queue maintained while searching during a graph construction.
+ */
+ public static final int DEFAULT_BEAM_WIDTH = 100;
+
+ /**
+ * Controls how many of the nearest neighbor candidates are connected to the new node. Defaults to
+ * {@link Lucene91HnswVectorsFormat#DEFAULT_MAX_CONN}. See {@link HnswGraph} for more details.
+ */
+ private final int maxConn;
+
+ /**
+ * The number of candidate neighbors to track while searching the graph for each newly inserted
+ * node. Defaults to {@link Lucene91HnswVectorsFormat#DEFAULT_BEAM_WIDTH}. See {@link
+ * HnswGraph} for details.
+ */
+ private final int beamWidth;
+
+ public Lucene91HnswVectorsFormat() {
+ this(DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH);
+ }
+
+ public Lucene91HnswVectorsFormat(int maxConn, int beamWidth) {
+ super("Lucene91HnswVectorsFormat");
+ this.maxConn = maxConn;
+ this.beamWidth = beamWidth;
+ }
+
+ @Override
+ public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException {
+ return new Lucene91HnswVectorsWriter(state, maxConn, beamWidth);
+ }
+
+ @Override
+ public KnnVectorsReader fieldsReader(SegmentReadState state) throws IOException {
+ return new Lucene91HnswVectorsReader(state);
+ }
+
+ @Override
+ public String toString() {
+ return "Lucene91HnswVectorsFormat(name = Lucene91HnswVectorsFormat, maxConn = "
+ + maxConn
+ + ", beamWidth="
+ + beamWidth
+ + ")";
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene91/Lucene91HnswVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene91/Lucene91HnswVectorsReader.java
new file mode 100644
index 00000000000..92135f41aca
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene91/Lucene91HnswVectorsReader.java
@@ -0,0 +1,554 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.codecs.lucene91;
+
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.KnnVectorsReader;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.KnnGraphValues;
+import org.apache.lucene.index.RandomAccessVectorValues;
+import org.apache.lucene.index.RandomAccessVectorValuesProducer;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.index.VectorValues;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.TotalHits;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.hnsw.HnswGraph;
+import org.apache.lucene.util.hnsw.NeighborQueue;
+
+/**
+ * Reads vectors from the index segments along with index data structures supporting KNN search.
+ *
+ * @lucene.experimental
+ */
+public final class Lucene91HnswVectorsReader extends KnnVectorsReader {
+
+ private final FieldInfos fieldInfos;
+ private final Map
+ *
+ * Apache Lucene - Index File Formats
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ * Introduction
+ *
+ * docs/
that was distributed
+ * with the version you are using.
+ *
+ * Definitions
+ *
+ *
+ *
+ *
+ * Inverted Indexing
+ *
+ * Types of Fields
+ *
+ * Segments
+ *
+ *
+ *
+ *
+ * Document Numbers
+ *
+ *
+ *
+ *
+ * Index Structure Overview
+ *
+ *
+ *
+ *
+ * File Naming
+ *
+ * Summary of File Extensions
+ *
+ *
+ *
+ *
+ *
+ *
+ * Name
+ * Extension
+ * Brief Description
+ *
+ *
+ * {@link org.apache.lucene.index.SegmentInfos Segments File}
+ * segments_N
+ * Stores information about a commit point
+ *
+ *
+ * Lock File
+ * write.lock
+ * The Write lock prevents multiple IndexWriters from writing to the same
+ * file.
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat Segment Info}
+ * .si
+ * Stores metadata about a segment
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}
+ * .cfs, .cfe
+ * An optional "virtual" file consisting of all the other index files for
+ * systems that frequently run out of file handles.
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90FieldInfosFormat Fields}
+ * .fnm
+ * Stores information about the fields
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}
+ * .fdx
+ * Contains pointers to field data
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}
+ * .fdt
+ * The stored fields for documents
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Dictionary}
+ * .tim
+ * The term dictionary, stores term info
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Term Index}
+ * .tip
+ * The index into the Term Dictionary
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Frequencies}
+ * .doc
+ * Contains the list of docs which contain each term along with frequency
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Positions}
+ * .pos
+ * Stores position information about where a term occurs in the index
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat Payloads}
+ * .pay
+ * Stores additional per-position metadata information such as character offsets and user payloads
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}
+ * .nvd, .nvm
+ * Encodes length and boost factors for docs and fields
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}
+ * .dvd, .dvm
+ * Encodes additional scoring factors or other per-document information.
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}
+ * .tvx
+ * Stores offset into the document data file
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}
+ * .tvd
+ * Contains term vector data.
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}
+ * .liv
+ * Info about what documents are live
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}
+ * .dii, .dim
+ * Holds indexed points
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsFormat Vector values}
+ * .vec, .vem
+ * Holds indexed vectors;
+ * .vec
files contain the raw vector data, and
+ * .vem
the vector metadataLock File
+ *
+ * The write lock, which is stored in the index directory by default, is named "write.lock". If the
+ * lock directory is different from the index directory then the write lock will be named
+ * "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index
+ * directory. When this file is present, a writer is currently modifying the index (adding or
+ * removing documents). This lock file ensures that only one writer is modifying the index at a
+ * time.
+ *
+ * History
+ *
+ *
+ *
+ *
+ *
+ *
+ * Limitations
+ *
+ * int
to refer to document numbers, and the index file format
+ * uses an Int32
on-disk to store document numbers. This is a limitation of both the
+ * index file format and the current implementation. Eventually these should be replaced with either
+ * UInt64
values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
+ * VInt} values which have no limit. Hyperparameters
*
*
- *
*
* numSeed
is the equivalent of m
in the 2012 paper; it controls the
- * number of random entry points to sample.
* beamWidth
in {@link HnswGraphBuilder} has the same meaning as efConst
- *
in the 2016 paper. It is the number of nearest neighbor candidates to track while
+ * in the paper. It is the number of nearest neighbor candidates to track while
* searching the graph for each newly inserted node.
- * maxConn
has the same meaning as M
in the later paper; it controls
- * how many of the efConst
neighbors are connected to the new node
+ * maxConn
has the same meaning as M
in the paper; it controls how
+ * many of the efConst
neighbors are connected to the new node
* > graph;
// KnnGraphValues iterator members
private int upto;
private NeighborArray cur;
- HnswGraph(int maxConn) {
- graph = new ArrayList<>();
- // Typically with diversity criteria we see nodes not fully occupied; average fanout seems to be
- // about 1/2 maxConn. There is some indexing time penalty for under-allocating, but saves RAM
- graph.add(new NeighborArray(Math.max(32, maxConn / 4)));
+ HnswGraph(int maxConn, int levelOfFirstNode) {
this.maxConn = maxConn;
+ this.numLevels = levelOfFirstNode + 1;
+ this.graph = new ArrayList<>(numLevels);
+ this.entryNode = 0;
+ for (int i = 0; i < numLevels; i++) {
+ graph.add(new ArrayList<>());
+ // Typically with diversity criteria we see nodes not fully occupied;
+ // average fanout seems to be about 1/2 maxConn.
+ // There is some indexing time penalty for under-allocating, but saves RAM
+ graph.get(i).add(new NeighborArray(Math.max(32, maxConn / 4)));
+ }
+
+ this.nodesByLevel = new ArrayList<>(numLevels);
+ nodesByLevel.add(null); // we don't need this for 0th level, as it contains all nodes
+ for (int l = 1; l < numLevels; l++) {
+ nodesByLevel.add(new int[] {0});
+ }
}
/**
- * Searches for the nearest neighbors of a query vector.
+ * Searches HNSW graph for the nearest neighbors of a query vector.
*
* @param query search query vector
* @param topK the number of nodes to be returned
- * @param numSeed the size of the queue maintained while searching, and controls the number of
- * random entry points to sample
* @param vectors vector values
* @param graphValues the graph values. May represent the entire graph, or a level in a
* hierarchical graph.
* @param acceptOrds {@link Bits} that represents the allowed document ordinals to match, or
* {@code null} if they are all allowed to match.
- * @param random a source of randomness, used for generating entry points to the graph
* @return a priority queue holding the closest neighbors found
*/
public static NeighborQueue search(
float[] query,
int topK,
- int numSeed,
RandomAccessVectorValues vectors,
VectorSimilarityFunction similarityFunction,
KnnGraphValues graphValues,
- Bits acceptOrds,
- SplittableRandom random)
+ Bits acceptOrds)
throws IOException {
+
+ NeighborQueue results;
+ int[] eps = new int[] {graphValues.entryNode()};
+ for (int level = graphValues.numLevels() - 1; level >= 1; level--) {
+ results = searchLevel(query, 1, level, eps, vectors, similarityFunction, graphValues, null);
+ eps[0] = results.pop();
+ }
+ results =
+ searchLevel(query, topK, 0, eps, vectors, similarityFunction, graphValues, acceptOrds);
+ return results;
+ }
+
+ /**
+ * Searches for the nearest neighbors of a query vector in a given level
+ *
+ * @param query search query vector
+ * @param topK the number of nearest to query results to return
+ * @param level level to search
+ * @param eps the entry points for search at this level expressed as level 0th ordinals
+ * @param vectors vector values
+ * @param similarityFunction similarity function
+ * @param graphValues the graph values
+ * @param acceptOrds {@link Bits} that represents the allowed document ordinals to match, or
+ * {@code null} if they are all allowed to match.
+ * @return a priority queue holding the closest neighbors found
+ */
+ static NeighborQueue searchLevel(
+ float[] query,
+ int topK,
+ int level,
+ final int[] eps,
+ RandomAccessVectorValues vectors,
+ VectorSimilarityFunction similarityFunction,
+ KnnGraphValues graphValues,
+ Bits acceptOrds)
+ throws IOException {
+
int size = graphValues.size();
-
// MIN heap, holding the top results
- NeighborQueue results = new NeighborQueue(numSeed, similarityFunction.reversed);
+ NeighborQueue results = new NeighborQueue(topK, similarityFunction.reversed);
// MAX heap, from which to pull the candidate nodes
- NeighborQueue candidates = new NeighborQueue(numSeed, !similarityFunction.reversed);
-
+ NeighborQueue candidates = new NeighborQueue(topK, !similarityFunction.reversed);
// set of ordinals that have been visited by search on this layer, used to avoid backtracking
SparseFixedBitSet visited = new SparseFixedBitSet(size);
- // get initial candidates at random
- int boundedNumSeed = Math.min(numSeed, 2 * size);
- for (int i = 0; i < boundedNumSeed; i++) {
- int entryPoint = random.nextInt(size);
- if (visited.getAndSet(entryPoint) == false) {
- // explore the topK starting points of some random numSeed probes
- float score = similarityFunction.compare(query, vectors.vectorValue(entryPoint));
- candidates.add(entryPoint, score);
- if (acceptOrds == null || acceptOrds.get(entryPoint)) {
- results.add(entryPoint, score);
+ for (int ep : eps) {
+ if (visited.getAndSet(ep) == false) {
+ float score = similarityFunction.compare(query, vectors.vectorValue(ep));
+ candidates.add(ep, score);
+ if (acceptOrds == null || acceptOrds.get(ep)) {
+ results.add(ep, score);
}
}
}
// Set the bound to the worst current result and below reject any newly-generated candidates
- // failing
- // to exceed this bound
+ // failing to exceed this bound
BoundsChecker bound = BoundsChecker.create(similarityFunction.reversed);
bound.set(results.topScore());
while (candidates.size() > 0) {
@@ -136,7 +180,7 @@ public final class HnswGraph extends KnnGraphValues {
}
}
int topCandidateNode = candidates.pop();
- graphValues.seek(topCandidateNode);
+ graphValues.seek(level, topCandidateNode);
int friendOrd;
while ((friendOrd = graphValues.nextNeighbor()) != NO_MORE_DOCS) {
assert friendOrd < size : "friendOrd=" + friendOrd + "; size=" + size;
@@ -145,7 +189,7 @@ public final class HnswGraph extends KnnGraphValues {
}
float score = similarityFunction.compare(query, vectors.vectorValue(friendOrd));
- if (results.size() < numSeed || bound.check(score) == false) {
+ if (results.size() < topK || bound.check(score) == false) {
candidates.add(friendOrd, score);
if (acceptOrds == null || acceptOrds.get(friendOrd)) {
results.insertWithOverflow(friendOrd, score);
@@ -164,25 +208,60 @@ public final class HnswGraph extends KnnGraphValues {
/**
* Returns the {@link NeighborQueue} connected to the given node.
*
- * @param node the node whose neighbors are returned
+ * @param level level of the graph
+ * @param node the node whose neighbors are returned, represented as an ordinal on the level 0.
*/
- public NeighborArray getNeighbors(int node) {
- return graph.get(node);
+ public NeighborArray getNeighbors(int level, int node) {
+ if (level == 0) {
+ return graph.get(level).get(node);
+ }
+ int nodeIndex = Arrays.binarySearch(nodesByLevel.get(level), 0, graph.get(level).size(), node);
+ assert nodeIndex >= 0;
+ return graph.get(level).get(nodeIndex);
}
@Override
public int size() {
- return graph.size();
+ return graph.get(0).size(); // all nodes are located on the 0th level
}
- int addNode() {
- graph.add(new NeighborArray(maxConn + 1));
- return graph.size() - 1;
+ /**
+ * Add node on the given level
+ *
+ * @param level level to add a node on
+ * @param node the node to add, represented as an ordinal on the level 0.
+ */
+ public void addNode(int level, int node) {
+ if (level > 0) {
+ // if the new node introduces a new level, add more levels to the graph,
+ // and make this node the graph's new entry point
+ if (level >= numLevels) {
+ for (int i = numLevels; i <= level; i++) {
+ graph.add(new ArrayList<>());
+ nodesByLevel.add(new int[] {node});
+ }
+ numLevels = level + 1;
+ entryNode = node;
+ } else {
+ // Add this node id to this level's nodes
+ int[] nodes = nodesByLevel.get(level);
+ int idx = graph.get(level).size();
+ if (idx < nodes.length) {
+ nodes[idx] = node;
+ } else {
+ nodes = ArrayUtil.grow(nodes);
+ nodes[idx] = node;
+ nodesByLevel.set(level, nodes);
+ }
+ }
+ }
+
+ graph.get(level).add(new NeighborArray(maxConn + 1));
}
@Override
- public void seek(int targetNode) {
- cur = getNeighbors(targetNode);
+ public void seek(int level, int targetNode) {
+ cur = getNeighbors(level, targetNode);
upto = -1;
}
@@ -193,4 +272,34 @@ public final class HnswGraph extends KnnGraphValues {
}
return NO_MORE_DOCS;
}
+
+ /**
+ * Returns the current number of levels in the graph
+ *
+ * @return the current number of levels in the graph
+ */
+ @Override
+ public int numLevels() {
+ return numLevels;
+ }
+
+ /**
+ * Returns the graph's current entry node on the top level shown as ordinals of the nodes on 0th
+ * level
+ *
+ * @return the graph's current entry node on the top level
+ */
+ @Override
+ public int entryNode() {
+ return entryNode;
+ }
+
+ @Override
+ public NodesIterator getNodesOnLevel(int level) {
+ if (level == 0) {
+ return new NodesIterator(size());
+ } else {
+ return new NodesIterator(nodesByLevel.get(level), graph.get(level).size());
+ }
+ }
}
diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java
index f5cfc6a854f..041fd181a1e 100644
--- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java
+++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java
@@ -17,6 +17,8 @@
package org.apache.lucene.util.hnsw;
+import static java.lang.Math.log;
+
import java.io.IOException;
import java.util.Locale;
import java.util.Objects;
@@ -32,15 +34,17 @@ import org.apache.lucene.util.InfoStream;
*/
public final class HnswGraphBuilder {
- // default random seed for level generation
+ /** Default random seed for level generation * */
private static final long DEFAULT_RAND_SEED = System.currentTimeMillis();
+ /** A name for the HNSW component for the info-stream * */
public static final String HNSW_COMPONENT = "HNSW";
- // expose for testing.
+ /** Random seed for level generation; public to expose for testing * */
public static long randSeed = DEFAULT_RAND_SEED;
private final int maxConn;
private final int beamWidth;
+ private final double ml;
private final NeighborArray scratch;
private final VectorSimilarityFunction similarityFunction;
@@ -84,9 +88,12 @@ public final class HnswGraphBuilder {
}
this.maxConn = maxConn;
this.beamWidth = beamWidth;
- this.hnsw = new HnswGraph(maxConn);
+ // normalization factor for level generation; currently not configurable
+ this.ml = 1 / Math.log(1.0 * maxConn);
+ this.random = new SplittableRandom(seed);
+ int levelOfFirstNode = getRandomGraphLevel(ml, random);
+ this.hnsw = new HnswGraph(maxConn, levelOfFirstNode);
bound = BoundsChecker.create(similarityFunction.reversed);
- random = new SplittableRandom(seed);
scratch = new NeighborArray(Math.max(beamWidth, maxConn + 1));
}
@@ -109,43 +116,58 @@ public final class HnswGraphBuilder {
long start = System.nanoTime(), t = start;
// start at node 1! node 0 is added implicitly, in the constructor
for (int node = 1; node < vectors.size(); node++) {
- addGraphNode(vectors.vectorValue(node));
- if (node % 10000 == 0) {
- if (infoStream.isEnabled(HNSW_COMPONENT)) {
- long now = System.nanoTime();
- infoStream.message(
- HNSW_COMPONENT,
- String.format(
- Locale.ROOT,
- "built %d in %d/%d ms",
- node,
- ((now - t) / 1_000_000),
- ((now - start) / 1_000_000)));
- t = now;
- }
+ addGraphNode(node, vectors.vectorValue(node));
+ if ((node % 10000 == 0) && infoStream.isEnabled(HNSW_COMPONENT)) {
+ t = printGraphBuildStatus(node, start, t);
}
}
return hnsw;
}
+ /** Set info-stream to output debugging information * */
public void setInfoStream(InfoStream infoStream) {
this.infoStream = infoStream;
}
/** Inserts a doc with vector value to the graph */
- void addGraphNode(float[] value) throws IOException {
- // We pass 'null' for acceptOrds because there are no deletions while building the graph
- NeighborQueue candidates =
- HnswGraph.search(
- value, beamWidth, beamWidth, vectorValues, similarityFunction, hnsw, null, random);
+ void addGraphNode(int node, float[] value) throws IOException {
+ NeighborQueue candidates;
+ final int nodeLevel = getRandomGraphLevel(ml, random);
+ int curMaxLevel = hnsw.numLevels() - 1;
+ int[] eps = new int[] {hnsw.entryNode()};
- int node = hnsw.addNode();
+ // if a node introduces new levels to the graph, add this new node on new levels
+ for (int level = nodeLevel; level > curMaxLevel; level--) {
+ hnsw.addNode(level, node);
+ }
+ // for levels > nodeLevel search with topk = 1
+ for (int level = curMaxLevel; level > nodeLevel; level--) {
+ candidates =
+ HnswGraph.searchLevel(value, 1, level, eps, vectorValues, similarityFunction, hnsw, null);
+ eps = new int[] {candidates.pop()};
+ }
+ // for levels <= nodeLevel search with topk = beamWidth, and add connections
+ for (int level = Math.min(nodeLevel, curMaxLevel); level >= 0; level--) {
+ candidates =
+ HnswGraph.searchLevel(
+ value, beamWidth, level, eps, vectorValues, similarityFunction, hnsw, null);
+ eps = candidates.nodes();
+ hnsw.addNode(level, node);
+ addDiverseNeighbors(level, node, candidates);
+ }
+ }
- /* connect neighbors to the new node, using a diversity heuristic that chooses successive
- * nearest neighbors that are closer to the new node than they are to the previously-selected
- * neighbors
- */
- addDiverseNeighbors(node, candidates);
+ private long printGraphBuildStatus(int node, long start, long t) {
+ long now = System.nanoTime();
+ infoStream.message(
+ HNSW_COMPONENT,
+ String.format(
+ Locale.ROOT,
+ "built %d in %d/%d ms",
+ node,
+ ((now - t) / 1_000_000),
+ ((now - start) / 1_000_000)));
+ return now;
}
/* TODO: we are not maintaining nodes in strict score order; the forward links
@@ -153,12 +175,13 @@ public final class HnswGraphBuilder {
* work better if we keep the neighbor arrays sorted. Possibly we should switch back to a heap?
* But first we should just see if sorting makes a significant difference.
*/
- private void addDiverseNeighbors(int node, NeighborQueue candidates) throws IOException {
+ private void addDiverseNeighbors(int level, int node, NeighborQueue candidates)
+ throws IOException {
/* For each of the beamWidth nearest candidates (going from best to worst), select it only if it
* is closer to target than it is to any of the already-selected neighbors (ie selected in this method,
* since the node is new and has no prior neighbors).
*/
- NeighborArray neighbors = hnsw.getNeighbors(node);
+ NeighborArray neighbors = hnsw.getNeighbors(level, node);
assert neighbors.size() == 0; // new node
popToScratch(candidates);
selectDiverse(neighbors, scratch);
@@ -168,7 +191,7 @@ public final class HnswGraphBuilder {
int size = neighbors.size();
for (int i = 0; i < size; i++) {
int nbr = neighbors.node[i];
- NeighborArray nbrNbr = hnsw.getNeighbors(nbr);
+ NeighborArray nbrNbr = hnsw.getNeighbors(level, nbr);
nbrNbr.add(node, neighbors.score[i]);
if (nbrNbr.size() > maxConn) {
diversityUpdate(nbrNbr);
@@ -266,4 +289,12 @@ public final class HnswGraphBuilder {
}
return -1;
}
+
+ private static int getRandomGraphLevel(double ml, SplittableRandom random) {
+ double randDouble;
+ do {
+ randDouble = random.nextDouble(); // avoid 0 value, as log(0) is undefined
+ } while (randDouble == 0.0);
+ return ((int) (-log(randDouble) * ml));
+ }
}
diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborArray.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborArray.java
index 9deaa64113c..40125750309 100644
--- a/lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborArray.java
+++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborArray.java
@@ -32,7 +32,7 @@ public class NeighborArray {
float[] score;
int[] node;
- NeighborArray(int maxSize) {
+ public NeighborArray(int maxSize) {
node = new int[maxSize];
score = new float[maxSize];
}
@@ -60,11 +60,15 @@ public class NeighborArray {
return node;
}
+ public float[] score() {
+ return score;
+ }
+
public void clear() {
size = 0;
}
- void removeLast() {
+ public void removeLast() {
size--;
}
diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborQueue.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborQueue.java
index 0d2b94d1c5f..6f08a7bf329 100644
--- a/lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborQueue.java
+++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/NeighborQueue.java
@@ -54,7 +54,7 @@ public class NeighborQueue {
// Used to track the number of neighbors visited during a single graph traversal
private int visitedCount;
- NeighborQueue(int initialSize, boolean reversed) {
+ public NeighborQueue(int initialSize, boolean reversed) {
this.heap = new LongHeap(initialSize);
this.order = reversed ? Order.REVERSED : Order.NATURAL;
}
@@ -119,7 +119,7 @@ public class NeighborQueue {
return visitedCount;
}
- void setVisitedCount(int visitedCount) {
+ public void setVisitedCount(int visitedCount) {
this.visitedCount = visitedCount;
}
diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
index 7eec415dffa..6e977e42f1e 100644
--- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
+++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
@@ -13,4 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-org.apache.lucene.codecs.lucene90.Lucene90Codec
+org.apache.lucene.codecs.lucene91.Lucene91Codec
diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat
index e7158f195c1..692145891fb 100644
--- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat
+++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat
@@ -13,4 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat
\ No newline at end of file
+org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsFormat
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java
index a1cb543bc05..6b747ea00a1 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java
@@ -18,7 +18,8 @@ package org.apache.lucene.codecs.lucene90;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import org.apache.lucene.codecs.Codec;
-import org.apache.lucene.codecs.lucene90.Lucene90Codec.Mode;
+import org.apache.lucene.codecs.lucene91.Lucene91Codec;
+import org.apache.lucene.codecs.lucene91.Lucene91Codec.Mode;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.DirectoryReader;
@@ -30,7 +31,7 @@ import org.apache.lucene.tests.index.BaseStoredFieldsFormatTestCase;
public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase {
@Override
protected Codec getCodec() {
- return new Lucene90Codec(Mode.BEST_COMPRESSION);
+ return new Lucene91Codec(Mode.BEST_COMPRESSION);
}
/**
@@ -40,7 +41,7 @@ public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFie
Directory dir = newDirectory();
for (int i = 0; i < 10; i++) {
IndexWriterConfig iwc = newIndexWriterConfig();
- iwc.setCodec(new Lucene90Codec(RandomPicks.randomFrom(random(), Mode.values())));
+ iwc.setCodec(new Lucene91Codec(RandomPicks.randomFrom(random(), Mode.values())));
IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig());
Document doc = new Document();
doc.add(new StoredField("field1", "value1"));
@@ -69,7 +70,7 @@ public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFie
expectThrows(
NullPointerException.class,
() -> {
- new Lucene90Codec(null);
+ new Lucene91Codec(null);
});
expectThrows(
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene91/TestLucene91HnswVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene91/TestLucene91HnswVectorsFormat.java
new file mode 100644
index 00000000000..fe828f018cb
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene91/TestLucene91HnswVectorsFormat.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene91;
+
+import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween;
+import static org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsFormat.DEFAULT_BEAM_WIDTH;
+import static org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsFormat.DEFAULT_MAX_CONN;
+
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.KnnVectorsFormat;
+import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase;
+import org.apache.lucene.tests.util.TestUtil;
+
+public class TestLucene91HnswVectorsFormat extends BaseKnnVectorsFormatTestCase {
+ @Override
+ protected Codec getCodec() {
+ return TestUtil.getDefaultCodec();
+ }
+
+ public void testToString() {
+ int maxConn = randomIntBetween(DEFAULT_MAX_CONN - 10, DEFAULT_MAX_CONN + 10);
+ int beamWidth = randomIntBetween(DEFAULT_BEAM_WIDTH - 50, DEFAULT_BEAM_WIDTH + 50);
+ Codec customCodec =
+ new Lucene91Codec() {
+ @Override
+ public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
+ return new Lucene91HnswVectorsFormat(maxConn, beamWidth);
+ }
+ };
+ String expectedString =
+ "Lucene91HnswVectorsFormat(name = Lucene91HnswVectorsFormat, maxConn = "
+ + maxConn
+ + ", beamWidth="
+ + beamWidth
+ + ")";
+ assertEquals(
+ expectedString,
+ ((Lucene91Codec) customCodec).getKnnVectorsFormatForField("bogus_field").toString());
+ }
+}
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java b/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java
index 46b6b6d920b..e2366aab1fb 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestKnnGraph.java
@@ -16,6 +16,7 @@
*/
package org.apache.lucene.index;
+import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import static org.apache.lucene.util.hnsw.HnswGraphBuilder.randSeed;
@@ -26,11 +27,12 @@ import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
+import java.util.concurrent.CountDownLatch;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
-import org.apache.lucene.codecs.lucene90.Lucene90Codec;
-import org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsFormat;
-import org.apache.lucene.codecs.lucene90.Lucene90HnswVectorsReader;
+import org.apache.lucene.codecs.lucene91.Lucene91Codec;
+import org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsFormat;
+import org.apache.lucene.codecs.lucene91.Lucene91HnswVectorsReader;
import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -38,13 +40,19 @@ import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.KnnVectorField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.KnnGraphValues.NodesIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.KnnVectorQuery;
import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.SearcherFactory;
+import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.VectorUtil;
import org.apache.lucene.util.hnsw.HnswGraphBuilder;
import org.junit.After;
@@ -55,7 +63,7 @@ public class TestKnnGraph extends LuceneTestCase {
private static final String KNN_GRAPH_FIELD = "vector";
- private static int maxConn = Lucene90HnswVectorsFormat.DEFAULT_MAX_CONN;
+ private static int maxConn = Lucene91HnswVectorsFormat.DEFAULT_MAX_CONN;
private Codec codec;
private VectorSimilarityFunction similarityFunction;
@@ -68,11 +76,11 @@ public class TestKnnGraph extends LuceneTestCase {
}
codec =
- new Lucene90Codec() {
+ new Lucene91Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
- return new Lucene90HnswVectorsFormat(
- maxConn, Lucene90HnswVectorsFormat.DEFAULT_BEAM_WIDTH);
+ return new Lucene91HnswVectorsFormat(
+ maxConn, Lucene91HnswVectorsFormat.DEFAULT_BEAM_WIDTH);
}
};
@@ -82,7 +90,7 @@ public class TestKnnGraph extends LuceneTestCase {
@After
public void cleanup() {
- maxConn = Lucene90HnswVectorsFormat.DEFAULT_MAX_CONN;
+ maxConn = Lucene91HnswVectorsFormat.DEFAULT_MAX_CONN;
}
/** Basic test of creating documents in a graph */
@@ -153,21 +161,63 @@ public class TestKnnGraph extends LuceneTestCase {
int dimension = atLeast(10);
float[][] values = randomVectors(numDoc, dimension);
int mergePoint = random().nextInt(numDoc);
- int[][] mergedGraph = getIndexedGraph(values, mergePoint, seed);
- int[][] singleSegmentGraph = getIndexedGraph(values, -1, seed);
+ int[][][] mergedGraph = getIndexedGraph(values, mergePoint, seed);
+ int[][][] singleSegmentGraph = getIndexedGraph(values, -1, seed);
assertGraphEquals(singleSegmentGraph, mergedGraph);
}
- private void assertGraphEquals(int[][] expected, int[][] actual) {
- assertEquals("graph sizes differ", expected.length, actual.length);
- for (int i = 0; i < expected.length; i++) {
- assertArrayEquals("difference at ord=" + i, expected[i], actual[i]);
+ /** Test writing and reading of multiple vector fields * */
+ public void testMultipleVectorFields() throws Exception {
+ int numVectorFields = randomIntBetween(2, 5);
+ int numDoc = atLeast(100);
+ int[] dims = new int[numVectorFields];
+ float[][][] values = new float[numVectorFields][][];
+ for (int field = 0; field < numVectorFields; field++) {
+ dims[field] = atLeast(3);
+ values[field] = randomVectors(numDoc, dims[field]);
+ }
+
+ try (Directory dir = newDirectory();
+ IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null).setCodec(codec))) {
+ for (int docID = 0; docID < numDoc; docID++) {
+ Document doc = new Document();
+ for (int field = 0; field < numVectorFields; field++) {
+ float[] vector = values[field][docID];
+ if (vector != null) {
+ FieldType fieldType = KnnVectorField.createFieldType(vector.length, similarityFunction);
+ doc.add(new KnnVectorField(KNN_GRAPH_FIELD + field, vector, fieldType));
+ }
+ }
+ String idString = Integer.toString(docID);
+ doc.add(new StringField("id", idString, Field.Store.YES));
+ iw.addDocument(doc);
+ }
+ for (int field = 0; field < numVectorFields; field++) {
+ assertConsistentGraph(iw, values[field], KNN_GRAPH_FIELD + field);
+ }
}
}
- private int[][] getIndexedGraph(float[][] values, int mergePoint, long seed) throws IOException {
+ private void assertGraphEquals(int[][][] expected, int[][][] actual) {
+ assertEquals("graph sizes differ", expected.length, actual.length);
+ for (int level = 0; level < expected.length; level++) {
+ for (int node = 0; node < expected[level].length; node++) {
+ assertArrayEquals("difference at ord=" + node, expected[level][node], actual[level][node]);
+ }
+ }
+ }
+
+ /**
+ * Return a naive representation of an HNSW graph as a 3 dimensional array: 1st dim represents a
+ * graph layer. Each layer contains an array of arrays – a list of nodes and for each node a list
+ * of the node's neighbours. 2nd dim represents a node on a layer, and contains the node's
+ * neighbourhood, or {@code null} if a node is not present on this layer. 3rd dim represents
+ * neighbours of a node.
+ */
+ private int[][][] getIndexedGraph(float[][] values, int mergePoint, long seed)
+ throws IOException {
HnswGraphBuilder.randSeed = seed;
- int[][] graph;
+ int[][][] graph;
try (Directory dir = newDirectory()) {
IndexWriterConfig iwc = newIndexWriterConfig();
iwc.setMergePolicy(new LogDocMergePolicy()); // for predictable segment ordering when merging
@@ -186,8 +236,8 @@ public class TestKnnGraph extends LuceneTestCase {
PerFieldKnnVectorsFormat.FieldsReader perFieldReader =
(PerFieldKnnVectorsFormat.FieldsReader)
((CodecReader) getOnlyLeafReader(reader)).getVectorReader();
- Lucene90HnswVectorsReader vectorReader =
- (Lucene90HnswVectorsReader) perFieldReader.getFieldReader(KNN_GRAPH_FIELD);
+ Lucene91HnswVectorsReader vectorReader =
+ (Lucene91HnswVectorsReader) perFieldReader.getFieldReader(KNN_GRAPH_FIELD);
graph = copyGraph(vectorReader.getGraphValues(KNN_GRAPH_FIELD));
}
}
@@ -208,18 +258,23 @@ public class TestKnnGraph extends LuceneTestCase {
return values;
}
- int[][] copyGraph(KnnGraphValues values) throws IOException {
- int size = values.size();
- int[][] graph = new int[size][];
+ int[][][] copyGraph(KnnGraphValues graphValues) throws IOException {
+ int[][][] graph = new int[graphValues.numLevels()][][];
+ int size = graphValues.size();
int[] scratch = new int[maxConn];
- for (int node = 0; node < size; node++) {
- int n, count = 0;
- values.seek(node);
- while ((n = values.nextNeighbor()) != NO_MORE_DOCS) {
- scratch[count++] = n;
- // graph[node][i++] = n;
+
+ for (int level = 0; level < graphValues.numLevels(); level++) {
+ NodesIterator nodesItr = graphValues.getNodesOnLevel(level);
+ graph[level] = new int[size][];
+ while (nodesItr.hasNext()) {
+ int node = nodesItr.nextInt();
+ graphValues.seek(level, node);
+ int n, count = 0;
+ while ((n = graphValues.nextNeighbor()) != NO_MORE_DOCS) {
+ scratch[count++] = n;
+ }
+ graph[level][node] = ArrayUtil.copyOfSubArray(scratch, 0, count);
}
- graph[node] = ArrayUtil.copyOfSubArray(scratch, 0, count);
}
return graph;
}
@@ -232,31 +287,7 @@ public class TestKnnGraph extends LuceneTestCase {
config.setCodec(codec); // test is not compatible with simpletext
try (Directory dir = newDirectory();
IndexWriter iw = new IndexWriter(dir, config)) {
- // Add a document for every cartesian point in an NxN square so we can
- // easily know which are the nearest neighbors to every point. Insert by iterating
- // using a prime number that is not a divisor of N*N so that we will hit each point once,
- // and chosen so that points will be inserted in a deterministic
- // but somewhat distributed pattern
- int n = 5, stepSize = 17;
- float[][] values = new float[n * n][];
- int index = 0;
- for (int i = 0; i < values.length; i++) {
- // System.out.printf("%d: (%d, %d)\n", i, index % n, index / n);
- int x = index % n, y = index / n;
- values[i] = new float[] {x, y};
- index = (index + stepSize) % (n * n);
- add(iw, i, values[i]);
- if (i == 13) {
- // create 2 segments
- iw.commit();
- }
- }
- boolean forceMerge = random().nextBoolean();
- // System.out.println("");
- if (forceMerge) {
- iw.forceMerge(1);
- }
- assertConsistentGraph(iw, values);
+ indexData(iw);
try (DirectoryReader dr = DirectoryReader.open(iw)) {
// results are ordered by score (descending) and docid (ascending);
// This is the insertion order:
@@ -279,6 +310,77 @@ public class TestKnnGraph extends LuceneTestCase {
}
}
+ private void indexData(IndexWriter iw) throws IOException {
+ // Add a document for every cartesian point in an NxN square so we can
+ // easily know which are the nearest neighbors to every point. Insert by iterating
+ // using a prime number that is not a divisor of N*N so that we will hit each point once,
+ // and chosen so that points will be inserted in a deterministic
+ // but somewhat distributed pattern
+ int n = 5, stepSize = 17;
+ float[][] values = new float[n * n][];
+ int index = 0;
+ for (int i = 0; i < values.length; i++) {
+ // System.out.printf("%d: (%d, %d)\n", i, index % n, index / n);
+ int x = index % n, y = index / n;
+ values[i] = new float[] {x, y};
+ index = (index + stepSize) % (n * n);
+ add(iw, i, values[i]);
+ if (i == 13) {
+ // create 2 segments
+ iw.commit();
+ }
+ }
+ boolean forceMerge = random().nextBoolean();
+ if (forceMerge) {
+ iw.forceMerge(1);
+ }
+ assertConsistentGraph(iw, values);
+ }
+
+ public void testMultiThreadedSearch() throws Exception {
+ similarityFunction = VectorSimilarityFunction.EUCLIDEAN;
+ IndexWriterConfig config = newIndexWriterConfig();
+ config.setCodec(codec);
+ Directory dir = newDirectory();
+ IndexWriter iw = new IndexWriter(dir, config);
+ indexData(iw);
+
+ final SearcherManager manager = new SearcherManager(iw, new SearcherFactory());
+ Thread[] threads = new Thread[randomIntBetween(2, 5)];
+ final CountDownLatch latch = new CountDownLatch(1);
+ for (int i = 0; i < threads.length; i++) {
+ threads[i] =
+ new Thread(
+ () -> {
+ try {
+ latch.await();
+ IndexSearcher searcher = manager.acquire();
+ try {
+ KnnVectorQuery query = new KnnVectorQuery("vector", new float[] {0f, 0.1f}, 5);
+ TopDocs results = searcher.search(query, 5);
+ for (ScoreDoc doc : results.scoreDocs) {
+ // map docId to insertion id
+ doc.doc =
+ Integer.parseInt(searcher.getIndexReader().document(doc.doc).get("id"));
+ }
+ assertResults(new int[] {0, 15, 3, 18, 5}, results);
+ } finally {
+ manager.release(searcher);
+ }
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ });
+ threads[i].start();
+ }
+
+ latch.countDown();
+ for (Thread t : threads) {
+ t.join();
+ }
+ IOUtils.close(manager, iw, dir);
+ }
+
private void assertGraphSearch(int[] expected, float[] vector, IndexReader reader)
throws IOException {
TopDocs results = doKnnSearch(reader, vector, 5);
@@ -310,39 +412,40 @@ public class TestKnnGraph extends LuceneTestCase {
}
}
+ private void assertConsistentGraph(IndexWriter iw, float[][] values) throws IOException {
+ assertConsistentGraph(iw, values, KNN_GRAPH_FIELD);
+ }
+
// For each leaf, verify that its graph nodes are 1-1 with vectors, that the vectors are the
- // expected values,
- // and that the graph is fully connected and symmetric.
+ // expected values, and that the graph is fully connected and symmetric.
// NOTE: when we impose max-fanout on the graph it wil no longer be symmetric, but should still
// be fully connected. Is there any other invariant we can test? Well, we can check that max
- // fanout
- // is respected. We can test *desirable* properties of the graph like small-world (the graph
- // diameter
- // should be tightly bounded).
- private void assertConsistentGraph(IndexWriter iw, float[][] values) throws IOException {
- int totalGraphDocs = 0;
+ // fanout is respected. We can test *desirable* properties of the graph like small-world
+ // (the graph diameter should be tightly bounded).
+ private void assertConsistentGraph(IndexWriter iw, float[][] values, String vectorField)
+ throws IOException {
+ int numDocsWithVectors = 0;
try (DirectoryReader dr = DirectoryReader.open(iw)) {
for (LeafReaderContext ctx : dr.leaves()) {
LeafReader reader = ctx.reader();
- VectorValues vectorValues = reader.getVectorValues(KNN_GRAPH_FIELD);
+ VectorValues vectorValues = reader.getVectorValues(vectorField);
PerFieldKnnVectorsFormat.FieldsReader perFieldReader =
(PerFieldKnnVectorsFormat.FieldsReader) ((CodecReader) reader).getVectorReader();
if (perFieldReader == null) {
continue;
}
- Lucene90HnswVectorsReader vectorReader =
- (Lucene90HnswVectorsReader) perFieldReader.getFieldReader(KNN_GRAPH_FIELD);
- KnnGraphValues graphValues = vectorReader.getGraphValues(KNN_GRAPH_FIELD);
- assertEquals((vectorValues == null), (graphValues == null));
+ Lucene91HnswVectorsReader vectorReader =
+ (Lucene91HnswVectorsReader) perFieldReader.getFieldReader(vectorField);
+ KnnGraphValues graphValues = vectorReader.getGraphValues(vectorField);
if (vectorValues == null) {
+ assert graphValues == null;
continue;
}
- int[][] graph = new int[reader.maxDoc()][];
- boolean foundOrphan = false;
- int graphSize = 0;
+
+ // assert vector values:
+ // stored vector values are the same as original
for (int i = 0; i < reader.maxDoc(); i++) {
int nextDocWithVectors = vectorValues.advance(i);
- // System.out.println("advanced to " + nextDocWithVectors);
while (i < nextDocWithVectors && i < reader.maxDoc()) {
int id = Integer.parseInt(reader.document(i).get("id"));
assertNull("document " + id + " has no vector, but was expected to", values[id]);
@@ -352,7 +455,6 @@ public class TestKnnGraph extends LuceneTestCase {
break;
}
int id = Integer.parseInt(reader.document(i).get("id"));
- graphValues.seek(graphSize);
// documents with KnnGraphValues have the expected vectors
float[] scratch = vectorValues.vectorValue();
assertArrayEquals(
@@ -360,54 +462,71 @@ public class TestKnnGraph extends LuceneTestCase {
values[id],
scratch,
0f);
- // We collect neighbors for analysis below
- List