From fcd98fd3370b36e01f35510214cbd3628b25f0f8 Mon Sep 17 00:00:00 2001 From: Rushabh Shah Date: Mon, 13 Jun 2022 09:41:39 -0700 Subject: [PATCH] LUCENE-10266 Move nearest-neighbor search on points to core (#897) Co-authored-by: Rushabh Shah --- lucene/CHANGES.txt | 4 +- .../apache/lucene/document/LatLonPoint.java | 79 ++++++++++++ .../lucene/document}/NearestNeighbor.java | 8 +- .../apache/lucene}/search/TestNearest.java | 31 ++--- .../search/LatLonPointPrototypeQueries.java | 117 ------------------ .../TestFloatPointNearestNeighbor.java | 4 +- 6 files changed, 94 insertions(+), 149 deletions(-) rename lucene/{sandbox/src/java/org/apache/lucene/sandbox/search => core/src/java/org/apache/lucene/document}/NearestNeighbor.java (99%) rename lucene/{sandbox/src/test/org/apache/lucene/sandbox => core/src/test/org/apache/lucene}/search/TestNearest.java (89%) delete mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/search/LatLonPointPrototypeQueries.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f2afcd714bb..a981917393e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -25,7 +25,9 @@ API Changes * LUCENE-10436: Remove deprecated DocValuesFieldExistsQuery, NormsFieldExistsQuery and KnnVectorFieldExistsQuery. (Zach Chen, Adrien Grand) -* LUCENE-10561 Reduce class/member visibility of all normalizer and stemmer classes. (Rushabh Shah) +* LUCENE-10561: Reduce class/member visibility of all normalizer and stemmer classes. (Rushabh Shah) + +* LUCENE-10266: Move nearest-neighbor search on points to core. 
(Rushabh Shah) New Features --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/document/LatLonPoint.java b/lucene/core/src/java/org/apache/lucene/document/LatLonPoint.java index 21eb8fb541d..c459b080363 100644 --- a/lucene/core/src/java/org/apache/lucene/document/LatLonPoint.java +++ b/lucene/core/src/java/org/apache/lucene/document/LatLonPoint.java @@ -23,23 +23,36 @@ import static org.apache.lucene.geo.GeoEncodingUtils.encodeLatitudeCeil; import static org.apache.lucene.geo.GeoEncodingUtils.encodeLongitude; import static org.apache.lucene.geo.GeoEncodingUtils.encodeLongitudeCeil; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; import org.apache.lucene.geo.Circle; +import org.apache.lucene.geo.GeoUtils; import org.apache.lucene.geo.LatLonGeometry; import org.apache.lucene.geo.Point; import org.apache.lucene.geo.Polygon; import org.apache.lucene.geo.Rectangle; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.PointValues; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.FieldDoc; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.PointRangeQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopFieldDocs; +import org.apache.lucene.search.TotalHits; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.NumericUtils; +import org.apache.lucene.util.SloppyMath; /** * An indexed location field. 
@@ -362,4 +375,70 @@ public class LatLonPoint extends Field { } return query; } + + /** + * Finds the {@code n} nearest indexed points to the provided point, according to Haversine + * distance. + * + *

This is functionally equivalent to running {@link MatchAllDocsQuery} with a {@link + * LatLonDocValuesField#newDistanceSort}, but is far more efficient since it takes advantage of + * properties of the indexed BKD tree. Multi-valued fields are currently not de-duplicated, so if + * a document had multiple instances of the specified field that make it into the top n, that + * document will appear more than once. + * + *

Documents are ordered by ascending distance from the location. The value returned in {@link + * FieldDoc} for the hits contains a Double instance with the distance in meters. + * + * @param searcher IndexSearcher to find nearest points from. + * @param field field name. must not be null. + * @param latitude latitude at the center: must be within standard +/-90 coordinate bounds. + * @param longitude longitude at the center: must be within standard +/-180 coordinate bounds. + * @param n the number of nearest neighbors to retrieve. + * @return TopFieldDocs containing documents ordered by distance, where the field value for each + * {@link FieldDoc} is the distance in meters + * @throws IllegalArgumentException if {@code field} or {@code searcher} is null, or if {@code + * latitude}, {@code longitude} or {@code n} are out-of-bounds + * @throws IOException if an IOException occurs while finding the points. + */ + // TODO: what about multi-valued documents? what happens? + public static TopFieldDocs nearest( + IndexSearcher searcher, String field, double latitude, double longitude, int n) + throws IOException { + GeoUtils.checkLatitude(latitude); + GeoUtils.checkLongitude(longitude); + if (n < 1) { + throw new IllegalArgumentException("n must be at least 1; got " + n); + } + if (field == null) { + throw new IllegalArgumentException("field must not be null"); + } + if (searcher == null) { + throw new IllegalArgumentException("searcher must not be null"); + } + List readers = new ArrayList<>(); + List docBases = new ArrayList<>(); + List liveDocs = new ArrayList<>(); + int totalHits = 0; + for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) { + PointValues points = leaf.reader().getPointValues(field); + if (points != null) { + totalHits += points.getDocCount(); + readers.add(points); + docBases.add(leaf.docBase); + liveDocs.add(leaf.reader().getLiveDocs()); + } + } + + NearestNeighbor.NearestHit[] hits = + NearestNeighbor.nearest(latitude, longitude, 
readers, liveDocs, docBases, n); + + // Convert to TopFieldDocs: + ScoreDoc[] scoreDocs = new ScoreDoc[hits.length]; + for (int i = 0; i < hits.length; i++) { + NearestNeighbor.NearestHit hit = hits[i]; + double hitDistance = SloppyMath.haversinMeters(hit.distanceSortKey); + scoreDocs[i] = new FieldDoc(hit.docID, 0.0f, new Object[] {Double.valueOf(hitDistance)}); + } + return new TopFieldDocs(new TotalHits(totalHits, TotalHits.Relation.EQUAL_TO), scoreDocs, null); + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/NearestNeighbor.java b/lucene/core/src/java/org/apache/lucene/document/NearestNeighbor.java similarity index 99% rename from lucene/sandbox/src/java/org/apache/lucene/sandbox/search/NearestNeighbor.java rename to lucene/core/src/java/org/apache/lucene/document/NearestNeighbor.java index a3fe984e651..b5683a7993e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/NearestNeighbor.java +++ b/lucene/core/src/java/org/apache/lucene/document/NearestNeighbor.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.sandbox.search; +package org.apache.lucene.document; import static org.apache.lucene.geo.GeoEncodingUtils.decodeLatitude; import static org.apache.lucene.geo.GeoEncodingUtils.decodeLongitude; @@ -31,11 +31,7 @@ import org.apache.lucene.index.PointValues.Relation; import org.apache.lucene.util.Bits; import org.apache.lucene.util.SloppyMath; -/** - * KNN search on top of 2D lat/lon indexed points. - * - * @lucene.experimental - */ +/** KNN search on top of 2D lat/lon indexed points. 
*/ class NearestNeighbor { static class Cell implements Comparable { diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestNearest.java b/lucene/core/src/test/org/apache/lucene/search/TestNearest.java similarity index 89% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestNearest.java rename to lucene/core/src/test/org/apache/lucene/search/TestNearest.java index 4f7df8a587b..e35a1875c4f 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestNearest.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestNearest.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.sandbox.search; +package org.apache.lucene.search; import java.util.Arrays; import java.util.Comparator; @@ -30,12 +30,6 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.SerialMergeScheduler; import org.apache.lucene.index.Term; -import org.apache.lucene.search.FieldDoc; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.MatchAllDocsQuery; -import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.Sort; -import org.apache.lucene.search.TopFieldDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.geo.GeoTestUtil; import org.apache.lucene.tests.index.RandomIndexWriter; @@ -62,8 +56,7 @@ public class TestNearest extends LuceneTestCase { // can't wrap because we require Lucene60PointsFormat directly but e.g. 
ParallelReader wraps // with its own points impl: IndexSearcher s = newSearcher(r, false); - FieldDoc hit = - (FieldDoc) LatLonPointPrototypeQueries.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0]; + FieldDoc hit = (FieldDoc) LatLonPoint.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0]; assertEquals("0", r.document(hit.doc).getField("id").stringValue()); r.close(); @@ -72,7 +65,7 @@ public class TestNearest extends LuceneTestCase { // can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps // with its own points impl: s = newSearcher(r, false); - hit = (FieldDoc) LatLonPointPrototypeQueries.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0]; + hit = (FieldDoc) LatLonPoint.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0]; assertEquals("1", r.document(hit.doc).getField("id").stringValue()); r.close(); w.close(); @@ -95,8 +88,7 @@ public class TestNearest extends LuceneTestCase { // can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps // with its own points impl: IndexSearcher s = newSearcher(r, false); - FieldDoc hit = - (FieldDoc) LatLonPointPrototypeQueries.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0]; + FieldDoc hit = (FieldDoc) LatLonPoint.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0]; assertEquals("0", r.document(hit.doc).getField("id").stringValue()); r.close(); @@ -106,8 +98,7 @@ public class TestNearest extends LuceneTestCase { // can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps // with its own points impl: s = newSearcher(r, false); - assertEquals( - 0, LatLonPointPrototypeQueries.nearest(s, "point", 40.0, 50.0, 1).scoreDocs.length); + assertEquals(0, LatLonPoint.nearest(s, "point", 40.0, 50.0, 1).scoreDocs.length); r.close(); w.close(); dir.close(); @@ -128,9 +119,7 @@ public class TestNearest extends LuceneTestCase { DirectoryReader r = DirectoryReader.open(w); // can't wrap because we require Lucene60PointsFormat directly but e.g. 
ParallelReader wraps // with its own points impl: - ScoreDoc[] hits = - LatLonPointPrototypeQueries.nearest(newSearcher(r, false), "point", 45.0, 50.0, 2) - .scoreDocs; + ScoreDoc[] hits = LatLonPoint.nearest(newSearcher(r, false), "point", 45.0, 50.0, 2).scoreDocs; assertEquals("0", r.document(hits[0].doc).getField("id").stringValue()); assertEquals("1", r.document(hits[1].doc).getField("id").stringValue()); @@ -146,10 +135,7 @@ public class TestNearest extends LuceneTestCase { // can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps // with its own points impl: assertEquals( - 0, - LatLonPointPrototypeQueries.nearest(newSearcher(r, false), "point", 40.0, 50.0, 1) - .scoreDocs - .length); + 0, LatLonPoint.nearest(newSearcher(r, false), "point", 40.0, 50.0, 1).scoreDocs.length); r.close(); w.close(); dir.close(); @@ -245,8 +231,7 @@ public class TestNearest extends LuceneTestCase { topN, new Sort(LatLonDocValuesField.newDistanceSort("point", pointLat, pointLon))); - ScoreDoc[] hits = - LatLonPointPrototypeQueries.nearest(s, "point", pointLat, pointLon, topN).scoreDocs; + ScoreDoc[] hits = LatLonPoint.nearest(s, "point", pointLat, pointLon, topN).scoreDocs; for (int i = 0; i < topN; i++) { FieldDoc expected = expectedHits[i]; FieldDoc expected2 = (FieldDoc) fieldDocs.scoreDocs[i]; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/LatLonPointPrototypeQueries.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/LatLonPointPrototypeQueries.java deleted file mode 100644 index 38cf543caf4..00000000000 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/LatLonPointPrototypeQueries.java +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.sandbox.search; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat; -import org.apache.lucene.document.LatLonDocValuesField; -import org.apache.lucene.document.LatLonPoint; -import org.apache.lucene.geo.GeoUtils; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.PointValues; -import org.apache.lucene.search.FieldDoc; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.MatchAllDocsQuery; -import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.TopFieldDocs; -import org.apache.lucene.search.TotalHits; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.SloppyMath; - -/** - * Holder class for prototype sandboxed queries - * - *

When the query graduates from sandbox, these static calls should be placed in {@link - * LatLonPoint} - * - * @lucene.experimental - */ -public class LatLonPointPrototypeQueries { - - // no instance - private LatLonPointPrototypeQueries() {} - - /** - * Finds the {@code n} nearest indexed points to the provided point, according to Haversine - * distance. - * - *

This is functionally equivalent to running {@link MatchAllDocsQuery} with a {@link - * LatLonDocValuesField#newDistanceSort}, but is far more efficient since it takes advantage of - * properties the indexed BKD tree. Currently this only works with {@link Lucene90PointsFormat} - * (used by the default codec). Multi-valued fields are currently not de-duplicated, so if a - * document had multiple instances of the specified field that make it into the top n, that - * document will appear more than once. - * - *

Documents are ordered by ascending distance from the location. The value returned in {@link - * FieldDoc} for the hits contains a Double instance with the distance in meters. - * - * @param searcher IndexSearcher to find nearest points from. - * @param field field name. must not be null. - * @param latitude latitude at the center: must be within standard +/-90 coordinate bounds. - * @param longitude longitude at the center: must be within standard +/-180 coordinate bounds. - * @param n the number of nearest neighbors to retrieve. - * @return TopFieldDocs containing documents ordered by distance, where the field value for each - * {@link FieldDoc} is the distance in meters - * @throws IllegalArgumentException if the underlying PointValues is not a {@code - * Lucene60PointsReader} (this is a current limitation), or if {@code field} or {@code - * searcher} is null, or if {@code latitude}, {@code longitude} or {@code n} are out-of-bounds - * @throws IOException if an IOException occurs while finding the points. - */ - // TODO: what about multi-valued documents? what happens? 
- public static TopFieldDocs nearest( - IndexSearcher searcher, String field, double latitude, double longitude, int n) - throws IOException { - GeoUtils.checkLatitude(latitude); - GeoUtils.checkLongitude(longitude); - if (n < 1) { - throw new IllegalArgumentException("n must be at least 1; got " + n); - } - if (field == null) { - throw new IllegalArgumentException("field must not be null"); - } - if (searcher == null) { - throw new IllegalArgumentException("searcher must not be null"); - } - List readers = new ArrayList<>(); - List docBases = new ArrayList<>(); - List liveDocs = new ArrayList<>(); - int totalHits = 0; - for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) { - PointValues points = leaf.reader().getPointValues(field); - if (points != null) { - totalHits += points.getDocCount(); - readers.add(points); - docBases.add(leaf.docBase); - liveDocs.add(leaf.reader().getLiveDocs()); - } - } - - NearestNeighbor.NearestHit[] hits = - NearestNeighbor.nearest(latitude, longitude, readers, liveDocs, docBases, n); - - // Convert to TopFieldDocs: - ScoreDoc[] scoreDocs = new ScoreDoc[hits.length]; - for (int i = 0; i < hits.length; i++) { - NearestNeighbor.NearestHit hit = hits[i]; - double hitDistance = SloppyMath.haversinMeters(hit.distanceSortKey); - scoreDocs[i] = new FieldDoc(hit.docID, 0.0f, new Object[] {Double.valueOf(hitDistance)}); - } - return new TopFieldDocs(new TotalHits(totalHits, TotalHits.Relation.EQUAL_TO), scoreDocs, null); - } -} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/document/TestFloatPointNearestNeighbor.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/document/TestFloatPointNearestNeighbor.java index ae829dcf02c..7df51197cc0 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/document/TestFloatPointNearestNeighbor.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/document/TestFloatPointNearestNeighbor.java @@ -20,6 +20,7 @@ import java.util.Arrays; import 
org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FloatPoint; +import org.apache.lucene.document.LatLonPoint; import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.DirectoryReader; @@ -28,7 +29,6 @@ import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.PointValues; import org.apache.lucene.index.SerialMergeScheduler; import org.apache.lucene.index.Term; -import org.apache.lucene.sandbox.search.LatLonPointPrototypeQueries; import org.apache.lucene.search.FieldDoc; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; @@ -66,7 +66,7 @@ public class TestFloatPointNearestNeighbor extends LuceneTestCase { // can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps // with its own points impl: s = newSearcher(r, false); - hit = (FieldDoc) LatLonPointPrototypeQueries.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0]; + hit = (FieldDoc) LatLonPoint.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0]; assertEquals("1", r.document(hit.doc).getField("id").stringValue()); r.close(); w.close();