LUCENE-10266 Move nearest-neighbor search on points to core (#897)

Co-authored-by: Rushabh Shah <shahrs87@apache.org>
This commit is contained in:
Rushabh Shah 2022-06-13 09:41:39 -07:00 committed by GitHub
parent d850a22a51
commit fcd98fd337
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 94 additions and 149 deletions

View File

@ -25,7 +25,9 @@ API Changes
* LUCENE-10436: Remove deprecated DocValuesFieldExistsQuery, NormsFieldExistsQuery and * LUCENE-10436: Remove deprecated DocValuesFieldExistsQuery, NormsFieldExistsQuery and
KnnVectorFieldExistsQuery. (Zach Chen, Adrien Grand) KnnVectorFieldExistsQuery. (Zach Chen, Adrien Grand)
* LUCENE-10561 Reduce class/member visibility of all normalizer and stemmer classes. (Rushabh Shah) * LUCENE-10561: Reduce class/member visibility of all normalizer and stemmer classes. (Rushabh Shah)
* LUCENE-10266: Move nearest-neighbor search on points to core. (Rushabh Shah)
New Features New Features
--------------------- ---------------------

View File

@ -23,23 +23,36 @@ import static org.apache.lucene.geo.GeoEncodingUtils.encodeLatitudeCeil;
import static org.apache.lucene.geo.GeoEncodingUtils.encodeLongitude; import static org.apache.lucene.geo.GeoEncodingUtils.encodeLongitude;
import static org.apache.lucene.geo.GeoEncodingUtils.encodeLongitudeCeil; import static org.apache.lucene.geo.GeoEncodingUtils.encodeLongitudeCeil;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.geo.Circle; import org.apache.lucene.geo.Circle;
import org.apache.lucene.geo.GeoUtils;
import org.apache.lucene.geo.LatLonGeometry; import org.apache.lucene.geo.LatLonGeometry;
import org.apache.lucene.geo.Point; import org.apache.lucene.geo.Point;
import org.apache.lucene.geo.Polygon; import org.apache.lucene.geo.Polygon;
import org.apache.lucene.geo.Rectangle; import org.apache.lucene.geo.Rectangle;
import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PointValues; import org.apache.lucene.index.PointValues;
import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.PointRangeQuery; import org.apache.lucene.search.PointRangeQuery;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils; import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.SloppyMath;
/** /**
* An indexed location field. * An indexed location field.
@ -362,4 +375,70 @@ public class LatLonPoint extends Field {
} }
return query; return query;
} }
/**
 * Returns the {@code n} indexed points that lie closest to the supplied location, measured by
 * Haversine distance.
 *
 * <p>The outcome is functionally equivalent to running {@link MatchAllDocsQuery} with a {@link
 * LatLonDocValuesField#newDistanceSort}, but it is far more efficient because it takes advantage
 * of properties of the indexed BKD tree. Multi-valued fields are currently not de-duplicated: a
 * document with several instances of the field that all make it into the top n is returned once
 * per such instance.
 *
 * <p>Hits are ordered by ascending distance from the location. Every hit is a {@link FieldDoc}
 * created with a score of {@code 0.0f} and a single field value: a Double instance holding the
 * distance in meters. The reported total hit count is the sum of {@code getDocCount()} over the
 * segments that have point values for {@code field}.
 *
 * @param searcher IndexSearcher to find nearest points from.
 * @param field field name. must not be null.
 * @param latitude latitude at the center: must be within standard +/-90 coordinate bounds.
 * @param longitude longitude at the center: must be within standard +/-180 coordinate bounds.
 * @param n the number of nearest neighbors to retrieve.
 * @return TopFieldDocs containing documents ordered by distance, where the field value for each
 *     {@link FieldDoc} is the distance in meters
 * @throws IllegalArgumentException if {@code field} or {@code searcher} is null, or if {@code
 *     latitude}, {@code longitude} or {@code n} are out-of-bounds
 * @throws IOException if an IOException occurs while finding the points.
 */
// TODO: what about multi-valued documents? what happens?
public static TopFieldDocs nearest(
    IndexSearcher searcher, String field, double latitude, double longitude, int n)
    throws IOException {
  // Argument validation: coordinates first, then n, field and searcher.
  GeoUtils.checkLatitude(latitude);
  GeoUtils.checkLongitude(longitude);
  if (n <= 0) {
    throw new IllegalArgumentException("n must be at least 1; got " + n);
  }
  if (null == field) {
    throw new IllegalArgumentException("field must not be null");
  }
  if (null == searcher) {
    throw new IllegalArgumentException("searcher must not be null");
  }

  // Collect, in parallel lists, the state of every segment that has points for this field.
  final List<PointValues> segmentPoints = new ArrayList<>();
  final List<Integer> segmentDocBases = new ArrayList<>();
  final List<Bits> segmentLiveDocs = new ArrayList<>();
  int docsWithPoints = 0;
  for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) {
    final PointValues values = ctx.reader().getPointValues(field);
    if (values == null) {
      // No point values for this field in the segment: nothing to contribute.
      continue;
    }
    docsWithPoints += values.getDocCount();
    segmentPoints.add(values);
    segmentDocBases.add(ctx.docBase);
    segmentLiveDocs.add(ctx.reader().getLiveDocs());
  }

  final NearestNeighbor.NearestHit[] nearestHits =
      NearestNeighbor.nearest(
          latitude, longitude, segmentPoints, segmentLiveDocs, segmentDocBases, n);

  // Convert to TopFieldDocs: the sort key of each hit is turned into meters.
  final ScoreDoc[] converted = new ScoreDoc[nearestHits.length];
  int slot = 0;
  for (NearestNeighbor.NearestHit nearestHit : nearestHits) {
    final double meters = SloppyMath.haversinMeters(nearestHit.distanceSortKey);
    final Object[] sortValues = new Object[] {Double.valueOf(meters)};
    converted[slot++] = new FieldDoc(nearestHit.docID, 0.0f, sortValues);
  }
  final TotalHits total = new TotalHits(docsWithPoints, TotalHits.Relation.EQUAL_TO);
  return new TopFieldDocs(total, converted, null);
}
} }

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.sandbox.search; package org.apache.lucene.document;
import static org.apache.lucene.geo.GeoEncodingUtils.decodeLatitude; import static org.apache.lucene.geo.GeoEncodingUtils.decodeLatitude;
import static org.apache.lucene.geo.GeoEncodingUtils.decodeLongitude; import static org.apache.lucene.geo.GeoEncodingUtils.decodeLongitude;
@ -31,11 +31,7 @@ import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
import org.apache.lucene.util.SloppyMath; import org.apache.lucene.util.SloppyMath;
/** /** KNN search on top of 2D lat/lon indexed points. */
* KNN search on top of 2D lat/lon indexed points.
*
* @lucene.experimental
*/
class NearestNeighbor { class NearestNeighbor {
static class Cell implements Comparable<Cell> { static class Cell implements Comparable<Cell> {

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.sandbox.search; package org.apache.lucene.search;
import java.util.Arrays; import java.util.Arrays;
import java.util.Comparator; import java.util.Comparator;
@ -30,12 +30,6 @@ import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.SerialMergeScheduler; import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.geo.GeoTestUtil; import org.apache.lucene.tests.geo.GeoTestUtil;
import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.index.RandomIndexWriter;
@ -62,8 +56,7 @@ public class TestNearest extends LuceneTestCase {
// can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps // can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps
// with its own points impl: // with its own points impl:
IndexSearcher s = newSearcher(r, false); IndexSearcher s = newSearcher(r, false);
FieldDoc hit = FieldDoc hit = (FieldDoc) LatLonPoint.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
(FieldDoc) LatLonPointPrototypeQueries.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
assertEquals("0", r.document(hit.doc).getField("id").stringValue()); assertEquals("0", r.document(hit.doc).getField("id").stringValue());
r.close(); r.close();
@ -72,7 +65,7 @@ public class TestNearest extends LuceneTestCase {
// can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps // can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps
// with its own points impl: // with its own points impl:
s = newSearcher(r, false); s = newSearcher(r, false);
hit = (FieldDoc) LatLonPointPrototypeQueries.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0]; hit = (FieldDoc) LatLonPoint.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
assertEquals("1", r.document(hit.doc).getField("id").stringValue()); assertEquals("1", r.document(hit.doc).getField("id").stringValue());
r.close(); r.close();
w.close(); w.close();
@ -95,8 +88,7 @@ public class TestNearest extends LuceneTestCase {
// can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps // can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps
// with its own points impl: // with its own points impl:
IndexSearcher s = newSearcher(r, false); IndexSearcher s = newSearcher(r, false);
FieldDoc hit = FieldDoc hit = (FieldDoc) LatLonPoint.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
(FieldDoc) LatLonPointPrototypeQueries.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
assertEquals("0", r.document(hit.doc).getField("id").stringValue()); assertEquals("0", r.document(hit.doc).getField("id").stringValue());
r.close(); r.close();
@ -106,8 +98,7 @@ public class TestNearest extends LuceneTestCase {
// can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps // can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps
// with its own points impl: // with its own points impl:
s = newSearcher(r, false); s = newSearcher(r, false);
assertEquals( assertEquals(0, LatLonPoint.nearest(s, "point", 40.0, 50.0, 1).scoreDocs.length);
0, LatLonPointPrototypeQueries.nearest(s, "point", 40.0, 50.0, 1).scoreDocs.length);
r.close(); r.close();
w.close(); w.close();
dir.close(); dir.close();
@ -128,9 +119,7 @@ public class TestNearest extends LuceneTestCase {
DirectoryReader r = DirectoryReader.open(w); DirectoryReader r = DirectoryReader.open(w);
// can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps // can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps
// with its own points impl: // with its own points impl:
ScoreDoc[] hits = ScoreDoc[] hits = LatLonPoint.nearest(newSearcher(r, false), "point", 45.0, 50.0, 2).scoreDocs;
LatLonPointPrototypeQueries.nearest(newSearcher(r, false), "point", 45.0, 50.0, 2)
.scoreDocs;
assertEquals("0", r.document(hits[0].doc).getField("id").stringValue()); assertEquals("0", r.document(hits[0].doc).getField("id").stringValue());
assertEquals("1", r.document(hits[1].doc).getField("id").stringValue()); assertEquals("1", r.document(hits[1].doc).getField("id").stringValue());
@ -146,10 +135,7 @@ public class TestNearest extends LuceneTestCase {
// can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps // can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps
// with its own points impl: // with its own points impl:
assertEquals( assertEquals(
0, 0, LatLonPoint.nearest(newSearcher(r, false), "point", 40.0, 50.0, 1).scoreDocs.length);
LatLonPointPrototypeQueries.nearest(newSearcher(r, false), "point", 40.0, 50.0, 1)
.scoreDocs
.length);
r.close(); r.close();
w.close(); w.close();
dir.close(); dir.close();
@ -245,8 +231,7 @@ public class TestNearest extends LuceneTestCase {
topN, topN,
new Sort(LatLonDocValuesField.newDistanceSort("point", pointLat, pointLon))); new Sort(LatLonDocValuesField.newDistanceSort("point", pointLat, pointLon)));
ScoreDoc[] hits = ScoreDoc[] hits = LatLonPoint.nearest(s, "point", pointLat, pointLon, topN).scoreDocs;
LatLonPointPrototypeQueries.nearest(s, "point", pointLat, pointLon, topN).scoreDocs;
for (int i = 0; i < topN; i++) { for (int i = 0; i < topN; i++) {
FieldDoc expected = expectedHits[i]; FieldDoc expected = expectedHits[i];
FieldDoc expected2 = (FieldDoc) fieldDocs.scoreDocs[i]; FieldDoc expected2 = (FieldDoc) fieldDocs.scoreDocs[i];

View File

@ -1,117 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.sandbox.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
import org.apache.lucene.document.LatLonDocValuesField;
import org.apache.lucene.document.LatLonPoint;
import org.apache.lucene.geo.GeoUtils;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.SloppyMath;
/**
 * Holder class for prototype sandboxed queries.
 *
 * <p>When the query graduates from sandbox, these static calls should be placed in {@link
 * LatLonPoint}.
 *
 * @lucene.experimental
 */
public class LatLonPointPrototypeQueries {

  // no instance
  private LatLonPointPrototypeQueries() {}

  /**
   * Returns the {@code n} indexed points that lie closest to the supplied location, measured by
   * Haversine distance.
   *
   * <p>The outcome is functionally equivalent to running {@link MatchAllDocsQuery} with a {@link
   * LatLonDocValuesField#newDistanceSort}, but it is far more efficient because it takes
   * advantage of properties of the indexed BKD tree. Currently this only works with {@link
   * Lucene90PointsFormat} (used by the default codec). Multi-valued fields are currently not
   * de-duplicated: a document with several instances of the field that all make it into the top
   * n is returned once per such instance.
   *
   * <p>Hits are ordered by ascending distance from the location. Every hit is a {@link FieldDoc}
   * created with a score of {@code 0.0f} and a single field value: a Double instance holding the
   * distance in meters.
   *
   * @param searcher IndexSearcher to find nearest points from.
   * @param field field name. must not be null.
   * @param latitude latitude at the center: must be within standard +/-90 coordinate bounds.
   * @param longitude longitude at the center: must be within standard +/-180 coordinate bounds.
   * @param n the number of nearest neighbors to retrieve.
   * @return TopFieldDocs containing documents ordered by distance, where the field value for
   *     each {@link FieldDoc} is the distance in meters
   * @throws IllegalArgumentException if the underlying PointValues is not a {@code
   *     Lucene60PointsReader} (this is a current limitation; that check is not made in this
   *     method itself), or if {@code field} or {@code searcher} is null, or if {@code latitude},
   *     {@code longitude} or {@code n} are out-of-bounds
   * @throws IOException if an IOException occurs while finding the points.
   */
  // TODO: what about multi-valued documents? what happens?
  public static TopFieldDocs nearest(
      IndexSearcher searcher, String field, double latitude, double longitude, int n)
      throws IOException {
    // Argument validation: coordinates first, then n, field and searcher.
    GeoUtils.checkLatitude(latitude);
    GeoUtils.checkLongitude(longitude);
    if (n <= 0) {
      throw new IllegalArgumentException("n must be at least 1; got " + n);
    }
    if (null == field) {
      throw new IllegalArgumentException("field must not be null");
    }
    if (null == searcher) {
      throw new IllegalArgumentException("searcher must not be null");
    }

    // One entry per segment that has points for the field, kept in three parallel lists.
    final List<PointValues> perLeafPoints = new ArrayList<>();
    final List<Integer> perLeafDocBase = new ArrayList<>();
    final List<Bits> perLeafLiveDocs = new ArrayList<>();
    int hitCount = 0;
    for (LeafReaderContext context : searcher.getIndexReader().leaves()) {
      final PointValues leafPoints = context.reader().getPointValues(field);
      if (leafPoints == null) {
        // No point values for this field in the segment: nothing to contribute.
        continue;
      }
      hitCount += leafPoints.getDocCount();
      perLeafPoints.add(leafPoints);
      perLeafDocBase.add(context.docBase);
      perLeafLiveDocs.add(context.reader().getLiveDocs());
    }

    final NearestNeighbor.NearestHit[] found =
        NearestNeighbor.nearest(
            latitude, longitude, perLeafPoints, perLeafLiveDocs, perLeafDocBase, n);

    // Convert to TopFieldDocs: the sort key of each hit is turned into meters.
    final ScoreDoc[] docs = new ScoreDoc[found.length];
    int next = 0;
    for (NearestNeighbor.NearestHit candidate : found) {
      final double meters = SloppyMath.haversinMeters(candidate.distanceSortKey);
      docs[next++] = new FieldDoc(candidate.docID, 0.0f, new Object[] {Double.valueOf(meters)});
    }
    final TotalHits total = new TotalHits(hitCount, TotalHits.Relation.EQUAL_TO);
    return new TopFieldDocs(total, docs, null);
  }
}

View File

@ -20,6 +20,7 @@ import java.util.Arrays;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.document.FloatPoint; import org.apache.lucene.document.FloatPoint;
import org.apache.lucene.document.LatLonPoint;
import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField; import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DirectoryReader;
@ -28,7 +29,6 @@ import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.PointValues; import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.SerialMergeScheduler; import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.sandbox.search.LatLonPointPrototypeQueries;
import org.apache.lucene.search.FieldDoc; import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.ScoreDoc;
@ -66,7 +66,7 @@ public class TestFloatPointNearestNeighbor extends LuceneTestCase {
// can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps // can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps
// with its own points impl: // with its own points impl:
s = newSearcher(r, false); s = newSearcher(r, false);
hit = (FieldDoc) LatLonPointPrototypeQueries.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0]; hit = (FieldDoc) LatLonPoint.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
assertEquals("1", r.document(hit.doc).getField("id").stringValue()); assertEquals("1", r.document(hit.doc).getField("id").stringValue());
r.close(); r.close();
w.close(); w.close();