mirror of https://github.com/apache/lucene.git
LUCENE-10266 Move nearest-neighbor search on points to core (#897)
Co-authored-by: Rushabh Shah <shahrs87@apache.org>
This commit is contained in:
parent
d850a22a51
commit
fcd98fd337
|
@ -25,7 +25,9 @@ API Changes
|
|||
* LUCENE-10436: Remove deprecated DocValuesFieldExistsQuery, NormsFieldExistsQuery and
|
||||
KnnVectorFieldExistsQuery. (Zach Chen, Adrien Grand)
|
||||
|
||||
* LUCENE-10561 Reduce class/member visibility of all normalizer and stemmer classes. (Rushabh Shah)
|
||||
* LUCENE-10561: Reduce class/member visibility of all normalizer and stemmer classes. (Rushabh Shah)
|
||||
|
||||
* LUCENE-10266: Move nearest-neighbor search on points to core. (Rushabh Shah)
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
|
|
|
@ -23,23 +23,36 @@ import static org.apache.lucene.geo.GeoEncodingUtils.encodeLatitudeCeil;
|
|||
import static org.apache.lucene.geo.GeoEncodingUtils.encodeLongitude;
|
||||
import static org.apache.lucene.geo.GeoEncodingUtils.encodeLongitudeCeil;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.geo.Circle;
|
||||
import org.apache.lucene.geo.GeoUtils;
|
||||
import org.apache.lucene.geo.LatLonGeometry;
|
||||
import org.apache.lucene.geo.Point;
|
||||
import org.apache.lucene.geo.Polygon;
|
||||
import org.apache.lucene.geo.Rectangle;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.PointValues;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.BoostQuery;
|
||||
import org.apache.lucene.search.ConstantScoreQuery;
|
||||
import org.apache.lucene.search.FieldDoc;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||
import org.apache.lucene.search.MatchNoDocsQuery;
|
||||
import org.apache.lucene.search.PointRangeQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.TopFieldDocs;
|
||||
import org.apache.lucene.search.TotalHits;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.NumericUtils;
|
||||
import org.apache.lucene.util.SloppyMath;
|
||||
|
||||
/**
|
||||
* An indexed location field.
|
||||
|
@ -362,4 +375,70 @@ public class LatLonPoint extends Field {
|
|||
}
|
||||
return query;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the {@code n} nearest indexed points to the provided point, according to Haversine
|
||||
* distance.
|
||||
*
|
||||
* <p>This is functionally equivalent to running {@link MatchAllDocsQuery} with a {@link
|
||||
* LatLonDocValuesField#newDistanceSort}, but is far more efficient since it takes advantage of
|
||||
* properties of the indexed BKD tree. Multi-valued fields are currently not de-duplicated, so if a
|
||||
* document had multiple instances of the specified field that make it into the top n, that
|
||||
* document will appear more than once.
|
||||
*
|
||||
* <p>Documents are ordered by ascending distance from the location. The value returned in {@link
|
||||
* FieldDoc} for the hits contains a Double instance with the distance in meters.
|
||||
*
|
||||
* @param searcher IndexSearcher to find nearest points from.
|
||||
* @param field field name. must not be null.
|
||||
* @param latitude latitude at the center: must be within standard +/-90 coordinate bounds.
|
||||
* @param longitude longitude at the center: must be within standard +/-180 coordinate bounds.
|
||||
* @param n the number of nearest neighbors to retrieve.
|
||||
* @return TopFieldDocs containing documents ordered by distance, where the field value for each
|
||||
* {@link FieldDoc} is the distance in meters
|
||||
* @throws IllegalArgumentException if {@code field} or {@code searcher} is null, or if {@code
|
||||
* latitude}, {@code longitude} or {@code n} are out-of-bounds
|
||||
* @throws IOException if an IOException occurs while finding the points.
|
||||
*/
|
||||
// TODO: what about multi-valued documents? what happens?
|
||||
public static TopFieldDocs nearest(
|
||||
IndexSearcher searcher, String field, double latitude, double longitude, int n)
|
||||
throws IOException {
|
||||
GeoUtils.checkLatitude(latitude);
|
||||
GeoUtils.checkLongitude(longitude);
|
||||
if (n < 1) {
|
||||
throw new IllegalArgumentException("n must be at least 1; got " + n);
|
||||
}
|
||||
if (field == null) {
|
||||
throw new IllegalArgumentException("field must not be null");
|
||||
}
|
||||
if (searcher == null) {
|
||||
throw new IllegalArgumentException("searcher must not be null");
|
||||
}
|
||||
List<PointValues> readers = new ArrayList<>();
|
||||
List<Integer> docBases = new ArrayList<>();
|
||||
List<Bits> liveDocs = new ArrayList<>();
|
||||
int totalHits = 0;
|
||||
for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) {
|
||||
PointValues points = leaf.reader().getPointValues(field);
|
||||
if (points != null) {
|
||||
totalHits += points.getDocCount();
|
||||
readers.add(points);
|
||||
docBases.add(leaf.docBase);
|
||||
liveDocs.add(leaf.reader().getLiveDocs());
|
||||
}
|
||||
}
|
||||
|
||||
NearestNeighbor.NearestHit[] hits =
|
||||
NearestNeighbor.nearest(latitude, longitude, readers, liveDocs, docBases, n);
|
||||
|
||||
// Convert to TopFieldDocs:
|
||||
ScoreDoc[] scoreDocs = new ScoreDoc[hits.length];
|
||||
for (int i = 0; i < hits.length; i++) {
|
||||
NearestNeighbor.NearestHit hit = hits[i];
|
||||
double hitDistance = SloppyMath.haversinMeters(hit.distanceSortKey);
|
||||
scoreDocs[i] = new FieldDoc(hit.docID, 0.0f, new Object[] {Double.valueOf(hitDistance)});
|
||||
}
|
||||
return new TopFieldDocs(new TotalHits(totalHits, TotalHits.Relation.EQUAL_TO), scoreDocs, null);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.sandbox.search;
|
||||
package org.apache.lucene.document;
|
||||
|
||||
import static org.apache.lucene.geo.GeoEncodingUtils.decodeLatitude;
|
||||
import static org.apache.lucene.geo.GeoEncodingUtils.decodeLongitude;
|
||||
|
@ -31,11 +31,7 @@ import org.apache.lucene.index.PointValues.Relation;
|
|||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.SloppyMath;
|
||||
|
||||
/**
|
||||
* KNN search on top of 2D lat/lon indexed points.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
/** KNN search on top of 2D lat/lon indexed points. */
|
||||
class NearestNeighbor {
|
||||
|
||||
static class Cell implements Comparable<Cell> {
|
|
@ -14,7 +14,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.sandbox.search;
|
||||
package org.apache.lucene.search;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
|
@ -30,12 +30,6 @@ import org.apache.lucene.index.IndexWriter;
|
|||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.SerialMergeScheduler;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.FieldDoc;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.TopFieldDocs;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.tests.geo.GeoTestUtil;
|
||||
import org.apache.lucene.tests.index.RandomIndexWriter;
|
||||
|
@ -62,8 +56,7 @@ public class TestNearest extends LuceneTestCase {
|
|||
// can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps
|
||||
// with its own points impl:
|
||||
IndexSearcher s = newSearcher(r, false);
|
||||
FieldDoc hit =
|
||||
(FieldDoc) LatLonPointPrototypeQueries.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
|
||||
FieldDoc hit = (FieldDoc) LatLonPoint.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
|
||||
assertEquals("0", r.document(hit.doc).getField("id").stringValue());
|
||||
r.close();
|
||||
|
||||
|
@ -72,7 +65,7 @@ public class TestNearest extends LuceneTestCase {
|
|||
// can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps
|
||||
// with its own points impl:
|
||||
s = newSearcher(r, false);
|
||||
hit = (FieldDoc) LatLonPointPrototypeQueries.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
|
||||
hit = (FieldDoc) LatLonPoint.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
|
||||
assertEquals("1", r.document(hit.doc).getField("id").stringValue());
|
||||
r.close();
|
||||
w.close();
|
||||
|
@ -95,8 +88,7 @@ public class TestNearest extends LuceneTestCase {
|
|||
// can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps
|
||||
// with its own points impl:
|
||||
IndexSearcher s = newSearcher(r, false);
|
||||
FieldDoc hit =
|
||||
(FieldDoc) LatLonPointPrototypeQueries.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
|
||||
FieldDoc hit = (FieldDoc) LatLonPoint.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
|
||||
assertEquals("0", r.document(hit.doc).getField("id").stringValue());
|
||||
r.close();
|
||||
|
||||
|
@ -106,8 +98,7 @@ public class TestNearest extends LuceneTestCase {
|
|||
// can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps
|
||||
// with its own points impl:
|
||||
s = newSearcher(r, false);
|
||||
assertEquals(
|
||||
0, LatLonPointPrototypeQueries.nearest(s, "point", 40.0, 50.0, 1).scoreDocs.length);
|
||||
assertEquals(0, LatLonPoint.nearest(s, "point", 40.0, 50.0, 1).scoreDocs.length);
|
||||
r.close();
|
||||
w.close();
|
||||
dir.close();
|
||||
|
@ -128,9 +119,7 @@ public class TestNearest extends LuceneTestCase {
|
|||
DirectoryReader r = DirectoryReader.open(w);
|
||||
// can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps
|
||||
// with its own points impl:
|
||||
ScoreDoc[] hits =
|
||||
LatLonPointPrototypeQueries.nearest(newSearcher(r, false), "point", 45.0, 50.0, 2)
|
||||
.scoreDocs;
|
||||
ScoreDoc[] hits = LatLonPoint.nearest(newSearcher(r, false), "point", 45.0, 50.0, 2).scoreDocs;
|
||||
assertEquals("0", r.document(hits[0].doc).getField("id").stringValue());
|
||||
assertEquals("1", r.document(hits[1].doc).getField("id").stringValue());
|
||||
|
||||
|
@ -146,10 +135,7 @@ public class TestNearest extends LuceneTestCase {
|
|||
// can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps
|
||||
// with its own points impl:
|
||||
assertEquals(
|
||||
0,
|
||||
LatLonPointPrototypeQueries.nearest(newSearcher(r, false), "point", 40.0, 50.0, 1)
|
||||
.scoreDocs
|
||||
.length);
|
||||
0, LatLonPoint.nearest(newSearcher(r, false), "point", 40.0, 50.0, 1).scoreDocs.length);
|
||||
r.close();
|
||||
w.close();
|
||||
dir.close();
|
||||
|
@ -245,8 +231,7 @@ public class TestNearest extends LuceneTestCase {
|
|||
topN,
|
||||
new Sort(LatLonDocValuesField.newDistanceSort("point", pointLat, pointLon)));
|
||||
|
||||
ScoreDoc[] hits =
|
||||
LatLonPointPrototypeQueries.nearest(s, "point", pointLat, pointLon, topN).scoreDocs;
|
||||
ScoreDoc[] hits = LatLonPoint.nearest(s, "point", pointLat, pointLon, topN).scoreDocs;
|
||||
for (int i = 0; i < topN; i++) {
|
||||
FieldDoc expected = expectedHits[i];
|
||||
FieldDoc expected2 = (FieldDoc) fieldDocs.scoreDocs[i];
|
|
@ -1,117 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.sandbox.search;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
|
||||
import org.apache.lucene.document.LatLonDocValuesField;
|
||||
import org.apache.lucene.document.LatLonPoint;
|
||||
import org.apache.lucene.geo.GeoUtils;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.PointValues;
|
||||
import org.apache.lucene.search.FieldDoc;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.TopFieldDocs;
|
||||
import org.apache.lucene.search.TotalHits;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.SloppyMath;
|
||||
|
||||
/**
|
||||
* Holder class for prototype sandboxed queries
|
||||
*
|
||||
* <p>When the query graduates from sandbox, these static calls should be placed in {@link
|
||||
* LatLonPoint}
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class LatLonPointPrototypeQueries {
|
||||
|
||||
// no instance
|
||||
private LatLonPointPrototypeQueries() {}
|
||||
|
||||
/**
|
||||
* Finds the {@code n} nearest indexed points to the provided point, according to Haversine
|
||||
* distance.
|
||||
*
|
||||
* <p>This is functionally equivalent to running {@link MatchAllDocsQuery} with a {@link
|
||||
* LatLonDocValuesField#newDistanceSort}, but is far more efficient since it takes advantage of
|
||||
* properties of the indexed BKD tree. Currently this only works with {@link Lucene90PointsFormat}
|
||||
* (used by the default codec). Multi-valued fields are currently not de-duplicated, so if a
|
||||
* document had multiple instances of the specified field that make it into the top n, that
|
||||
* document will appear more than once.
|
||||
*
|
||||
* <p>Documents are ordered by ascending distance from the location. The value returned in {@link
|
||||
* FieldDoc} for the hits contains a Double instance with the distance in meters.
|
||||
*
|
||||
* @param searcher IndexSearcher to find nearest points from.
|
||||
* @param field field name. must not be null.
|
||||
* @param latitude latitude at the center: must be within standard +/-90 coordinate bounds.
|
||||
* @param longitude longitude at the center: must be within standard +/-180 coordinate bounds.
|
||||
* @param n the number of nearest neighbors to retrieve.
|
||||
* @return TopFieldDocs containing documents ordered by distance, where the field value for each
|
||||
* {@link FieldDoc} is the distance in meters
|
||||
* @throws IllegalArgumentException if the underlying PointValues is not a {@code
|
||||
* Lucene60PointsReader} (this is a current limitation), or if {@code field} or {@code
|
||||
* searcher} is null, or if {@code latitude}, {@code longitude} or {@code n} are out-of-bounds
|
||||
* @throws IOException if an IOException occurs while finding the points.
|
||||
*/
|
||||
// TODO: what about multi-valued documents? what happens?
|
||||
public static TopFieldDocs nearest(
|
||||
IndexSearcher searcher, String field, double latitude, double longitude, int n)
|
||||
throws IOException {
|
||||
GeoUtils.checkLatitude(latitude);
|
||||
GeoUtils.checkLongitude(longitude);
|
||||
if (n < 1) {
|
||||
throw new IllegalArgumentException("n must be at least 1; got " + n);
|
||||
}
|
||||
if (field == null) {
|
||||
throw new IllegalArgumentException("field must not be null");
|
||||
}
|
||||
if (searcher == null) {
|
||||
throw new IllegalArgumentException("searcher must not be null");
|
||||
}
|
||||
List<PointValues> readers = new ArrayList<>();
|
||||
List<Integer> docBases = new ArrayList<>();
|
||||
List<Bits> liveDocs = new ArrayList<>();
|
||||
int totalHits = 0;
|
||||
for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) {
|
||||
PointValues points = leaf.reader().getPointValues(field);
|
||||
if (points != null) {
|
||||
totalHits += points.getDocCount();
|
||||
readers.add(points);
|
||||
docBases.add(leaf.docBase);
|
||||
liveDocs.add(leaf.reader().getLiveDocs());
|
||||
}
|
||||
}
|
||||
|
||||
NearestNeighbor.NearestHit[] hits =
|
||||
NearestNeighbor.nearest(latitude, longitude, readers, liveDocs, docBases, n);
|
||||
|
||||
// Convert to TopFieldDocs:
|
||||
ScoreDoc[] scoreDocs = new ScoreDoc[hits.length];
|
||||
for (int i = 0; i < hits.length; i++) {
|
||||
NearestNeighbor.NearestHit hit = hits[i];
|
||||
double hitDistance = SloppyMath.haversinMeters(hit.distanceSortKey);
|
||||
scoreDocs[i] = new FieldDoc(hit.docID, 0.0f, new Object[] {Double.valueOf(hitDistance)});
|
||||
}
|
||||
return new TopFieldDocs(new TotalHits(totalHits, TotalHits.Relation.EQUAL_TO), scoreDocs, null);
|
||||
}
|
||||
}
|
|
@ -20,6 +20,7 @@ import java.util.Arrays;
|
|||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.FloatPoint;
|
||||
import org.apache.lucene.document.LatLonPoint;
|
||||
import org.apache.lucene.document.StoredField;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
|
@ -28,7 +29,6 @@ import org.apache.lucene.index.IndexWriterConfig;
|
|||
import org.apache.lucene.index.PointValues;
|
||||
import org.apache.lucene.index.SerialMergeScheduler;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.sandbox.search.LatLonPointPrototypeQueries;
|
||||
import org.apache.lucene.search.FieldDoc;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
|
@ -66,7 +66,7 @@ public class TestFloatPointNearestNeighbor extends LuceneTestCase {
|
|||
// can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps
|
||||
// with its own points impl:
|
||||
s = newSearcher(r, false);
|
||||
hit = (FieldDoc) LatLonPointPrototypeQueries.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
|
||||
hit = (FieldDoc) LatLonPoint.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
|
||||
assertEquals("1", r.document(hit.doc).getField("id").stringValue());
|
||||
r.close();
|
||||
w.close();
|
||||
|
|
Loading…
Reference in New Issue