LUCENE-10266 Move nearest-neighbor search on points to core (#897)

Co-authored-by: Rushabh Shah <shahrs87@apache.org>
2022-06-13 09:41:39 -07:00 · 2022-06-13 09:41:39 -07:00 · fcd98fd337
parent d850a22a51
commit fcd98fd337
6 changed files with 94 additions and 149 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -25,7 +25,9 @@ API Changes
 * LUCENE-10436: Remove deprecated DocValuesFieldExistsQuery, NormsFieldExistsQuery and
  KnnVectorFieldExistsQuery. (Zach Chen, Adrien Grand)
-* LUCENE-10561 Reduce class/member visibility of all normalizer and stemmer classes. (Rushabh Shah)
+* LUCENE-10561: Reduce class/member visibility of all normalizer and stemmer classes. (Rushabh Shah)
 * LUCENE-10266: Move nearest-neighbor search on points to core. (Rushabh Shah)
 New Features
 ---------------------
--- a/lucene/core/src/java/org/apache/lucene/document/LatLonPoint.java
+++ b/lucene/core/src/java/org/apache/lucene/document/LatLonPoint.java
@ -23,23 +23,36 @@ import static org.apache.lucene.geo.GeoEncodingUtils.encodeLatitudeCeil;
 import static org.apache.lucene.geo.GeoEncodingUtils.encodeLongitude;
 import static org.apache.lucene.geo.GeoEncodingUtils.encodeLongitudeCeil;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import org.apache.lucene.geo.Circle;
 import org.apache.lucene.geo.GeoUtils;
 import org.apache.lucene.geo.LatLonGeometry;
 import org.apache.lucene.geo.Point;
 import org.apache.lucene.geo.Polygon;
 import org.apache.lucene.geo.Rectangle;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.PointValues;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.BoostQuery;
 import org.apache.lucene.search.ConstantScoreQuery;
 import org.apache.lucene.search.FieldDoc;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.PointRangeQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TopFieldDocs;
 import org.apache.lucene.search.TotalHits;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.NumericUtils;
 import org.apache.lucene.util.SloppyMath;
 /**
 * An indexed location field.
@ -362,4 +375,70 @@ public class LatLonPoint extends Field {
    }
    return query;
  }
  /**
   * Retrieves the {@code n} indexed points that lie closest to the supplied location, using
   * Haversine distance as the measure.
   *
   * <p>The outcome is functionally equivalent to running {@link MatchAllDocsQuery} sorted with
   * {@link LatLonDocValuesField#newDistanceSort}, but is far more efficient because it exploits
   * properties of the indexed BKD tree. Multi-valued fields are currently not de-duplicated: a
   * document with several instances of the field among the top {@code n} appears once per such
   * instance.
   *
   * <p>Hits are ordered by ascending distance from the location. Every hit is a {@link FieldDoc}
   * whose single field value is a {@code Double} holding the distance in meters.
   *
   * @param searcher IndexSearcher to find nearest points from; must not be null.
   * @param field field name; must not be null.
   * @param latitude latitude at the center: must be within standard +/-90 coordinate bounds.
   * @param longitude longitude at the center: must be within standard +/-180 coordinate bounds.
   * @param n the number of nearest neighbors to retrieve; must be at least 1.
   * @return TopFieldDocs containing documents ordered by distance, where the field value for each
   *     {@link FieldDoc} is the distance in meters
   * @throws IllegalArgumentException if {@code field} or {@code searcher} is null, or if {@code
   *     latitude}, {@code longitude} or {@code n} are out-of-bounds
   * @throws IOException if an IOException occurs while finding the points.
   */
  // TODO: what about multi-valued documents? what happens?
  public static TopFieldDocs nearest(
      IndexSearcher searcher, String field, double latitude, double longitude, int n)
      throws IOException {
    // Argument validation; coordinates first, then n, field and searcher (in that order).
    GeoUtils.checkLatitude(latitude);
    GeoUtils.checkLongitude(longitude);
    if (n < 1) {
      throw new IllegalArgumentException("n must be at least 1; got " + n);
    }
    if (field == null) {
      throw new IllegalArgumentException("field must not be null");
    }
    if (searcher == null) {
      throw new IllegalArgumentException("searcher must not be null");
    }

    // Per-leaf inputs for the search. The three lists are index-aligned: entry i of each list
    // belongs to the same leaf.
    List<PointValues> leafPoints = new ArrayList<>();
    List<Integer> leafDocBases = new ArrayList<>();
    List<Bits> leafLiveDocs = new ArrayList<>();
    int docsWithPoints = 0;
    for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) {
      PointValues values = ctx.reader().getPointValues(field);
      if (values == null) {
        // No point values were returned for this field in this leaf; skip it.
        continue;
      }
      docsWithPoints += values.getDocCount();
      leafPoints.add(values);
      leafDocBases.add(ctx.docBase);
      leafLiveDocs.add(ctx.reader().getLiveDocs());
    }

    NearestNeighbor.NearestHit[] nearestHits =
        NearestNeighbor.nearest(latitude, longitude, leafPoints, leafLiveDocs, leafDocBases, n);

    // Convert to TopFieldDocs: the sort key of each hit is turned into a distance in meters.
    ScoreDoc[] scoreDocs = new ScoreDoc[nearestHits.length];
    int slot = 0;
    for (NearestNeighbor.NearestHit nearestHit : nearestHits) {
      double meters = SloppyMath.haversinMeters(nearestHit.distanceSortKey);
      scoreDocs[slot++] =
          new FieldDoc(nearestHit.docID, 0.0f, new Object[] {Double.valueOf(meters)});
    }

    // The reported total is the sum of getDocCount() over the leaves that were searched.
    TotalHits total = new TotalHits(docsWithPoints, TotalHits.Relation.EQUAL_TO);
    return new TopFieldDocs(total, scoreDocs, null);
  }
 }
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/NearestNeighbor.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/NearestNeighbor.java
@ -14,7 +14,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-package org.apache.lucene.sandbox.search;
+package org.apache.lucene.document;
 import static org.apache.lucene.geo.GeoEncodingUtils.decodeLatitude;
 import static org.apache.lucene.geo.GeoEncodingUtils.decodeLongitude;
@ -31,11 +31,7 @@ import org.apache.lucene.index.PointValues.Relation;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.SloppyMath;
-/**
+/** KNN search on top of 2D lat/lon indexed points. */
 * KNN search on top of 2D lat/lon indexed points.
 *
 * @lucene.experimental
 */
 class NearestNeighbor {
  static class Cell implements Comparable<Cell> {
--- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestNearest.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestNearest.java
@ -14,7 +14,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-package org.apache.lucene.sandbox.search;
+package org.apache.lucene.search;
 import java.util.Arrays;
 import java.util.Comparator;
@ -30,12 +30,6 @@ import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.SerialMergeScheduler;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.FieldDoc;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.TopFieldDocs;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.tests.geo.GeoTestUtil;
 import org.apache.lucene.tests.index.RandomIndexWriter;
@ -62,8 +56,7 @@ public class TestNearest extends LuceneTestCase {
    // can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps
    // with its own points impl:
    IndexSearcher s = newSearcher(r, false);
-    FieldDoc hit =
+    FieldDoc hit = (FieldDoc) LatLonPoint.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
        (FieldDoc) LatLonPointPrototypeQueries.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
    assertEquals("0", r.document(hit.doc).getField("id").stringValue());
    r.close();
@ -72,7 +65,7 @@ public class TestNearest extends LuceneTestCase {
    // can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps
    // with its own points impl:
    s = newSearcher(r, false);
-    hit = (FieldDoc) LatLonPointPrototypeQueries.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
+    hit = (FieldDoc) LatLonPoint.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
    assertEquals("1", r.document(hit.doc).getField("id").stringValue());
    r.close();
    w.close();
@ -95,8 +88,7 @@ public class TestNearest extends LuceneTestCase {
    // can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps
    // with its own points impl:
    IndexSearcher s = newSearcher(r, false);
-    FieldDoc hit =
+    FieldDoc hit = (FieldDoc) LatLonPoint.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
        (FieldDoc) LatLonPointPrototypeQueries.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
    assertEquals("0", r.document(hit.doc).getField("id").stringValue());
    r.close();
@ -106,8 +98,7 @@ public class TestNearest extends LuceneTestCase {
    // can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps
    // with its own points impl:
    s = newSearcher(r, false);
-    assertEquals(
+    assertEquals(0, LatLonPoint.nearest(s, "point", 40.0, 50.0, 1).scoreDocs.length);
        0, LatLonPointPrototypeQueries.nearest(s, "point", 40.0, 50.0, 1).scoreDocs.length);
    r.close();
    w.close();
    dir.close();
@ -128,9 +119,7 @@ public class TestNearest extends LuceneTestCase {
    DirectoryReader r = DirectoryReader.open(w);
    // can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps
    // with its own points impl:
-    ScoreDoc[] hits =
+    ScoreDoc[] hits = LatLonPoint.nearest(newSearcher(r, false), "point", 45.0, 50.0, 2).scoreDocs;
        LatLonPointPrototypeQueries.nearest(newSearcher(r, false), "point", 45.0, 50.0, 2)
            .scoreDocs;
    assertEquals("0", r.document(hits[0].doc).getField("id").stringValue());
    assertEquals("1", r.document(hits[1].doc).getField("id").stringValue());
@ -146,10 +135,7 @@ public class TestNearest extends LuceneTestCase {
    // can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps
    // with its own points impl:
    assertEquals(
-        0,
+        0, LatLonPoint.nearest(newSearcher(r, false), "point", 40.0, 50.0, 1).scoreDocs.length);
        LatLonPointPrototypeQueries.nearest(newSearcher(r, false), "point", 40.0, 50.0, 1)
            .scoreDocs
            .length);
    r.close();
    w.close();
    dir.close();
@ -245,8 +231,7 @@ public class TestNearest extends LuceneTestCase {
              topN,
              new Sort(LatLonDocValuesField.newDistanceSort("point", pointLat, pointLon)));
-      ScoreDoc[] hits =
+      ScoreDoc[] hits = LatLonPoint.nearest(s, "point", pointLat, pointLon, topN).scoreDocs;
          LatLonPointPrototypeQueries.nearest(s, "point", pointLat, pointLon, topN).scoreDocs;
      for (int i = 0; i < topN; i++) {
        FieldDoc expected = expectedHits[i];
        FieldDoc expected2 = (FieldDoc) fieldDocs.scoreDocs[i];
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/LatLonPointPrototypeQueries.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/LatLonPointPrototypeQueries.java
@ -1,117 +0,0 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.sandbox.search;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat;
 import org.apache.lucene.document.LatLonDocValuesField;
 import org.apache.lucene.document.LatLonPoint;
 import org.apache.lucene.geo.GeoUtils;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.PointValues;
 import org.apache.lucene.search.FieldDoc;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TopFieldDocs;
 import org.apache.lucene.search.TotalHits;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.SloppyMath;
 /**
 * Holder for prototype queries that still live in the sandbox.
 *
 * <p>Once a query graduates from sandbox, its static entry point should be placed in {@link
 * LatLonPoint}.
 *
 * @lucene.experimental
 */
 public class LatLonPointPrototypeQueries {
  // static-only holder: no instance
  private LatLonPointPrototypeQueries() {}

  /**
   * Retrieves the {@code n} indexed points that lie closest to the supplied location, using
   * Haversine distance as the measure.
   *
   * <p>The outcome is functionally equivalent to running {@link MatchAllDocsQuery} sorted with
   * {@link LatLonDocValuesField#newDistanceSort}, but is far more efficient because it exploits
   * properties of the indexed BKD tree. This has been documented as currently only working with
   * {@link Lucene90PointsFormat} (used by the default codec); this method itself performs no such
   * check. Multi-valued fields are currently not de-duplicated: a document with several instances
   * of the field among the top {@code n} appears once per such instance.
   *
   * <p>Hits are ordered by ascending distance from the location. Every hit is a {@link FieldDoc}
   * whose single field value is a {@code Double} holding the distance in meters.
   *
   * @param searcher IndexSearcher to find nearest points from; must not be null.
   * @param field field name; must not be null.
   * @param latitude latitude at the center: must be within standard +/-90 coordinate bounds.
   * @param longitude longitude at the center: must be within standard +/-180 coordinate bounds.
   * @param n the number of nearest neighbors to retrieve; must be at least 1.
   * @return TopFieldDocs containing documents ordered by distance, where the field value for each
   *     {@link FieldDoc} is the distance in meters
   * @throws IllegalArgumentException if {@code field} or {@code searcher} is null, or if {@code
   *     latitude}, {@code longitude} or {@code n} are out-of-bounds; also documented as thrown
   *     when the underlying PointValues is not a {@code Lucene60PointsReader} (a stated
   *     limitation), although no such check is made directly in this method
   * @throws IOException if an IOException occurs while finding the points.
   */
  // TODO: what about multi-valued documents? what happens?
  public static TopFieldDocs nearest(
      IndexSearcher searcher, String field, double latitude, double longitude, int n)
      throws IOException {
    // Argument validation; coordinates first, then n, field and searcher (in that order).
    GeoUtils.checkLatitude(latitude);
    GeoUtils.checkLongitude(longitude);
    if (n < 1) {
      throw new IllegalArgumentException("n must be at least 1; got " + n);
    }
    if (field == null) {
      throw new IllegalArgumentException("field must not be null");
    }
    if (searcher == null) {
      throw new IllegalArgumentException("searcher must not be null");
    }

    // Per-leaf inputs for the search; entry i of each list belongs to the same leaf.
    List<PointValues> leafPoints = new ArrayList<>();
    List<Integer> leafDocBases = new ArrayList<>();
    List<Bits> leafLiveDocs = new ArrayList<>();
    int docsWithPoints = 0;
    for (LeafReaderContext ctx : searcher.getIndexReader().leaves()) {
      PointValues values = ctx.reader().getPointValues(field);
      if (values == null) {
        // No point values were returned for this field in this leaf; skip it.
        continue;
      }
      docsWithPoints += values.getDocCount();
      leafPoints.add(values);
      leafDocBases.add(ctx.docBase);
      leafLiveDocs.add(ctx.reader().getLiveDocs());
    }

    NearestNeighbor.NearestHit[] nearestHits =
        NearestNeighbor.nearest(latitude, longitude, leafPoints, leafLiveDocs, leafDocBases, n);

    // The reported total is the sum of getDocCount() over the leaves that were searched.
    TotalHits total = new TotalHits(docsWithPoints, TotalHits.Relation.EQUAL_TO);
    return new TopFieldDocs(total, toScoreDocs(nearestHits), null);
  }

  /** Converts nearest-neighbor hits to {@link FieldDoc}s carrying the distance in meters. */
  private static ScoreDoc[] toScoreDocs(NearestNeighbor.NearestHit[] nearestHits) {
    ScoreDoc[] scoreDocs = new ScoreDoc[nearestHits.length];
    int slot = 0;
    for (NearestNeighbor.NearestHit nearestHit : nearestHits) {
      double meters = SloppyMath.haversinMeters(nearestHit.distanceSortKey);
      scoreDocs[slot++] =
          new FieldDoc(nearestHit.docID, 0.0f, new Object[] {Double.valueOf(meters)});
    }
    return scoreDocs;
  }
 }
--- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/document/TestFloatPointNearestNeighbor.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/document/TestFloatPointNearestNeighbor.java
@ -20,6 +20,7 @@ import java.util.Arrays;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FloatPoint;
 import org.apache.lucene.document.LatLonPoint;
 import org.apache.lucene.document.StoredField;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.DirectoryReader;
@ -28,7 +29,6 @@ import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.PointValues;
 import org.apache.lucene.index.SerialMergeScheduler;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.sandbox.search.LatLonPointPrototypeQueries;
 import org.apache.lucene.search.FieldDoc;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.ScoreDoc;
@ -66,7 +66,7 @@ public class TestFloatPointNearestNeighbor extends LuceneTestCase {
    // can't wrap because we require Lucene60PointsFormat directly but e.g. ParallelReader wraps
    // with its own points impl:
    s = newSearcher(r, false);
-    hit = (FieldDoc) LatLonPointPrototypeQueries.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
+    hit = (FieldDoc) LatLonPoint.nearest(s, "point", 40.0, 50.0, 1).scoreDocs[0];
    assertEquals("1", r.document(hit.doc).getField("id").stringValue());
    r.close();
    w.close();