mirror of https://github.com/apache/lucene.git
LUCENE-8952: Use a sort key instead of true distance in NearestNeighbor. (#832)
This commit is contained in:
parent
e94a7b05fb
commit
f3c7bbf79e
|
@ -50,6 +50,8 @@ Improvements
|
|||
|
||||
* SOLR-13663: Introduce <SpanPositionRange> into XML Query Parser (Alessandro Benedetti via Mikhail Khludnev)
|
||||
|
||||
* LUCENE-8952: Use a sort key instead of true distance in NearestNeighbor (Julie Tibshirani).
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-8922: DisjunctionMaxQuery more efficiently leverages impacts to skip
|
||||
|
|
|
@ -27,6 +27,7 @@ import org.apache.lucene.geo.GeoUtils;
|
|||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.PointValues;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.SloppyMath;
|
||||
import org.apache.lucene.util.bkd.BKDReader;
|
||||
|
||||
/**
|
||||
|
@ -104,7 +105,8 @@ public class LatLonPointPrototypeQueries {
|
|||
ScoreDoc[] scoreDocs = new ScoreDoc[hits.length];
|
||||
for(int i=0;i<hits.length;i++) {
|
||||
NearestNeighbor.NearestHit hit = hits[i];
|
||||
scoreDocs[i] = new FieldDoc(hit.docID, 0.0f, new Object[] {Double.valueOf(hit.distanceMeters)});
|
||||
double hitDistance = SloppyMath.haversinMeters(hit.distanceSortKey);
|
||||
scoreDocs[i] = new FieldDoc(hit.docID, 0.0f, new Object[] {Double.valueOf(hitDistance)});
|
||||
}
|
||||
return new TopFieldDocs(new TotalHits(totalHits, TotalHits.Relation.EQUAL_TO), scoreDocs, null);
|
||||
}
|
||||
|
|
|
@ -28,9 +28,9 @@ import org.apache.lucene.index.PointValues.Relation;
|
|||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.SloppyMath;
|
||||
import org.apache.lucene.util.bkd.BKDReader;
|
||||
import org.apache.lucene.util.bkd.BKDReader.IndexTree;
|
||||
import org.apache.lucene.util.bkd.BKDReader.IntersectState;
|
||||
import org.apache.lucene.util.bkd.BKDReader;
|
||||
|
||||
import static org.apache.lucene.geo.GeoEncodingUtils.decodeLatitude;
|
||||
import static org.apache.lucene.geo.GeoEncodingUtils.decodeLongitude;
|
||||
|
@ -48,19 +48,23 @@ class NearestNeighbor {
|
|||
final byte[] maxPacked;
|
||||
final IndexTree index;
|
||||
|
||||
/** The closest possible distance of all points in this cell */
|
||||
final double distanceMeters;
|
||||
/**
|
||||
* The closest distance from a point in this cell to the query point, computed as a sort key through
|
||||
* {@link SloppyMath#haversinSortKey}. Note that this is an approximation to the closest distance,
|
||||
* and there could be a point in the cell that is closer.
|
||||
*/
|
||||
final double distanceSortKey;
|
||||
|
||||
public Cell(IndexTree index, int readerIndex, byte[] minPacked, byte[] maxPacked, double distanceMeters) {
|
||||
public Cell(IndexTree index, int readerIndex, byte[] minPacked, byte[] maxPacked, double distanceSortKey) {
|
||||
this.index = index;
|
||||
this.readerIndex = readerIndex;
|
||||
this.minPacked = minPacked.clone();
|
||||
this.maxPacked = maxPacked.clone();
|
||||
this.distanceMeters = distanceMeters;
|
||||
this.distanceSortKey = distanceSortKey;
|
||||
}
|
||||
|
||||
public int compareTo(Cell other) {
|
||||
return Double.compare(distanceMeters, other.distanceMeters);
|
||||
return Double.compare(distanceSortKey, other.distanceSortKey);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -69,7 +73,7 @@ class NearestNeighbor {
|
|||
double minLon = decodeLongitude(minPacked, Integer.BYTES);
|
||||
double maxLat = decodeLatitude(maxPacked, 0);
|
||||
double maxLon = decodeLongitude(maxPacked, Integer.BYTES);
|
||||
return "Cell(readerIndex=" + readerIndex + " nodeID=" + index.getNodeID() + " isLeaf=" + index.isLeafNode() + " lat=" + minLat + " TO " + maxLat + ", lon=" + minLon + " TO " + maxLon + "; distanceMeters=" + distanceMeters + ")";
|
||||
return "Cell(readerIndex=" + readerIndex + " nodeID=" + index.getNodeID() + " isLeaf=" + index.isLeafNode() + " lat=" + minLat + " TO " + maxLat + ", lon=" + minLon + " TO " + maxLon + "; distanceSortKey=" + distanceSortKey + ")";
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -106,7 +110,8 @@ class NearestNeighbor {
|
|||
private void maybeUpdateBBox() {
|
||||
if (setBottomCounter < 1024 || (setBottomCounter & 0x3F) == 0x3F) {
|
||||
NearestHit hit = hitQueue.peek();
|
||||
Rectangle box = Rectangle.fromPointDistance(pointLat, pointLon, hit.distanceMeters);
|
||||
Rectangle box = Rectangle.fromPointDistance(pointLat, pointLon,
|
||||
SloppyMath.haversinMeters(hit.distanceSortKey));
|
||||
//System.out.println(" update bbox to " + box);
|
||||
minLat = box.minLat;
|
||||
maxLat = box.maxLat;
|
||||
|
@ -134,8 +139,6 @@ class NearestNeighbor {
|
|||
return;
|
||||
}
|
||||
|
||||
// TODO: work in int space, use haversinSortKey
|
||||
|
||||
double docLatitude = decodeLatitude(packedValue, 0);
|
||||
double docLongitude = decodeLongitude(packedValue, Integer.BYTES);
|
||||
|
||||
|
@ -147,21 +150,22 @@ class NearestNeighbor {
|
|||
return;
|
||||
}
|
||||
|
||||
double distanceMeters = SloppyMath.haversinMeters(pointLat, pointLon, docLatitude, docLongitude);
|
||||
// Use the haversin sort key when comparing hits, as it is faster to compute than the true distance.
|
||||
double distanceSortKey = SloppyMath.haversinSortKey(pointLat, pointLon, docLatitude, docLongitude);
|
||||
|
||||
//System.out.println(" visit docID=" + docID + " distanceMeters=" + distanceMeters + " docLat=" + docLatitude + " docLon=" + docLongitude);
|
||||
//System.out.println(" visit docID=" + docID + " distanceSortKey=" + distanceSortKey + " docLat=" + docLatitude + " docLon=" + docLongitude);
|
||||
|
||||
int fullDocID = curDocBase + docID;
|
||||
|
||||
if (hitQueue.size() == topN) {
|
||||
// queue already full
|
||||
NearestHit hit = hitQueue.peek();
|
||||
//System.out.println(" bottom distanceMeters=" + hit.distanceMeters);
|
||||
//System.out.println(" bottom distanceSortKey=" + hit.distanceSortKey);
|
||||
// we don't collect docs in order here, so we must also test the tie-break case ourselves:
|
||||
if (distanceMeters < hit.distanceMeters || (distanceMeters == hit.distanceMeters && fullDocID < hit.docID)) {
|
||||
if (distanceSortKey < hit.distanceSortKey || (distanceSortKey == hit.distanceSortKey && fullDocID < hit.docID)) {
|
||||
hitQueue.poll();
|
||||
hit.docID = fullDocID;
|
||||
hit.distanceMeters = distanceMeters;
|
||||
hit.distanceSortKey = distanceSortKey;
|
||||
hitQueue.offer(hit);
|
||||
//System.out.println(" ** keep2, now bottom=" + hit);
|
||||
maybeUpdateBBox();
|
||||
|
@ -170,7 +174,7 @@ class NearestNeighbor {
|
|||
} else {
|
||||
NearestHit hit = new NearestHit();
|
||||
hit.docID = fullDocID;
|
||||
hit.distanceMeters = distanceMeters;
|
||||
hit.distanceSortKey = distanceSortKey;
|
||||
hitQueue.offer(hit);
|
||||
//System.out.println(" ** keep1, now bottom=" + hit);
|
||||
}
|
||||
|
@ -182,14 +186,18 @@ class NearestNeighbor {
|
|||
}
|
||||
}
|
||||
|
||||
/** Holds one hit from {@link LatLonPointPrototypeQueries#nearest} */
|
||||
/** Holds one hit from {@link NearestNeighbor#nearest} */
|
||||
static class NearestHit {
|
||||
public int docID;
|
||||
public double distanceMeters;
|
||||
|
||||
/**
|
||||
* The distance from the hit to the query point, computed as a sort key through {@link SloppyMath#haversinSortKey}.
|
||||
*/
|
||||
public double distanceSortKey;
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "NearestHit(docID=" + docID + " distanceMeters=" + distanceMeters + ")";
|
||||
return "NearestHit(docID=" + docID + " distanceSortKey=" + distanceSortKey + ")";
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -204,8 +212,8 @@ class NearestNeighbor {
|
|||
final PriorityQueue<NearestHit> hitQueue = new PriorityQueue<>(n, new Comparator<NearestHit>() {
|
||||
@Override
|
||||
public int compare(NearestHit a, NearestHit b) {
|
||||
// sort by opposite distanceMeters natural order
|
||||
int cmp = Double.compare(a.distanceMeters, b.distanceMeters);
|
||||
// sort by opposite distanceSortKey natural order
|
||||
int cmp = Double.compare(a.distanceSortKey, b.distanceSortKey);
|
||||
if (cmp != 0) {
|
||||
return -cmp;
|
||||
}
|
||||
|
@ -319,10 +327,10 @@ class NearestNeighbor {
|
|||
return 0.0;
|
||||
}
|
||||
|
||||
double d1 = SloppyMath.haversinMeters(pointLat, pointLon, minLat, minLon);
|
||||
double d2 = SloppyMath.haversinMeters(pointLat, pointLon, minLat, maxLon);
|
||||
double d3 = SloppyMath.haversinMeters(pointLat, pointLon, maxLat, maxLon);
|
||||
double d4 = SloppyMath.haversinMeters(pointLat, pointLon, maxLat, minLon);
|
||||
double d1 = SloppyMath.haversinSortKey(pointLat, pointLon, minLat, minLon);
|
||||
double d2 = SloppyMath.haversinSortKey(pointLat, pointLon, minLat, maxLon);
|
||||
double d3 = SloppyMath.haversinSortKey(pointLat, pointLon, maxLat, maxLon);
|
||||
double d4 = SloppyMath.haversinSortKey(pointLat, pointLon, maxLat, minLon);
|
||||
return Math.min(Math.min(d1, d2), Math.min(d3, d4));
|
||||
}
|
||||
|
||||
|
|
|
@ -26,7 +26,6 @@ import org.apache.lucene.document.LatLonDocValuesField;
|
|||
import org.apache.lucene.document.LatLonPoint;
|
||||
import org.apache.lucene.document.StoredField;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.search.NearestNeighbor.NearestHit;
|
||||
import org.apache.lucene.geo.GeoEncodingUtils;
|
||||
import org.apache.lucene.geo.GeoTestUtil;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
|
@ -190,23 +189,22 @@ public class TestNearest extends LuceneTestCase {
|
|||
double pointLon = GeoTestUtil.nextLongitude();
|
||||
|
||||
// dumb brute force search to get the expected result:
|
||||
NearestHit[] expectedHits = new NearestHit[lats.length];
|
||||
FieldDoc[] expectedHits = new FieldDoc[lats.length];
|
||||
for(int id=0;id<lats.length;id++) {
|
||||
NearestHit hit = new NearestHit();
|
||||
hit.distanceMeters = SloppyMath.haversinMeters(pointLat, pointLon, lats[id], lons[id]);
|
||||
hit.docID = id;
|
||||
double distance = SloppyMath.haversinMeters(pointLat, pointLon, lats[id], lons[id]);
|
||||
FieldDoc hit = new FieldDoc(id, 0.0f, new Object[] {Double.valueOf(distance)});
|
||||
expectedHits[id] = hit;
|
||||
}
|
||||
|
||||
Arrays.sort(expectedHits, new Comparator<NearestHit>() {
|
||||
Arrays.sort(expectedHits, new Comparator<FieldDoc>() {
|
||||
@Override
|
||||
public int compare(NearestHit a, NearestHit b) {
|
||||
int cmp = Double.compare(a.distanceMeters, b.distanceMeters);
|
||||
public int compare(FieldDoc a, FieldDoc b) {
|
||||
int cmp = Double.compare(((Double) a.fields[0]).doubleValue(), ((Double) b.fields[0]).doubleValue());
|
||||
if (cmp != 0) {
|
||||
return cmp;
|
||||
}
|
||||
// tie break by smaller docID:
|
||||
return a.docID - b.docID;
|
||||
return a.doc - b.doc;
|
||||
}
|
||||
});
|
||||
|
||||
|
@ -221,22 +219,23 @@ public class TestNearest extends LuceneTestCase {
|
|||
|
||||
ScoreDoc[] hits = LatLonPointPrototypeQueries.nearest(s, "point", pointLat, pointLon, topN).scoreDocs;
|
||||
for(int i=0;i<topN;i++) {
|
||||
NearestHit expected = expectedHits[i];
|
||||
FieldDoc expected = expectedHits[i];
|
||||
FieldDoc expected2 = (FieldDoc) fieldDocs.scoreDocs[i];
|
||||
FieldDoc actual = (FieldDoc) hits[i];
|
||||
Document actualDoc = r.document(actual.doc);
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println("hit " + i);
|
||||
System.out.println(" expected id=" + expected.docID + " lat=" + lats[expected.docID] + " lon=" + lons[expected.docID] + " distance=" + expected.distanceMeters + " meters");
|
||||
System.out.println(" expected id=" + expected.doc+ " lat=" + lats[expected.doc] + " lon=" + lons[expected.doc]
|
||||
+ " distance=" + ((Double) expected.fields[0]).doubleValue() + " meters");
|
||||
System.out.println(" actual id=" + actualDoc.getField("id") + " distance=" + actual.fields[0] + " meters");
|
||||
}
|
||||
|
||||
assertEquals(expected.docID, actual.doc);
|
||||
assertEquals(expected.distanceMeters, ((Double) actual.fields[0]).doubleValue(), 0.0);
|
||||
assertEquals(expected.doc, actual.doc);
|
||||
assertEquals(((Double) expected.fields[0]).doubleValue(), ((Double) actual.fields[0]).doubleValue(), 0.0);
|
||||
|
||||
assertEquals(expected.docID, expected.docID);
|
||||
assertEquals(((Double) expected2.fields[0]).doubleValue(), expected.distanceMeters, 0.0);
|
||||
assertEquals(expected2.doc, actual.doc);
|
||||
assertEquals(((Double) expected2.fields[0]).doubleValue(), ((Double) actual.fields[0]).doubleValue(), 0.0);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue