LUCENE-7102: LatLonPoint.newDistanceSort fails with "sort missing first"

This commit is contained in:
Robert Muir 2016-03-14 14:08:25 -04:00
parent ae866b0149
commit a6fe1c0b15
4 changed files with 152 additions and 53 deletions

View File

@ -333,8 +333,8 @@ public class LatLonPoint extends Field {
* the hits contains a Double instance with the distance in meters.
* <p>
* If a document is missing the field, then by default it is treated as having {@link Double#POSITIVE_INFINITY} distance
* (missing last). You can change this by calling {@link SortField#setMissingValue(Object)} on the returned SortField
* to a different Double value.
* (missing values sort last). You can change this to sort missing values first by calling
* {@link SortField#setMissingValue(Object) setMissingValue(Double.NEGATIVE_INFINITY)} on the returned SortField.
* <p>
* If a document contains multiple values for the field, the <i>closest</i> distance to the location is used.
* <p>

View File

@ -89,45 +89,53 @@ class LatLonPointDistanceComparator extends FieldComparator<Double> implements L
// sampling if we get called way too much: don't make gobs of bounding
// boxes if comparator hits a worst case order (e.g. backwards distance order)
if (setBottomCounter < 1024 || (setBottomCounter & 0x3F) == 0x3F) {
GeoRect box = GeoUtils.circleToBBox(longitude, latitude, bottom);
// pre-encode our box to our integer encoding, so we don't have to decode
// to double values for uncompetitive hits. This has some cost!
int minLatEncoded = LatLonPoint.encodeLatitude(box.minLat);
int maxLatEncoded = LatLonPoint.encodeLatitude(box.maxLat);
int minLonEncoded = LatLonPoint.encodeLongitude(box.minLon);
int maxLonEncoded = LatLonPoint.encodeLongitude(box.maxLon);
// be sure to not introduce quantization error in our optimization, just
// round up our encoded box safely in all directions.
if (minLatEncoded != Integer.MIN_VALUE) {
minLatEncoded--;
}
if (minLonEncoded != Integer.MIN_VALUE) {
minLonEncoded--;
}
if (maxLatEncoded != Integer.MAX_VALUE) {
maxLatEncoded++;
}
if (maxLonEncoded != Integer.MAX_VALUE) {
maxLonEncoded++;
}
crossesDateLine = box.crossesDateline();
// crosses dateline: split
if (crossesDateLine) {
// box1
minLon = Integer.MIN_VALUE;
maxLon = maxLonEncoded;
minLat = minLatEncoded;
maxLat = maxLatEncoded;
// box2
minLon2 = minLonEncoded;
maxLon2 = Integer.MAX_VALUE;
minLat2 = minLatEncoded;
maxLat2 = maxLatEncoded;
// don't pass infinite values to circleToBBox: just make a complete box.
if (bottom == missingValue) {
minLat = minLon = Integer.MIN_VALUE;
maxLat = maxLon = Integer.MAX_VALUE;
crossesDateLine = false;
} else {
minLon = minLonEncoded;
maxLon = maxLonEncoded;
minLat = minLatEncoded;
maxLat = maxLatEncoded;
assert Double.isFinite(bottom);
GeoRect box = GeoUtils.circleToBBox(longitude, latitude, bottom);
// pre-encode our box to our integer encoding, so we don't have to decode
// to double values for uncompetitive hits. This has some cost!
int minLatEncoded = LatLonPoint.encodeLatitude(box.minLat);
int maxLatEncoded = LatLonPoint.encodeLatitude(box.maxLat);
int minLonEncoded = LatLonPoint.encodeLongitude(box.minLon);
int maxLonEncoded = LatLonPoint.encodeLongitude(box.maxLon);
// be sure to not introduce quantization error in our optimization, just
// round up our encoded box safely in all directions.
if (minLatEncoded != Integer.MIN_VALUE) {
minLatEncoded--;
}
if (minLonEncoded != Integer.MIN_VALUE) {
minLonEncoded--;
}
if (maxLatEncoded != Integer.MAX_VALUE) {
maxLatEncoded++;
}
if (maxLonEncoded != Integer.MAX_VALUE) {
maxLonEncoded++;
}
crossesDateLine = box.crossesDateline();
// crosses dateline: split
if (crossesDateLine) {
// box1
minLon = Integer.MIN_VALUE;
maxLon = maxLonEncoded;
minLat = minLatEncoded;
maxLat = maxLatEncoded;
// box2
minLon2 = minLonEncoded;
maxLon2 = Integer.MAX_VALUE;
minLat2 = minLatEncoded;
maxLat2 = maxLatEncoded;
} else {
minLon = minLonEncoded;
maxLon = maxLonEncoded;
minLat = minLatEncoded;
maxLat = maxLatEncoded;
}
}
}
setBottomCounter++;

View File

@ -62,7 +62,11 @@ final class LatLonPointSortField extends SortField {
}
if (missingValue.getClass() != Double.class)
throw new IllegalArgumentException("Missing value can only be of type java.lang.Double, but got " + missingValue.getClass());
this.missingValue = missingValue;
Double value = (Double) missingValue;
if (!Double.isInfinite(value)) {
throw new IllegalArgumentException("Missing value can only be Double.NEGATIVE_INFINITY (missing values first) or Double.POSITIVE_INFINITY (missing values last), but got " + value);
}
this.missingValue = value;
}
@Override

View File

@ -73,6 +73,82 @@ public class TestLatLonPointDistanceSort extends LuceneTestCase {
dir.close();
}
/** Add two points (one doc missing) and sort by distance */
public void testMissingLast() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
// missing
Document doc = new Document();
iw.addDocument(doc);
doc = new Document();
doc.add(new LatLonPoint("location", 40.718266, -74.007819));
iw.addDocument(doc);
doc = new Document();
doc.add(new LatLonPoint("location", 40.7051157, -74.0088305));
iw.addDocument(doc);
IndexReader reader = iw.getReader();
IndexSearcher searcher = new IndexSearcher(reader);
iw.close();
Sort sort = new Sort(LatLonPoint.newDistanceSort("location", 40.7143528, -74.0059731));
TopDocs td = searcher.search(new MatchAllDocsQuery(), 3, sort);
FieldDoc d = (FieldDoc) td.scoreDocs[0];
assertEquals(462.61748421408186D, (Double)d.fields[0], 0.0D);
d = (FieldDoc) td.scoreDocs[1];
assertEquals(1056.1630445911035D, (Double)d.fields[0], 0.0D);
d = (FieldDoc) td.scoreDocs[2];
assertEquals(Double.POSITIVE_INFINITY, (Double)d.fields[0], 0.0D);
reader.close();
dir.close();
}
/** Add two points (one doc missing) and sort by distance */
public void testMissingFirst() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
// missing
Document doc = new Document();
iw.addDocument(doc);
doc = new Document();
doc.add(new LatLonPoint("location", 40.718266, -74.007819));
iw.addDocument(doc);
doc = new Document();
doc.add(new LatLonPoint("location", 40.7051157, -74.0088305));
iw.addDocument(doc);
IndexReader reader = iw.getReader();
IndexSearcher searcher = new IndexSearcher(reader);
iw.close();
SortField sortField = LatLonPoint.newDistanceSort("location", 40.7143528, -74.0059731);
sortField.setMissingValue(Double.NEGATIVE_INFINITY);
Sort sort = new Sort(sortField);
TopDocs td = searcher.search(new MatchAllDocsQuery(), 3, sort);
FieldDoc d = (FieldDoc) td.scoreDocs[0];
assertEquals(Double.NEGATIVE_INFINITY, (Double)d.fields[0], 0.0D);
d = (FieldDoc) td.scoreDocs[1];
assertEquals(462.61748421408186D, (Double)d.fields[0], 0.0D);
d = (FieldDoc) td.scoreDocs[2];
assertEquals(1056.1630445911035D, (Double)d.fields[0], 0.0D);
reader.close();
dir.close();
}
/** Run a few iterations with just 10 docs, hopefully easy to debug */
public void testRandom() throws Exception {
for (int iters = 0; iters < 100; iters++) {
@ -141,17 +217,20 @@ public class TestLatLonPointDistanceSort extends LuceneTestCase {
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
for (int i = 0; i < numDocs; i++) {
double latRaw = -90 + 180.0 * random().nextDouble();
double lonRaw = -180 + 360.0 * random().nextDouble();
// pre-normalize up front, so we can just use quantized value for testing and do simple exact comparisons
double lat = LatLonPoint.decodeLatitude(LatLonPoint.encodeLatitude(latRaw));
double lon = LatLonPoint.decodeLongitude(LatLonPoint.encodeLongitude(lonRaw));
Document doc = new Document();
doc.add(new StoredField("id", i));
doc.add(new NumericDocValuesField("id", i));
doc.add(new LatLonPoint("field", lat, lon));
doc.add(new StoredField("lat", lat));
doc.add(new StoredField("lon", lon));
if (random().nextInt(10) > 7) {
double latRaw = -90 + 180.0 * random().nextDouble();
double lonRaw = -180 + 360.0 * random().nextDouble();
// pre-normalize up front, so we can just use quantized value for testing and do simple exact comparisons
double lat = LatLonPoint.decodeLatitude(LatLonPoint.encodeLatitude(latRaw));
double lon = LatLonPoint.decodeLongitude(LatLonPoint.encodeLongitude(lonRaw));
doc.add(new LatLonPoint("field", lat, lon));
doc.add(new StoredField("lat", lat));
doc.add(new StoredField("lon", lon));
} // otherwise "missing"
writer.addDocument(doc);
}
IndexReader reader = writer.getReader();
@ -160,14 +239,20 @@ public class TestLatLonPointDistanceSort extends LuceneTestCase {
for (int i = 0; i < numQueries; i++) {
double lat = -90 + 180.0 * random().nextDouble();
double lon = -180 + 360.0 * random().nextDouble();
double missingValue = random().nextBoolean() ? Double.POSITIVE_INFINITY : Double.NEGATIVE_INFINITY;
Result expected[] = new Result[reader.maxDoc()];
for (int doc = 0; doc < reader.maxDoc(); doc++) {
Document targetDoc = reader.document(doc);
double docLatitude = targetDoc.getField("lat").numericValue().doubleValue();
double docLongitude = targetDoc.getField("lon").numericValue().doubleValue();
double distance = GeoDistanceUtils.haversin(lat, lon, docLatitude, docLongitude);
final double distance;
if (targetDoc.getField("lat") == null) {
distance = missingValue; // missing
} else {
double docLatitude = targetDoc.getField("lat").numericValue().doubleValue();
double docLongitude = targetDoc.getField("lon").numericValue().doubleValue();
distance = GeoDistanceUtils.haversin(lat, lon, docLatitude, docLongitude);
}
int id = targetDoc.getField("id").numericValue().intValue();
expected[doc] = new Result(id, distance);
}
@ -177,7 +262,9 @@ public class TestLatLonPointDistanceSort extends LuceneTestCase {
// randomize the topN a bit
int topN = TestUtil.nextInt(random(), 1, reader.maxDoc());
// sort by distance, then ID
Sort sort = new Sort(LatLonPoint.newDistanceSort("field", lat, lon),
SortField distanceSort = LatLonPoint.newDistanceSort("field", lat, lon);
distanceSort.setMissingValue(missingValue);
Sort sort = new Sort(distanceSort,
new SortField("id", SortField.Type.INT));
TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), topN, sort);