LUCENE-7099: add newDistanceSort to sandbox LatLonPoint

This commit is contained in:
Robert Muir 2016-03-11 19:29:22 -05:00
parent e11182c628
commit 51d02687a6
6 changed files with 485 additions and 10 deletions

View File

@ -4,7 +4,11 @@ For more information on past and future Lucene versions, please see:
http://s.apache.org/luceneversions
======================= Lucene 6.1.0 =======================
(No Changes)
New Features
* LUCENE-7099: Add LatLonPoint.newDistanceSort to the sandbox's
LatLonPoint. (Robert Muir)
Optimizations

View File

@ -18,14 +18,16 @@ package org.apache.lucene.document;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.PointRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.spatial.util.GeoUtils;
/**
@ -35,10 +37,11 @@ import org.apache.lucene.spatial.util.GeoUtils;
* efficient. Multiple values for the same field in one document
* is allowed.
* <p>
* This field defines static factory methods for creating common queries:
* This field defines static factory methods for common operations:
* <ul>
* <li>{@link #newBoxQuery newBoxQuery()} for matching points within a bounding box.
* <li>{@link #newDistanceQuery newDistanceQuery()} for matching points within a specified distance.
* <li>{@link #newDistanceSort newDistanceSort()} for ordering documents by distance from a specified location.
* <li>{@link #newPolygonQuery newPolygonQuery()} for matching points within an arbitrary polygon.
* </ul>
* <p>
@ -50,6 +53,8 @@ import org.apache.lucene.spatial.util.GeoUtils;
// to the field is not actually what gets indexed. Float would be 1E-5 error vs 1E-7, but it might be
// a better tradeoff? then it would be completely transparent to the user and lucene would be "lossless".
public class LatLonPoint extends Field {
private long currentValue;
/**
* Type for an indexed LatLonPoint
* <p>
@ -58,6 +63,7 @@ public class LatLonPoint extends Field {
public static final FieldType TYPE = new FieldType();
static {
TYPE.setDimensions(2, Integer.BYTES);
TYPE.setDocValuesType(DocValuesType.SORTED_NUMERIC);
TYPE.freeze();
}
@ -69,9 +75,12 @@ public class LatLonPoint extends Field {
*/
public void setLocationValue(double latitude, double longitude) {
byte[] bytes = new byte[8];
NumericUtils.intToSortableBytes(encodeLatitude(latitude), bytes, 0);
NumericUtils.intToSortableBytes(encodeLongitude(longitude), bytes, Integer.BYTES);
int latitudeEncoded = encodeLatitude(latitude);
int longitudeEncoded = encodeLongitude(longitude);
NumericUtils.intToSortableBytes(latitudeEncoded, bytes, 0);
NumericUtils.intToSortableBytes(longitudeEncoded, bytes, Integer.BYTES);
fieldsData = new BytesRef(bytes);
currentValue = (((long)latitudeEncoded) << 32) | (longitudeEncoded & 0xFFFFFFFFL);
}
/**
@ -98,15 +107,25 @@ public class LatLonPoint extends Field {
result.append(name);
result.append(':');
BytesRef bytes = (BytesRef) fieldsData;
result.append(decodeLatitude(BytesRef.deepCopyOf(bytes).bytes, 0));
result.append(decodeLatitude((int)(currentValue >> 32)));
result.append(',');
result.append(decodeLongitude(BytesRef.deepCopyOf(bytes).bytes, Integer.BYTES));
result.append(decodeLongitude((int)(currentValue & 0xFFFFFFFF)));
result.append('>');
return result.toString();
}
/**
* Returns a 64-bit long, where the upper 32 bits are the encoded latitude,
* and the lower 32 bits are the encoded longitude.
* @see #decodeLatitude(int)
* @see #decodeLongitude(int)
*/
@Override
public Number numericValue() {
return currentValue;
}
// public helper methods (e.g. for queries)
/**
@ -197,16 +216,22 @@ public class LatLonPoint extends Field {
/** helper: checks a fieldinfo and throws exception if its definitely not a LatLonPoint */
static void checkCompatible(FieldInfo fieldInfo) {
if (fieldInfo.getPointDimensionCount() != TYPE.pointDimensionCount()) {
// point/dv properties could be "unset", if you e.g. used only StoredField with this same name in the segment.
if (fieldInfo.getPointDimensionCount() != 0 && fieldInfo.getPointDimensionCount() != TYPE.pointDimensionCount()) {
throw new IllegalArgumentException("field=\"" + fieldInfo.name + "\" was indexed with numDims=" + fieldInfo.getPointDimensionCount() +
" but this point type has numDims=" + TYPE.pointDimensionCount() +
", is the field really a LatLonPoint?");
}
if (fieldInfo.getPointNumBytes() != TYPE.pointNumBytes()) {
if (fieldInfo.getPointNumBytes() != 0 && fieldInfo.getPointNumBytes() != TYPE.pointNumBytes()) {
throw new IllegalArgumentException("field=\"" + fieldInfo.name + "\" was indexed with bytesPerDim=" + fieldInfo.getPointNumBytes() +
" but this point type has bytesPerDim=" + TYPE.pointNumBytes() +
", is the field really a LatLonPoint?");
}
if (fieldInfo.getDocValuesType() != DocValuesType.NONE && fieldInfo.getDocValuesType() != TYPE.docValuesType()) {
throw new IllegalArgumentException("field=\"" + fieldInfo.name + "\" was indexed with docValuesType=" + fieldInfo.getDocValuesType() +
" but this point type has docValuesType=" + TYPE.docValuesType() +
", is the field really a LatLonPoint?");
}
}
// static methods for generating queries
@ -298,4 +323,30 @@ public class LatLonPoint extends Field {
public static Query newPolygonQuery(String field, double[] polyLats, double[] polyLons) {
return new LatLonPointInPolygonQuery(field, polyLats, polyLons);
}
/**
* Creates a SortField for sorting by distance from a location.
* <p>
* This sort orders documents by ascending distance from the location. The value returned in {@link FieldDoc} for
* the hits contains a Double instance with the distance in meters.
* <p>
* If a document is missing the field, then by default it is treated as having {@link Double#POSITIVE_INFINITY} distance
* (missing last). You can change this by calling {@link SortField#setMissingValue(Object)} on the returned SortField
* to a different Double value.
* <p>
* If a document contains multiple values for the field, the <i>closest</i> distance to the location is used.
* <p>
* <b>NOTE</b>: distance sorting might be expensive for many documents. Consider restricting the document
* set with a {@link #newBoxQuery box}, {@link #newDistanceQuery radius} radius, or {@link #newPolygonQuery polygon}
* query for better performance
*
* @param field field name. cannot be null.
* @param latitude latitude at the center: must be within standard +/-90 coordinate bounds.
* @param longitude longitude at the center: must be within standard +/-180 coordinate bounds.
* @return SortField ordering documents by distance
* @throws IllegalArgumentException if {@code field} is null or location has invalid coordinates.
*/
public static SortField newDistanceSort(String field, double latitude, double longitude) {
return new LatLonPointSortField(field, latitude, longitude);
}
}

View File

@ -0,0 +1,119 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.document;
import java.io.IOException;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.LeafFieldComparator;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.spatial.util.GeoDistanceUtils;
/** Compares docs by distance from an origin */
class LatLonPointDistanceComparator extends FieldComparator<Double> implements LeafFieldComparator {
final String field;
final double latitude;
final double longitude;
final double missingValue;
final double[] values;
double bottom;
double topValue;
SortedNumericDocValues currentDocs;
public LatLonPointDistanceComparator(String field, double latitude, double longitude, int numHits, double missingValue) {
this.field = field;
this.latitude = latitude;
this.longitude = longitude;
this.values = new double[numHits];
this.missingValue = missingValue;
}
@Override
public void setScorer(Scorer scorer) {}
@Override
public int compare(int slot1, int slot2) {
return Double.compare(values[slot1], values[slot2]);
}
@Override
public void setBottom(int slot) {
bottom = values[slot];
}
@Override
public void setTopValue(Double value) {
topValue = value.doubleValue();
}
@Override
public int compareBottom(int doc) throws IOException {
return Double.compare(bottom, distance(doc));
}
@Override
public void copy(int slot, int doc) throws IOException {
values[slot] = distance(doc);
}
@Override
public LeafFieldComparator getLeafComparator(LeafReaderContext context) throws IOException {
LeafReader reader = context.reader();
FieldInfo info = reader.getFieldInfos().fieldInfo(field);
if (info != null) {
LatLonPoint.checkCompatible(info);
}
currentDocs = DocValues.getSortedNumeric(reader, field);
return this;
}
@Override
public Double value(int slot) {
return Double.valueOf(values[slot]);
}
@Override
public int compareTop(int doc) throws IOException {
return Double.compare(topValue, distance(doc));
}
// TODO: optimize for single-valued case?
// TODO: do all kinds of other optimizations!
double distance(int doc) {
currentDocs.setDocument(doc);
int numValues = currentDocs.count();
if (numValues == 0) {
return missingValue;
}
double minValue = Double.POSITIVE_INFINITY;
for (int i = 0; i < numValues; i++) {
long encoded = currentDocs.valueAt(i);
double docLatitude = LatLonPoint.decodeLatitude((int)(encoded >> 32));
double docLongitude = LatLonPoint.decodeLongitude((int)(encoded & 0xFFFFFFFF));
minValue = Math.min(minValue, GeoDistanceUtils.haversin(latitude, longitude, docLatitude, docLongitude));
}
return minValue;
}
}

View File

@ -0,0 +1,108 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.document;
import java.io.IOException;
import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.SortField;
import org.apache.lucene.spatial.util.GeoUtils;
/**
* Sorts by distance from an origin location.
*/
final class LatLonPointSortField extends SortField {
final double latitude;
final double longitude;
LatLonPointSortField(String field, double latitude, double longitude) {
super(field, SortField.Type.CUSTOM);
if (field == null) {
throw new IllegalArgumentException("field cannot be null");
}
if (GeoUtils.isValidLat(latitude) == false) {
throw new IllegalArgumentException("latitude: '" + latitude + "' is invalid");
}
if (GeoUtils.isValidLon(longitude) == false) {
throw new IllegalArgumentException("longitude: '" + longitude + "' is invalid");
}
this.latitude = latitude;
this.longitude = longitude;
setMissingValue(Double.POSITIVE_INFINITY);
}
@Override
public FieldComparator<?> getComparator(int numHits, int sortPos) throws IOException {
return new LatLonPointDistanceComparator(getField(), latitude, longitude, numHits, getMissingValue());
}
@Override
public Double getMissingValue() {
return (Double) super.getMissingValue();
}
@Override
public void setMissingValue(Object missingValue) {
if (missingValue == null) {
throw new IllegalArgumentException("Missing value cannot be null");
}
if (missingValue.getClass() != Double.class)
throw new IllegalArgumentException("Missing value can only be of type java.lang.Double, but got " + missingValue.getClass());
this.missingValue = missingValue;
}
@Override
public int hashCode() {
final int prime = 31;
int result = super.hashCode();
long temp;
temp = Double.doubleToLongBits(latitude);
result = prime * result + (int) (temp ^ (temp >>> 32));
temp = Double.doubleToLongBits(longitude);
result = prime * result + (int) (temp ^ (temp >>> 32));
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (!super.equals(obj)) return false;
if (getClass() != obj.getClass()) return false;
LatLonPointSortField other = (LatLonPointSortField) obj;
if (Double.doubleToLongBits(latitude) != Double.doubleToLongBits(other.latitude)) return false;
if (Double.doubleToLongBits(longitude) != Double.doubleToLongBits(other.longitude)) return false;
return true;
}
@Override
public String toString() {
StringBuilder builder = new StringBuilder();
builder.append("<distance:");
builder.append('"');
builder.append(getField());
builder.append('"');
builder.append(" latitude=");
builder.append(latitude);
builder.append(" longitude=");
builder.append(longitude);
if (Double.POSITIVE_INFINITY != getMissingValue()) {
builder.append(" missingValue=" + getMissingValue());
}
builder.append('>');
return builder.toString();
}
}

View File

@ -56,6 +56,9 @@ public class TestLatLonPoint extends LuceneTestCase {
// distance query does not quantize inputs
assertEquals("field:18.0,19.0 +/- 25.0 meters", LatLonPoint.newDistanceQuery("field", 18, 19, 25).toString());
// sort field
assertEquals("<distance:\"field\" latitude=18.0 longitude=19.0>", LatLonPoint.newDistanceSort("field", 18.0, 19.0).toString());
}
/** Valid values that should not cause exception */

View File

@ -0,0 +1,190 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.document;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.spatial.util.GeoDistanceUtils;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
/** Simple tests for {@link LatLonPoint#newDistanceSort} */
public class TestLatLonPointDistanceSort extends LuceneTestCase {
/** Add three points and sort by distance */
public void testDistanceSort() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
// add some docs
Document doc = new Document();
doc.add(new LatLonPoint("location", 40.759011, -73.9844722));
iw.addDocument(doc);
doc = new Document();
doc.add(new LatLonPoint("location", 40.718266, -74.007819));
iw.addDocument(doc);
doc = new Document();
doc.add(new LatLonPoint("location", 40.7051157, -74.0088305));
iw.addDocument(doc);
IndexReader reader = iw.getReader();
IndexSearcher searcher = new IndexSearcher(reader);
iw.close();
Sort sort = new Sort(LatLonPoint.newDistanceSort("location", 40.7143528, -74.0059731));
TopFieldDocs td = searcher.search(new MatchAllDocsQuery(), 3, sort);
FieldDoc d = (FieldDoc) td.scoreDocs[0];
assertEquals(462.61748421408186D, (Double)d.fields[0], 0.0D);
d = (FieldDoc) td.scoreDocs[1];
assertEquals(1056.1630445911035D, (Double)d.fields[0], 0.0D);
d = (FieldDoc) td.scoreDocs[2];
assertEquals(5291.798081404466D, (Double)d.fields[0], 0.0D);
reader.close();
dir.close();
}
/** Run a few iterations with just 10 docs, hopefully easy to debug */
public void testRandom() throws Exception {
for (int iters = 0; iters < 100; iters++) {
doRandomTest(10, 100);
}
}
/** Runs with thousands of docs */
@Nightly
public void testRandomHuge() throws Exception {
for (int iters = 0; iters < 10; iters++) {
doRandomTest(2000, 100);
}
}
// result class used for testing. holds an id+distance.
// we sort these with Arrays.sort and compare with lucene's results
static class Result implements Comparable<Result> {
int id;
double distance;
Result(int id, double distance) {
this.id = id;
this.distance = distance;
}
@Override
public int compareTo(Result o) {
int cmp = Double.compare(distance, o.distance);
if (cmp == 0) {
return Integer.compare(id, o.id);
}
return cmp;
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
long temp;
temp = Double.doubleToLongBits(distance);
result = prime * result + (int) (temp ^ (temp >>> 32));
result = prime * result + id;
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
Result other = (Result) obj;
if (Double.doubleToLongBits(distance) != Double.doubleToLongBits(other.distance)) return false;
if (id != other.id) return false;
return true;
}
}
private void doRandomTest(int numDocs, int numQueries) throws IOException {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
for (int i = 0; i < numDocs; i++) {
double latRaw = -90 + 180.0 * random().nextDouble();
double lonRaw = -180 + 360.0 * random().nextDouble();
// pre-normalize up front, so we can just use quantized value for testing and do simple exact comparisons
double lat = LatLonPoint.decodeLatitude(LatLonPoint.encodeLatitude(latRaw));
double lon = LatLonPoint.decodeLongitude(LatLonPoint.encodeLongitude(lonRaw));
Document doc = new Document();
doc.add(new StoredField("id", i));
doc.add(new NumericDocValuesField("id", i));
doc.add(new LatLonPoint("field", lat, lon));
doc.add(new StoredField("lat", lat));
doc.add(new StoredField("lon", lon));
writer.addDocument(doc);
}
IndexReader reader = writer.getReader();
IndexSearcher searcher = new IndexSearcher(reader);
for (int i = 0; i < numQueries; i++) {
double lat = -90 + 180.0 * random().nextDouble();
double lon = -180 + 360.0 * random().nextDouble();
Result expected[] = new Result[reader.maxDoc()];
for (int doc = 0; doc < reader.maxDoc(); doc++) {
Document targetDoc = reader.document(doc);
double docLatitude = targetDoc.getField("lat").numericValue().doubleValue();
double docLongitude = targetDoc.getField("lon").numericValue().doubleValue();
double distance = GeoDistanceUtils.haversin(lat, lon, docLatitude, docLongitude);
int id = targetDoc.getField("id").numericValue().intValue();
expected[doc] = new Result(id, distance);
}
Arrays.sort(expected);
// randomize the topN a bit
int topN = TestUtil.nextInt(random(), 1, reader.maxDoc());
// sort by distance, then ID
Sort sort = new Sort(LatLonPoint.newDistanceSort("field", lat, lon),
new SortField("id", SortField.Type.INT));
TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), topN, sort);
for (int resultNumber = 0; resultNumber < topN; resultNumber++) {
FieldDoc fieldDoc = (FieldDoc) topDocs.scoreDocs[resultNumber];
Result actual = new Result((Integer) fieldDoc.fields[1], (Double) fieldDoc.fields[0]);
assertEquals(expected[resultNumber], actual);
}
}
reader.close();
writer.close();
dir.close();
}
}