From b3ee746a8b1f394944c2bb9552e1ea6fd7afd83f Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 11 Mar 2016 19:29:22 -0500 Subject: [PATCH] LUCENE-7099: add newDistanceSort to sandbox LatLonPoint --- lucene/CHANGES.txt | 6 +- .../apache/lucene/document/LatLonPoint.java | 69 ++++++- .../LatLonPointDistanceComparator.java | 119 +++++++++++ .../lucene/document/LatLonPointSortField.java | 108 ++++++++++ .../lucene/document/TestLatLonPoint.java | 3 + .../document/TestLatLonPointDistanceSort.java | 190 ++++++++++++++++++ 6 files changed, 485 insertions(+), 10 deletions(-) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/document/LatLonPointDistanceComparator.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/document/LatLonPointSortField.java create mode 100644 lucene/sandbox/src/test/org/apache/lucene/document/TestLatLonPointDistanceSort.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index a07e69d1696..82d22fe34ef 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -7,7 +7,11 @@ http://s.apache.org/luceneversions (No Changes) ======================= Lucene 6.1.0 ======================= -(No Changes) + +New Features + +* LUCENE-7099: Add LatLonPoint.newDistanceSort to the sandbox's + LatLonPoint. (Robert Muir) Optimizations diff --git a/lucene/sandbox/src/java/org/apache/lucene/document/LatLonPoint.java b/lucene/sandbox/src/java/org/apache/lucene/document/LatLonPoint.java index fd3284b6e29..e8c2f177849 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/document/LatLonPoint.java +++ b/lucene/sandbox/src/java/org/apache/lucene/document/LatLonPoint.java @@ -18,14 +18,16 @@ package org.apache.lucene.document; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.NumericUtils; - +import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.PointValues; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.FieldDoc; import org.apache.lucene.search.PointRangeQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.SortField; import org.apache.lucene.spatial.util.GeoUtils; /** @@ -35,10 +37,11 @@ import org.apache.lucene.spatial.util.GeoUtils; * efficient. Multiple values for the same field in one document * is allowed. *

- * This field defines static factory methods for creating common queries: + * This field defines static factory methods for common operations: *

*

@@ -50,6 +53,8 @@ import org.apache.lucene.spatial.util.GeoUtils; // to the field is not actually what gets indexed. Float would be 1E-5 error vs 1E-7, but it might be // a better tradeoff? then it would be completely transparent to the user and lucene would be "lossless". public class LatLonPoint extends Field { + private long currentValue; + /** * Type for an indexed LatLonPoint *

@@ -58,6 +63,7 @@ public class LatLonPoint extends Field { public static final FieldType TYPE = new FieldType(); static { TYPE.setDimensions(2, Integer.BYTES); + TYPE.setDocValuesType(DocValuesType.SORTED_NUMERIC); TYPE.freeze(); } @@ -69,9 +75,12 @@ public class LatLonPoint extends Field { */ public void setLocationValue(double latitude, double longitude) { byte[] bytes = new byte[8]; - NumericUtils.intToSortableBytes(encodeLatitude(latitude), bytes, 0); - NumericUtils.intToSortableBytes(encodeLongitude(longitude), bytes, Integer.BYTES); + int latitudeEncoded = encodeLatitude(latitude); + int longitudeEncoded = encodeLongitude(longitude); + NumericUtils.intToSortableBytes(latitudeEncoded, bytes, 0); + NumericUtils.intToSortableBytes(longitudeEncoded, bytes, Integer.BYTES); fieldsData = new BytesRef(bytes); + currentValue = (((long)latitudeEncoded) << 32) | (longitudeEncoded & 0xFFFFFFFFL); } /** @@ -98,15 +107,25 @@ public class LatLonPoint extends Field { result.append(name); result.append(':'); - BytesRef bytes = (BytesRef) fieldsData; - result.append(decodeLatitude(BytesRef.deepCopyOf(bytes).bytes, 0)); + result.append(decodeLatitude((int)(currentValue >> 32))); result.append(','); - result.append(decodeLongitude(BytesRef.deepCopyOf(bytes).bytes, Integer.BYTES)); + result.append(decodeLongitude((int)(currentValue & 0xFFFFFFFF))); result.append('>'); return result.toString(); } + /** + * Returns a 64-bit long, where the upper 32 bits are the encoded latitude, + * and the lower 32 bits are the encoded longitude. + * @see #decodeLatitude(int) + * @see #decodeLongitude(int) + */ + @Override + public Number numericValue() { + return currentValue; + } + // public helper methods (e.g. for queries) /** @@ -197,16 +216,22 @@ public class LatLonPoint extends Field { /** helper: checks a fieldinfo and throws exception if its definitely not a LatLonPoint */ static void checkCompatible(FieldInfo fieldInfo) { - if (fieldInfo.getPointDimensionCount() != TYPE.pointDimensionCount()) { + // point/dv properties could be "unset", if you e.g. used only StoredField with this same name in the segment. + if (fieldInfo.getPointDimensionCount() != 0 && fieldInfo.getPointDimensionCount() != TYPE.pointDimensionCount()) { throw new IllegalArgumentException("field=\"" + fieldInfo.name + "\" was indexed with numDims=" + fieldInfo.getPointDimensionCount() + " but this point type has numDims=" + TYPE.pointDimensionCount() + ", is the field really a LatLonPoint?"); } - if (fieldInfo.getPointNumBytes() != TYPE.pointNumBytes()) { + if (fieldInfo.getPointNumBytes() != 0 && fieldInfo.getPointNumBytes() != TYPE.pointNumBytes()) { throw new IllegalArgumentException("field=\"" + fieldInfo.name + "\" was indexed with bytesPerDim=" + fieldInfo.getPointNumBytes() + " but this point type has bytesPerDim=" + TYPE.pointNumBytes() + ", is the field really a LatLonPoint?"); } + if (fieldInfo.getDocValuesType() != DocValuesType.NONE && fieldInfo.getDocValuesType() != TYPE.docValuesType()) { + throw new IllegalArgumentException("field=\"" + fieldInfo.name + "\" was indexed with docValuesType=" + fieldInfo.getDocValuesType() + + " but this point type has docValuesType=" + TYPE.docValuesType() + + ", is the field really a LatLonPoint?"); + } } // static methods for generating queries @@ -298,4 +323,30 @@ public class LatLonPoint extends Field { public static Query newPolygonQuery(String field, double[] polyLats, double[] polyLons) { return new LatLonPointInPolygonQuery(field, polyLats, polyLons); } + + /** + * Creates a SortField for sorting by distance from a location. + *

+ * This sort orders documents by ascending distance from the location. The value returned in {@link FieldDoc} for + * the hits contains a Double instance with the distance in meters. + *

+ * If a document is missing the field, then by default it is treated as having {@link Double#POSITIVE_INFINITY} distance + * (missing last). You can change this by calling {@link SortField#setMissingValue(Object)} on the returned SortField + * to a different Double value. + *

+ * If a document contains multiple values for the field, the closest distance to the location is used. + *

+ * NOTE: distance sorting might be expensive for many documents. Consider restricting the document + * set with a {@link #newBoxQuery box}, {@link #newDistanceQuery radius} radius, or {@link #newPolygonQuery polygon} + * query for better performance + * + * @param field field name. cannot be null. + * @param latitude latitude at the center: must be within standard +/-90 coordinate bounds. + * @param longitude longitude at the center: must be within standard +/-180 coordinate bounds. + * @return SortField ordering documents by distance + * @throws IllegalArgumentException if {@code field} is null or location has invalid coordinates. + */ + public static SortField newDistanceSort(String field, double latitude, double longitude) { + return new LatLonPointSortField(field, latitude, longitude); + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/document/LatLonPointDistanceComparator.java b/lucene/sandbox/src/java/org/apache/lucene/document/LatLonPointDistanceComparator.java new file mode 100644 index 00000000000..a5d85344d84 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/document/LatLonPointDistanceComparator.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.document; + +import java.io.IOException; + +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.search.FieldComparator; +import org.apache.lucene.search.LeafFieldComparator; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.spatial.util.GeoDistanceUtils; + +/** Compares docs by distance from an origin */ +class LatLonPointDistanceComparator extends FieldComparator implements LeafFieldComparator { + final String field; + final double latitude; + final double longitude; + final double missingValue; + + final double[] values; + double bottom; + double topValue; + SortedNumericDocValues currentDocs; + + public LatLonPointDistanceComparator(String field, double latitude, double longitude, int numHits, double missingValue) { + this.field = field; + this.latitude = latitude; + this.longitude = longitude; + this.values = new double[numHits]; + this.missingValue = missingValue; + } + + @Override + public void setScorer(Scorer scorer) {} + + @Override + public int compare(int slot1, int slot2) { + return Double.compare(values[slot1], values[slot2]); + } + + @Override + public void setBottom(int slot) { + bottom = values[slot]; + } + + @Override + public void setTopValue(Double value) { + topValue = value.doubleValue(); + } + + @Override + public int compareBottom(int doc) throws IOException { + return Double.compare(bottom, distance(doc)); + } + + @Override + public void copy(int slot, int doc) throws IOException { + values[slot] = distance(doc); + } + + @Override + public LeafFieldComparator getLeafComparator(LeafReaderContext context) throws IOException { + LeafReader reader = context.reader(); + FieldInfo info = reader.getFieldInfos().fieldInfo(field); + if (info != null) { + LatLonPoint.checkCompatible(info); + } + currentDocs = DocValues.getSortedNumeric(reader, field); + return this; + } + + @Override + public Double value(int slot) { + return Double.valueOf(values[slot]); + } + + @Override + public int compareTop(int doc) throws IOException { + return Double.compare(topValue, distance(doc)); + } + + // TODO: optimize for single-valued case? + // TODO: do all kinds of other optimizations! + double distance(int doc) { + currentDocs.setDocument(doc); + + int numValues = currentDocs.count(); + if (numValues == 0) { + return missingValue; + } + + double minValue = Double.POSITIVE_INFINITY; + for (int i = 0; i < numValues; i++) { + long encoded = currentDocs.valueAt(i); + double docLatitude = LatLonPoint.decodeLatitude((int)(encoded >> 32)); + double docLongitude = LatLonPoint.decodeLongitude((int)(encoded & 0xFFFFFFFF)); + minValue = Math.min(minValue, GeoDistanceUtils.haversin(latitude, longitude, docLatitude, docLongitude)); + } + return minValue; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/document/LatLonPointSortField.java b/lucene/sandbox/src/java/org/apache/lucene/document/LatLonPointSortField.java new file mode 100644 index 00000000000..9d3928ea8c3 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/document/LatLonPointSortField.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.document; + +import java.io.IOException; + +import org.apache.lucene.search.FieldComparator; +import org.apache.lucene.search.SortField; +import org.apache.lucene.spatial.util.GeoUtils; + +/** + * Sorts by distance from an origin location. + */ +final class LatLonPointSortField extends SortField { + final double latitude; + final double longitude; + + LatLonPointSortField(String field, double latitude, double longitude) { + super(field, SortField.Type.CUSTOM); + if (field == null) { + throw new IllegalArgumentException("field cannot be null"); + } + if (GeoUtils.isValidLat(latitude) == false) { + throw new IllegalArgumentException("latitude: '" + latitude + "' is invalid"); + } + if (GeoUtils.isValidLon(longitude) == false) { + throw new IllegalArgumentException("longitude: '" + longitude + "' is invalid"); + } + this.latitude = latitude; + this.longitude = longitude; + setMissingValue(Double.POSITIVE_INFINITY); + } + + @Override + public FieldComparator getComparator(int numHits, int sortPos) throws IOException { + return new LatLonPointDistanceComparator(getField(), latitude, longitude, numHits, getMissingValue()); + } + + @Override + public Double getMissingValue() { + return (Double) super.getMissingValue(); + } + + @Override + public void setMissingValue(Object missingValue) { + if (missingValue == null) { + throw new IllegalArgumentException("Missing value cannot be null"); + } + if (missingValue.getClass() != Double.class) + throw new IllegalArgumentException("Missing value can only be of type java.lang.Double, but got " + missingValue.getClass()); + this.missingValue = missingValue; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = super.hashCode(); + long temp; + temp = Double.doubleToLongBits(latitude); + result = prime * result + (int) (temp ^ (temp >>> 32)); + temp = Double.doubleToLongBits(longitude); + result = prime * result + (int) (temp ^ (temp >>> 32)); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (!super.equals(obj)) return false; + if (getClass() != obj.getClass()) return false; + LatLonPointSortField other = (LatLonPointSortField) obj; + if (Double.doubleToLongBits(latitude) != Double.doubleToLongBits(other.latitude)) return false; + if (Double.doubleToLongBits(longitude) != Double.doubleToLongBits(other.longitude)) return false; + return true; + } + + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append("", LatLonPoint.newDistanceSort("field", 18.0, 19.0).toString()); } /** Valid values that should not cause exception */ diff --git a/lucene/sandbox/src/test/org/apache/lucene/document/TestLatLonPointDistanceSort.java b/lucene/sandbox/src/test/org/apache/lucene/document/TestLatLonPointDistanceSort.java new file mode 100644 index 00000000000..a0ee83f3de0 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/document/TestLatLonPointDistanceSort.java @@ -0,0 +1,190 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.document; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.search.FieldDoc; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TopFieldDocs; +import org.apache.lucene.spatial.util.GeoDistanceUtils; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; + +/** Simple tests for {@link LatLonPoint#newDistanceSort} */ +public class TestLatLonPointDistanceSort extends LuceneTestCase { + + /** Add three points and sort by distance */ + public void testDistanceSort() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir); + + // add some docs + Document doc = new Document(); + doc.add(new LatLonPoint("location", 40.759011, -73.9844722)); + iw.addDocument(doc); + + doc = new Document(); + doc.add(new LatLonPoint("location", 40.718266, -74.007819)); + iw.addDocument(doc); + + doc = new Document(); + doc.add(new LatLonPoint("location", 40.7051157, -74.0088305)); + iw.addDocument(doc); + + IndexReader reader = iw.getReader(); + IndexSearcher searcher = new IndexSearcher(reader); + iw.close(); + + Sort sort = new Sort(LatLonPoint.newDistanceSort("location", 40.7143528, -74.0059731)); + TopFieldDocs td = searcher.search(new MatchAllDocsQuery(), 3, sort); + + FieldDoc d = (FieldDoc) td.scoreDocs[0]; + assertEquals(462.61748421408186D, (Double)d.fields[0], 0.0D); + + d = (FieldDoc) td.scoreDocs[1]; + assertEquals(1056.1630445911035D, (Double)d.fields[0], 0.0D); + + d = (FieldDoc) td.scoreDocs[2]; + assertEquals(5291.798081404466D, (Double)d.fields[0], 0.0D); + + reader.close(); + dir.close(); + } + + /** Run a few iterations with just 10 docs, hopefully easy to debug */ + public void testRandom() throws Exception { + for (int iters = 0; iters < 100; iters++) { + doRandomTest(10, 100); + } + } + + /** Runs with thousands of docs */ + @Nightly + public void testRandomHuge() throws Exception { + for (int iters = 0; iters < 10; iters++) { + doRandomTest(2000, 100); + } + } + + // result class used for testing. holds an id+distance. + // we sort these with Arrays.sort and compare with lucene's results + static class Result implements Comparable { + int id; + double distance; + + Result(int id, double distance) { + this.id = id; + this.distance = distance; + } + + @Override + public int compareTo(Result o) { + int cmp = Double.compare(distance, o.distance); + if (cmp == 0) { + return Integer.compare(id, o.id); + } + return cmp; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + long temp; + temp = Double.doubleToLongBits(distance); + result = prime * result + (int) (temp ^ (temp >>> 32)); + result = prime * result + id; + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null) return false; + if (getClass() != obj.getClass()) return false; + Result other = (Result) obj; + if (Double.doubleToLongBits(distance) != Double.doubleToLongBits(other.distance)) return false; + if (id != other.id) return false; + return true; + } + } + + private void doRandomTest(int numDocs, int numQueries) throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + + for (int i = 0; i < numDocs; i++) { + double latRaw = -90 + 180.0 * random().nextDouble(); + double lonRaw = -180 + 360.0 * random().nextDouble(); + // pre-normalize up front, so we can just use quantized value for testing and do simple exact comparisons + double lat = LatLonPoint.decodeLatitude(LatLonPoint.encodeLatitude(latRaw)); + double lon = LatLonPoint.decodeLongitude(LatLonPoint.encodeLongitude(lonRaw)); + Document doc = new Document(); + doc.add(new StoredField("id", i)); + doc.add(new NumericDocValuesField("id", i)); + doc.add(new LatLonPoint("field", lat, lon)); + doc.add(new StoredField("lat", lat)); + doc.add(new StoredField("lon", lon)); + writer.addDocument(doc); + } + IndexReader reader = writer.getReader(); + IndexSearcher searcher = new IndexSearcher(reader); + + for (int i = 0; i < numQueries; i++) { + double lat = -90 + 180.0 * random().nextDouble(); + double lon = -180 + 360.0 * random().nextDouble(); + + Result expected[] = new Result[reader.maxDoc()]; + + for (int doc = 0; doc < reader.maxDoc(); doc++) { + Document targetDoc = reader.document(doc); + double docLatitude = targetDoc.getField("lat").numericValue().doubleValue(); + double docLongitude = targetDoc.getField("lon").numericValue().doubleValue(); + double distance = GeoDistanceUtils.haversin(lat, lon, docLatitude, docLongitude); + int id = targetDoc.getField("id").numericValue().intValue(); + expected[doc] = new Result(id, distance); + } + + Arrays.sort(expected); + + // randomize the topN a bit + int topN = TestUtil.nextInt(random(), 1, reader.maxDoc()); + // sort by distance, then ID + Sort sort = new Sort(LatLonPoint.newDistanceSort("field", lat, lon), + new SortField("id", SortField.Type.INT)); + + TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), topN, sort); + for (int resultNumber = 0; resultNumber < topN; resultNumber++) { + FieldDoc fieldDoc = (FieldDoc) topDocs.scoreDocs[resultNumber]; + Result actual = new Result((Integer) fieldDoc.fields[1], (Double) fieldDoc.fields[0]); + assertEquals(expected[resultNumber], actual); + } + } + reader.close(); + writer.close(); + dir.close(); + } +}