From 6ea458a0edaa4b2e30a2c31dcb703350ee3936c1 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Wed, 16 Mar 2016 09:25:06 -0400 Subject: [PATCH] LUCENE-7109: LatLonPoint.newPolygonQuery should use two-phase iterator --- lucene/CHANGES.txt | 3 + .../document/LatLonPointInPolygonQuery.java | 65 ++++++++++++++++--- .../TestLatLonPointInPolygonQuery.java | 49 ++++++++++++++ 3 files changed, 107 insertions(+), 10 deletions(-) create mode 100644 lucene/sandbox/src/test/org/apache/lucene/document/TestLatLonPointInPolygonQuery.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 3b4f507942c..be28e340736 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -23,6 +23,9 @@ Optimizations * LUCENE-7105: Optimize LatLonPoint's newDistanceQuery. (Robert Muir) +* LUCENE-7109: LatLonPoint's newPolygonQuery supports two-phase + iteration. (Robert Muir) + * LUCENE-7097: IntroSorter now recurses to 2 * log_2(count) quicksort stack depth before switching to heapsort (Adrien Grand, Mike McCandless) diff --git a/lucene/sandbox/src/java/org/apache/lucene/document/LatLonPointInPolygonQuery.java b/lucene/sandbox/src/java/org/apache/lucene/document/LatLonPointInPolygonQuery.java index fb9189f2bb6..cb988951b94 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/document/LatLonPointInPolygonQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/document/LatLonPointInPolygonQuery.java @@ -23,15 +23,23 @@ import org.apache.lucene.index.PointValues.IntersectVisitor; import org.apache.lucene.index.PointValues.Relation; import org.apache.lucene.search.ConstantScoreScorer; import org.apache.lucene.search.ConstantScoreWeight; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.TwoPhaseIterator; import org.apache.lucene.search.Weight; import org.apache.lucene.index.PointValues; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.index.DocValues; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.util.BitSet; import org.apache.lucene.util.DocIdSetBuilder; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.SparseFixedBitSet; import org.apache.lucene.spatial.util.GeoRelationUtils; import org.apache.lucene.spatial.util.GeoUtils; @@ -110,9 +118,6 @@ final class LatLonPointInPolygonQuery extends Query { // I don't use RandomAccessWeight here: it's no good to approximate with "match all docs"; this is an inverted structure and should be // used in the first pass: - // TODO: except that the polygon verify is costly! The approximation should be all docs in all overlapping cells, and matches() should - // then check the polygon - return new ConstantScoreWeight(this) { @Override @@ -130,22 +135,28 @@ final class LatLonPointInPolygonQuery extends Query { } LatLonPoint.checkCompatible(fieldInfo); + // approximation (postfiltering has not yet been applied) DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc()); + // subset of documents that need no postfiltering, this is purely an optimization + final BitSet preApproved; + // dumb heuristic: if the field is really sparse, use a sparse impl + if (values.getDocCount(field) * 100L < reader.maxDoc()) { + preApproved = new SparseFixedBitSet(reader.maxDoc()); + } else { + preApproved = new FixedBitSet(reader.maxDoc()); + } values.intersect(field, new IntersectVisitor() { @Override public void visit(int docID) { result.add(docID); + preApproved.set(docID); } @Override public void visit(int docID, byte[] packedValue) { - assert packedValue.length == 8; - double lat = LatLonPoint.decodeLatitude(packedValue, 0); - double lon = LatLonPoint.decodeLongitude(packedValue, Integer.BYTES); - if (GeoRelationUtils.pointInPolygon(polyLons, polyLats, lat, lon)) { - result.add(docID); - } + // TODO: range checks + result.add(docID); } @Override @@ -172,7 +183,41 @@ final class LatLonPointInPolygonQuery extends Query { } }); - return new ConstantScoreScorer(this, score(), result.build().iterator()); + DocIdSet set = result.build(); + final DocIdSetIterator disi = set.iterator(); + if (disi == null) { + return null; + } + + // return two-phase iterator using docvalues to postfilter candidates + SortedNumericDocValues docValues = DocValues.getSortedNumeric(reader, field); + TwoPhaseIterator iterator = new TwoPhaseIterator(disi) { + @Override + public boolean matches() throws IOException { + int docId = disi.docID(); + if (preApproved.get(docId)) { + return true; + } else { + docValues.setDocument(docId); + int count = docValues.count(); + for (int i = 0; i < count; i++) { + long encoded = docValues.valueAt(i); + double docLatitude = LatLonPoint.decodeLatitude((int)(encoded >> 32)); + double docLongitude = LatLonPoint.decodeLongitude((int)(encoded & 0xFFFFFFFF)); + if (GeoRelationUtils.pointInPolygon(polyLons, polyLats, docLatitude, docLongitude)) { + return true; + } + } + return false; + } + } + + @Override + public float matchCost() { + return 20 * polyLons.length; // TODO: make this fancier, but currently linear with number of vertices + } + }; + return new ConstantScoreScorer(this, score(), iterator); } }; } diff --git a/lucene/sandbox/src/test/org/apache/lucene/document/TestLatLonPointInPolygonQuery.java b/lucene/sandbox/src/test/org/apache/lucene/document/TestLatLonPointInPolygonQuery.java new file mode 100644 index 00000000000..de870277a33 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/document/TestLatLonPointInPolygonQuery.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.document; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; + +/** Simple tests for {@link LatLonPoint#newPolygonQuery} */ +public class TestLatLonPointInPolygonQuery extends LuceneTestCase { + + /** test we can search for a polygon */ + public void testBasics() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + + // add a doc with a point + Document document = new Document(); + document.add(new LatLonPoint("field", 18.313694, -65.227444)); + writer.addDocument(document); + + // search and verify we found our doc + IndexReader reader = writer.getReader(); + IndexSearcher searcher = newSearcher(reader); + assertEquals(1, searcher.count(LatLonPoint.newPolygonQuery("field", + new double[] { 18, 18, 19, 19, 18 }, + new double[] { -66, -65, -65, -66, -66 }))); + + reader.close(); + writer.close(); + dir.close(); + } +}