LUCENE-6519: make BKDPointInPolygonQueries much faster: avoid the per-hit polygon check when a leaf cell is fully contained by the polygon

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1684719 13f79535-47bb-0310-9956-ffa450edef68
2015-06-10 17:23:06 +00:00 · 2015-06-10 17:23:06 +00:00 · bbadb6fc42
parent f346fc0f21
commit bbadb6fc42
3 changed files with 65 additions and 80 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -66,6 +66,10 @@ New Features
  wrote the oldest segment in the index, for faster checking of "too
  old" indices (Ryan Ernst, Robert Muir, Mike McCandless)

+* LUCENE-6519: BKDPointInPolygonQuery is much faster by avoiding
+  the per-hit polygon check when a leaf cell is fully contained by the
+  polygon.  (Nick Knize, Mike McCandless)
+
 API Changes

 * LUCENE-6508: Simplify Lock api, there is now just 
--- a/lucene/sandbox/src/java/org/apache/lucene/bkdtree/BKDPointInPolygonQuery.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/bkdtree/BKDPointInPolygonQuery.java
@ -17,10 +17,6 @@ package org.apache.lucene.bkdtree;
 * limitations under the License.
 */

-import java.io.IOException;
-import java.util.Arrays;
-import java.util.Set;
-
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.SortedNumericDocValues;
@ -33,8 +29,13 @@ import org.apache.lucene.search.Query;
 import org.apache.lucene.search.Scorer;
 import org.apache.lucene.search.Weight;
 import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.GeoUtils;
 import org.apache.lucene.util.ToStringUtils;

+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Set;
+
 /** Finds all previously indexed points that fall within the specified polygon.
 *
 *  <p>The field must be indexed with {@link BKDTreeDocValuesFormat}, and {@link BKDPointField} added per document.
@ -74,6 +75,8 @@ public class BKDPointInPolygonQuery extends Query {
    this.polyLats = polyLats;
    this.polyLons = polyLons;

+    // TODO: we could also compute the maximal innner bounding box, to make relations faster to compute?
+
    double minLon = Double.POSITIVE_INFINITY;
    double minLat = Double.POSITIVE_INFINITY;
    double maxLon = Double.NEGATIVE_INFINITY;
@ -161,7 +164,22 @@ public class BKDPointInPolygonQuery extends Query {
                                         new BKDTreeReader.LatLonFilter() {
                                           @Override
                                           public boolean accept(double lat, double lon) {
-                                             return pointInPolygon(lat, lon);
+                                             return GeoUtils.pointInPolygon(polyLons, polyLats, lat, lon);
+                                           }
+
+                                           @Override
+                                           public BKDTreeReader.Relation compare(double cellLatMin, double cellLatMax, double cellLonMin, double cellLonMax) {
+                                             if (GeoUtils.rectWithinPoly(cellLonMin, cellLatMin, cellLonMax, cellLatMax,
+                                                                         polyLons, polyLats,
+                                                                         minLon, minLat, maxLon, maxLat)) {
+                                               return BKDTreeReader.Relation.INSIDE;
+                                             } else if (GeoUtils.rectCrossesPoly(cellLonMin, cellLatMin, cellLonMax, cellLatMax,
+                                                                                 polyLons, polyLats,
+                                                                                 minLon, minLat, maxLon, maxLat)) {
+                                               return BKDTreeReader.Relation.CROSSES;
+                                             } else {
+                                               return BKDTreeReader.Relation.OUTSIDE;
+                                             }
                                           }
                                         }, treeDV.delegate);

@ -203,36 +221,6 @@ public class BKDPointInPolygonQuery extends Query {
    };
  }

-  // TODO: share w/ GeoUtils:
-
-  /**
-   * simple even-odd point in polygon computation
-   *    1.  Determine if point is contained in the longitudinal range
-   *    2.  Determine whether point crosses the edge by computing the latitudinal delta
-   *        between the end-point of a parallel vector (originating at the point) and the
-   *        y-component of the edge sink
-   *
-   * NOTE: Requires polygon point (x,y) order either clockwise or counter-clockwise
-   */
-  boolean pointInPolygon(double lat, double lon) {
-    /**
-     * Note: This is using a euclidean coordinate system which could result in
-     * upwards of 110KM error at the equator.
-     * TODO convert coordinates to cylindrical projection (e.g. mercator)
-     */
-
-    // TODO: this quantizes a bit differently ... boundary cases will fail here:
-    boolean inPoly = false;
-    for (int i = 1; i < polyLons.length; i++) {
-      if (polyLons[i] <= lon && polyLons[i-1] > lon || polyLons[i-1] <= lon && polyLons[i] > lon) {
-        if (polyLats[i] + (lon - polyLons[i]) / (polyLons[i-1] - polyLons[i]) * (polyLats[i-1] - polyLats[i]) <= lat) {
-          inPoly = !inPoly;
-        }
-      }
-    }
-    return inPoly;
-  }
-
  @Override
  @SuppressWarnings({"unchecked","rawtypes"})
  public boolean equals(Object o) {
--- a/lucene/sandbox/src/java/org/apache/lucene/bkdtree/BKDTreeReader.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/bkdtree/BKDTreeReader.java
@ -40,8 +40,11 @@ final class BKDTreeReader implements Accountable {
  final int maxDoc;
  final IndexInput in;

+  enum Relation {INSIDE, CROSSES, OUTSIDE};
+
  interface LatLonFilter {
    boolean accept(double lat, double lon);
+    Relation compare(double latMin, double latMax, double lonMin, double lonMax);
  }

  public BKDTreeReader(IndexInput in, int maxDoc) throws IOException {
@ -137,7 +140,19 @@ final class BKDTreeReader implements Accountable {

  /** Fast path: this is called when the query rect fully encompasses all cells under this node. */
  private int addAll(Bits acceptDocs, QueryState state, int nodeID) throws IOException {
+
+    //long latRange = (long) cellLatMaxEnc - (long) cellLatMinEnc;
+    //long lonRange = (long) cellLonMaxEnc - (long) cellLonMinEnc;
+
    if (nodeID >= leafNodeOffset) {
+
+      /*
+      System.out.println("A: " + BKDTreeWriter.decodeLat(cellLatMinEnc)
+                         + " " + BKDTreeWriter.decodeLat(cellLatMaxEnc)
+                         + " " + BKDTreeWriter.decodeLon(cellLonMinEnc)
+                         + " " + BKDTreeWriter.decodeLon(cellLonMaxEnc));
+      */
+
      // Leaf node
      long fp = leafBlockFPs[nodeID-leafNodeOffset];
      //System.out.println("    leaf nodeID=" + nodeID + " vs leafNodeOffset=" + leafNodeOffset + " fp=" + fp);
@ -152,40 +167,7 @@ final class BKDTreeReader implements Accountable {
      //System.out.println("    seek to leafFP=" + fp);
      // How many points are stored in this leaf cell:
      int count = state.in.readVInt();
-      if (state.latLonFilter != null) {
-        // Handle this differently since we must also look up lat/lon:
-
-        int hitCount = 0;
-        for(int i=0;i<count;i++) {
-
-          int docID = state.in.readInt();
-          
-          if (acceptDocs == null || acceptDocs.get(docID)) {
-
-            state.sndv.setDocument(docID);
-
-            // How many values this doc has:
-            int docValueCount = state.sndv.count();
-            for(int j=0;j<docValueCount;j++) {
-              long enc = state.sndv.valueAt(j);
-              int latEnc = (int) ((enc>>32) & 0xffffffffL);
-              int lonEnc = (int) (enc & 0xffffffffL);
-
-              // TODO: maybe we can fix LatLonFilter to operate on encoded forms?
-              if (state.latLonFilter.accept(BKDTreeWriter.decodeLat(latEnc), BKDTreeWriter.decodeLon(lonEnc))) {
-                state.bits.set(docID);
-                hitCount++;
-
-                // Stop processing values for this doc since it's now accepted:
-                break;
-              }
-            }
-          }
-        }
-
-        return hitCount;
-
-      } else if (acceptDocs != null) {
+      if (acceptDocs != null) {
        for(int i=0;i<count;i++) {
          int docID = state.in.readInt();
          if (acceptDocs.get(docID)) {
@ -213,8 +195,7 @@ final class BKDTreeReader implements Accountable {
      //System.out.println("  splitValue=" + splitValue);

      //System.out.println("  addAll: inner");
-      int count = 0;
-      count += addAll(acceptDocs, state, 2*nodeID);
+      int count = addAll(acceptDocs, state, 2*nodeID);
      count += addAll(acceptDocs, state, 2*nodeID+1);
      //System.out.println("  addAll: return count=" + count);
      return count;
@ -227,17 +208,29 @@ final class BKDTreeReader implements Accountable {
    throws IOException {

    // 2.06 sec -> 1.52 sec for 225 OSM London queries:
-    if (state.latMinEnc <= cellLatMinEnc && state.latMaxEnc >= cellLatMaxEnc && state.lonMinEnc <= cellLonMinEnc && state.lonMaxEnc >= cellLonMaxEnc) {
+    if (state.latLonFilter != null) {
+      if (cellLatMinEnc > state.latMinEnc ||
+          cellLatMaxEnc < state.latMaxEnc ||
+          cellLonMinEnc > state.lonMinEnc ||
+          cellLonMaxEnc < state.lonMaxEnc) {
+        Relation r = state.latLonFilter.compare(BKDTreeWriter.decodeLat(cellLatMinEnc),
+                                                BKDTreeWriter.decodeLat(cellLatMaxEnc),
+                                                BKDTreeWriter.decodeLon(cellLonMinEnc),
+                                                BKDTreeWriter.decodeLon(cellLonMaxEnc));
+        //System.out.println("BKD.intersect cellLat=" + BKDTreeWriter.decodeLat(cellLatMinEnc) + " TO " + BKDTreeWriter.decodeLat(cellLatMaxEnc) + ", cellLon=" + BKDTreeWriter.decodeLon(cellLonMinEnc) + " TO " + BKDTreeWriter.decodeLon(cellLonMaxEnc) + " compare=" + r);
+        if (r == Relation.OUTSIDE) {
+          // This cell is fully outside of the query shape: stop recursing
+          return 0;
+        } else if (r == Relation.INSIDE) {
+          // This cell is fully inside of the query shape: recursively add all points in this cell without filtering
+          return addAll(acceptDocs, state, nodeID);
+        } else {
+          // The cell crosses the shape boundary, so we fall through and do full filtering
+        }
+      }
+    } else if (state.latMinEnc <= cellLatMinEnc && state.latMaxEnc >= cellLatMaxEnc && state.lonMinEnc <= cellLonMinEnc && state.lonMaxEnc >= cellLonMaxEnc) {
      // Optimize the case when the query fully contains this cell: we can
      // recursively add all points without checking if they match the query:
-
-      /*
-      System.out.println("A: " + BKDTreeWriter.decodeLat(cellLatMinEnc)
-                         + " " + BKDTreeWriter.decodeLat(cellLatMaxEnc)
-                         + " " + BKDTreeWriter.decodeLon(cellLonMinEnc)
-                         + " " + BKDTreeWriter.decodeLon(cellLonMaxEnc));
-      */
-
      return addAll(acceptDocs, state, nodeID);
    }