LUCENE-10425: Speed up IndexSortSortedNumericDocValuesRangeQuery#BoundedDocIdSetIterator construction using BKD binary search (#687)

2022-09-22 14:51:13 +08:00 · 2022-09-22 14:51:13 +08:00 · 5b24a233bd
parent bcc116057d
commit 5b24a233bd
3 changed files with 282 additions and 0 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -101,6 +101,9 @@ Improvements
 * GITHUB#11785: Improve Tessellator performance by delaying calls to the method
  #isIntersectingPolygon (Ignacio Vera)   

+* GITHUB#687: Speed up IndexSortSortedNumericDocValuesRangeQuery#BoundedDocIdSetIterator
+  construction using BKD binary search. (Jianping Weng)
+
 Bug Fixes
 ---------------------
 * GITHUB#11726: Indexing term vectors on large documents could fail due to
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/IndexSortSortedNumericDocValuesRangeQuery.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/IndexSortSortedNumericDocValuesRangeQuery.java
@ -18,12 +18,17 @@ package org.apache.lucene.sandbox.search;

 import java.io.IOException;
 import java.util.Objects;
+import java.util.function.Predicate;
+import org.apache.lucene.document.IntPoint;
+import org.apache.lucene.document.LongPoint;
 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.PointValues;
+import org.apache.lucene.index.PointValues.IntersectVisitor;
+import org.apache.lucene.index.PointValues.Relation;
 import org.apache.lucene.index.SortedNumericDocValues;
 import org.apache.lucene.search.ConstantScoreScorer;
 import org.apache.lucene.search.ConstantScoreWeight;
@ -43,6 +48,8 @@ import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.SortField.Type;
 import org.apache.lucene.search.SortedNumericSortField;
 import org.apache.lucene.search.Weight;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.ArrayUtil.ByteArrayComparator;

 /**
 * A range query that can take advantage of the fact that the index is sorted to speed up execution.
@ -214,12 +221,172 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
    };
  }

+  /**
+   * Finds the first document whose packed value is greater than the provided packed value, or
+   * also equal to it when {@code allowEqual} is true.
+   *
+   * @param values one-dimensional point values to search
+   * @param packedValue the packed value that candidates are compared against, using unsigned byte
+   *     order
+   * @param allowEqual whether a packed value equal to {@code packedValue} is accepted
+   * @return the document found, or -1 if no packed value is accepted
+   * @throws IOException if the point tree cannot be read
+   */
+  public final int nextDoc(PointValues values, byte[] packedValue, boolean allowEqual)
+      throws IOException {
+    assert values.getNumDimensions() == 1;
+    final ByteArrayComparator unsignedOrder =
+        ArrayUtil.getUnsignedComparator(values.getBytesPerDimension());
+    final Predicate<byte[]> accepted =
+        candidate -> {
+          final int order = unsignedOrder.compare(candidate, 0, packedValue, 0);
+          // Strictly greater values always pass; equal values only when allowEqual is set.
+          return allowEqual ? order >= 0 : order > 0;
+        };
+    return nextDoc(values.getPointTree(), accepted);
+  }
+
+  /**
+   * Searches the subtree at the current position of {@code pointTree} for the first visited
+   * document whose packed value satisfies {@code biggerThan}.
+   *
+   * @return the doc ID found, or -1 if no value in this subtree satisfies the predicate
+   */
+  private int nextDoc(PointValues.PointTree pointTree, Predicate<byte[]> biggerThan)
+      throws IOException {
+    if (biggerThan.test(pointTree.getMaxPackedValue()) == false) {
+      // Even the node's maximum value fails the predicate, so nothing below can pass.
+      return -1;
+    }
+
+    if (pointTree.moveToChild() == false) {
+      // No child to descend into: this is a leaf, scan its values for the first match.
+      final int[] firstMatch = {-1};
+      pointTree.visitDocValues(
+          new IntersectVisitor() {
+            @Override
+            public void visit(int docID) {
+              throw new AssertionError("Invalid call to visit(docID)");
+            }
+
+            @Override
+            public void visit(int docID, byte[] packedValue) {
+              if (firstMatch[0] != -1) {
+                // A match was already recorded; later values are ignored.
+                return;
+              }
+              if (biggerThan.test(packedValue)) {
+                firstMatch[0] = docID;
+              }
+            }
+
+            @Override
+            public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+              return Relation.CELL_CROSSES_QUERY;
+            }
+          });
+      return firstMatch[0];
+    }
+
+    // Inner node: probe the children in order and stop at the first one that yields a match.
+    int found;
+    do {
+      found = nextDoc(pointTree, biggerThan);
+    } while (found == -1 && pointTree.moveToSibling());
+    if (found == -1) {
+      // All children were exhausted without a match; restore the cursor to this node.
+      pointTree.moveToParent();
+    }
+    return found;
+  }
+
+  /**
+   * Tells whether the query range is disjoint from the range of indexed values in at least one
+   * dimension, in which case no point can match.
+   *
+   * @param points the indexed point values
+   * @param queryLowerPoint packed lower bound of the query
+   * @param queryUpperPoint packed upper bound of the query
+   * @return {@code true} if the indexed minimum exceeds the upper bound or the indexed maximum is
+   *     below the lower bound, for some dimension
+   */
+  private boolean matchNone(PointValues points, byte[] queryLowerPoint, byte[] queryUpperPoint)
+      throws IOException {
+    final int bytesPerDim = points.getBytesPerDimension();
+    final ByteArrayComparator cmp = ArrayUtil.getUnsignedComparator(bytesPerDim);
+    final int numDims = points.getNumDimensions();
+    for (int dim = 0, at = 0; dim < numDims; dim++, at += bytesPerDim) {
+      // Every indexed value sits above the query's upper bound.
+      if (cmp.compare(points.getMinPackedValue(), at, queryUpperPoint, at) > 0) {
+        return true;
+      }
+      // Every indexed value sits below the query's lower bound.
+      if (cmp.compare(points.getMaxPackedValue(), at, queryLowerPoint, at) < 0) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Tells whether the range of indexed values is fully contained in the query range, in every
+   * dimension.
+   *
+   * @param points the indexed point values
+   * @param queryLowerPoint packed lower bound of the query
+   * @param queryUpperPoint packed upper bound of the query
+   * @return {@code true} if the indexed minimum is not below the lower bound and the indexed
+   *     maximum is not above the upper bound, for all dimensions
+   */
+  private boolean matchAll(PointValues points, byte[] queryLowerPoint, byte[] queryUpperPoint)
+      throws IOException {
+    final int bytesPerDim = points.getBytesPerDimension();
+    final ByteArrayComparator cmp = ArrayUtil.getUnsignedComparator(bytesPerDim);
+    final int numDims = points.getNumDimensions();
+    for (int dim = 0, at = 0; dim < numDims; dim++, at += bytesPerDim) {
+      // Indexed range and query range do not overlap at all.
+      if (cmp.compare(points.getMinPackedValue(), at, queryUpperPoint, at) > 0
+          || cmp.compare(points.getMaxPackedValue(), at, queryLowerPoint, at) < 0) {
+        return false;
+      }
+      // Some indexed value lies below the query's lower bound.
+      if (cmp.compare(points.getMinPackedValue(), at, queryLowerPoint, at) < 0) {
+        return false;
+      }
+      // Some indexed value lies above the query's upper bound.
+      if (cmp.compare(points.getMaxPackedValue(), at, queryUpperPoint, at) > 0) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  private BoundedDocIdSetIterator getDocIdSetIteratorOrNullFromBkd(
+      LeafReaderContext context, DocIdSetIterator delegate) throws IOException {
+    Sort indexSort = context.reader().getMetaData().getSort();
+    if (indexSort != null
+        && indexSort.getSort().length > 0
+        && indexSort.getSort()[0].getField().equals(field)
+        && indexSort.getSort()[0].getReverse() == false) {
+      PointValues points = context.reader().getPointValues(field);
+      if (points == null) {
+        return null;
+      }
+
+      if (points.getNumDimensions() != 1) {
+        return null;
+      }
+
+      if (points.getBytesPerDimension() != Long.BYTES
+          && points.getBytesPerDimension() != Integer.BYTES) {
+        return null;
+      }
+
+      // Each doc that has points has exactly one point.
+      if (points.size() == points.getDocCount()) {
+
+        byte[] queryLowerPoint;
+        byte[] queryUpperPoint;
+        if (points.getBytesPerDimension() == Integer.BYTES) {
+          queryLowerPoint = IntPoint.pack((int) lowerValue).bytes;
+          queryUpperPoint = IntPoint.pack((int) upperValue).bytes;
+        } else {
+          queryLowerPoint = LongPoint.pack(lowerValue).bytes;
+          queryUpperPoint = LongPoint.pack(upperValue).bytes;
+        }
+        if (lowerValue > upperValue || matchNone(points, queryLowerPoint, queryUpperPoint)) {
+          return new BoundedDocIdSetIterator(0, 0, null);
+        }
+        int minDocId, maxDocId;
+        if (matchAll(points, queryLowerPoint, queryUpperPoint)) {
+          minDocId = 0;
+          maxDocId = context.reader().maxDoc();
+        } else {
+          // >=queryLowerPoint
+          minDocId = nextDoc(points, queryLowerPoint, true);
+
+          if (minDocId == -1) {
+            return new BoundedDocIdSetIterator(0, 0, null);
+          }
+          // >queryUpperPoint,
+          maxDocId = nextDoc(points, queryUpperPoint, false);
+          if (maxDocId == -1) {
+            maxDocId = context.reader().maxDoc();
+          }
+        }
+
+        if ((points.getDocCount() == context.reader().maxDoc())) {
+          return new BoundedDocIdSetIterator(minDocId, maxDocId, null);
+        } else {
+          return new BoundedDocIdSetIterator(minDocId, maxDocId, delegate);
+        }
+      }
+    }
+    return null;
+  }
+
  private BoundedDocIdSetIterator getDocIdSetIteratorOrNull(LeafReaderContext context)
      throws IOException {
    SortedNumericDocValues sortedNumericValues =
        DocValues.getSortedNumeric(context.reader(), field);
    NumericDocValues numericValues = DocValues.unwrapSingleton(sortedNumericValues);
    if (numericValues != null) {
+      BoundedDocIdSetIterator iterator = getDocIdSetIteratorOrNullFromBkd(context, numericValues);
+      if (iterator != null) {
+        return iterator;
+      }
      Sort indexSort = context.reader().getMetaData().getSort();
      if (indexSort != null
          && indexSort.getSort().length > 0
--- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestIndexSortSortedNumericDocValuesRangeQuery.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestIndexSortSortedNumericDocValuesRangeQuery.java
@ -19,6 +19,7 @@ package org.apache.lucene.sandbox.search;
 import static org.hamcrest.CoreMatchers.instanceOf;

 import java.io.IOException;
+import java.util.Random;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.LongPoint;
@ -641,4 +642,115 @@ public class TestIndexSortSortedNumericDocValuesRangeQuery extends LuceneTestCas
    return new IndexSortSortedNumericDocValuesRangeQuery(
        field, lowerValue, upperValue, fallbackQuery);
  }
+
+  /**
+   * Checks per-leaf counts of the index-sort range query against fixed expectations on a single
+   * merged segment holding 500 documents for each of the values 5 to 9.
+   */
+  public void testCountWithBkd() throws IOException {
+    final String fieldName = "field";
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
+    iwc.setIndexSort(new Sort(new SortedNumericSortField(fieldName, SortField.Type.LONG, false)));
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
+    // Values are deliberately added out of order.
+    for (long value : new long[] {6, 5, 8, 9, 7}) {
+      addDocWithBkd(writer, fieldName, value, 500);
+    }
+    writer.flush();
+    writer.forceMerge(1);
+    IndexReader reader = writer.getReader();
+    IndexSearcher searcher = newSearcher(reader);
+
+    // Each row is {lower bound, upper bound, expected count per leaf}.
+    final long[][] cases = {
+      {6, 8, 1500},
+      {6, 10, 2000},
+      {4, 6, 1000},
+      {2, 10, 2500},
+      {2, 3, 0},
+      {10, 11, 0},
+    };
+    for (long[] row : cases) {
+      final long lower = row[0];
+      final long upper = row[1];
+      final long expectedCount = row[2];
+      Query fallbackQuery = LongPoint.newRangeQuery(fieldName, lower, upper);
+      Query query =
+          new IndexSortSortedNumericDocValuesRangeQuery(fieldName, lower, upper, fallbackQuery);
+      Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
+      for (LeafReaderContext context : searcher.getLeafContexts()) {
+        assertEquals(expectedCount, weight.count(context));
+      }
+    }
+
+    writer.close();
+    reader.close();
+    dir.close();
+  }
+
+  /**
+   * Compares per-leaf counts of the index-sort range query with those of a plain point range
+   * query over randomly generated data and random query bounds.
+   */
+  public void testRandomCountWithBkd() throws IOException {
+    final String fieldName = "field";
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
+    iwc.setIndexSort(new Sort(new SortedNumericSortField(fieldName, SortField.Type.LONG, false)));
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
+    Random random = random();
+    for (int batch = 0; batch < 100; batch++) {
+      // Draw the value first and the repeat count second.
+      final int value = random.nextInt(1000);
+      final int repeat = random.nextInt(1000);
+      addDocWithBkd(writer, fieldName, value, repeat);
+    }
+    writer.flush();
+    writer.forceMerge(1);
+    IndexReader reader = writer.getReader();
+    IndexSearcher searcher = newSearcher(reader);
+
+    for (int round = 0; round < 100; round++) {
+      final int first = random.nextInt(1100);
+      final int second = random.nextInt(1100);
+      final int low = Math.min(first, second);
+      final int upper = Math.max(first, second);
+      Query rangeQuery = LongPoint.newRangeQuery(fieldName, low, upper);
+      Query indexSortRangeQuery =
+          new IndexSortSortedNumericDocValuesRangeQuery(fieldName, low, upper, rangeQuery);
+      Weight indexSortRangeQueryWeight =
+          indexSortRangeQuery.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
+      Weight rangeQueryWeight = rangeQuery.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
+      for (LeafReaderContext context : searcher.getLeafContexts()) {
+        final int expectedCount = rangeQueryWeight.count(context);
+        final int actualCount = indexSortRangeQueryWeight.count(context);
+        assertEquals(expectedCount, actualCount);
+      }
+    }
+
+    writer.close();
+    reader.close();
+    dir.close();
+  }
+
+  /**
+   * Adds {@code repeat} documents to the writer, each carrying {@code value} both as a sorted
+   * numeric doc value and as a long point under {@code field}.
+   */
+  private void addDocWithBkd(RandomIndexWriter indexWriter, String field, long value, int repeat)
+      throws IOException {
+    for (int remaining = repeat; remaining > 0; remaining--) {
+      Document document = new Document();
+      document.add(new SortedNumericDocValuesField(field, value));
+      document.add(new LongPoint(field, value));
+      indexWriter.addDocument(document);
+    }
+  }
 }