LUCENE-10425: Speed up IndexSortSortedNumericDocValuesRangeQuery#BoundedDocIdSetIterator construction using BKD binary search (#687)

2022-09-22 14:51:13 +08:00 · 2022-09-22 14:51:13 +08:00 · 5b24a233bd
parent bcc116057d
commit 5b24a233bd
3 changed files with 282 additions and 0 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -101,6 +101,9 @@ Improvements
 * GITHUB#11785: Improve Tessellator performance by delaying calls to the method
  #isIntersectingPolygon (Ignacio Vera)   
 * GITHUB#687: Speed up IndexSortSortedNumericDocValuesRangeQuery#BoundedDocIdSetIterator
  construction using BKD binary search. (Jianping Weng)
 Bug Fixes
 ---------------------
 * GITHUB#11726: Indexing term vectors on large documents could fail due to
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/IndexSortSortedNumericDocValuesRangeQuery.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/IndexSortSortedNumericDocValuesRangeQuery.java
@ -18,12 +18,17 @@ package org.apache.lucene.sandbox.search;
 import java.io.IOException;
 import java.util.Objects;
 import java.util.function.Predicate;
 import org.apache.lucene.document.IntPoint;
 import org.apache.lucene.document.LongPoint;
 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.PointValues;
 import org.apache.lucene.index.PointValues.IntersectVisitor;
 import org.apache.lucene.index.PointValues.Relation;
 import org.apache.lucene.index.SortedNumericDocValues;
 import org.apache.lucene.search.ConstantScoreScorer;
 import org.apache.lucene.search.ConstantScoreWeight;
@ -43,6 +48,8 @@ import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.SortField.Type;
 import org.apache.lucene.search.SortedNumericSortField;
 import org.apache.lucene.search.Weight;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.ArrayUtil.ByteArrayComparator;
 /**
 * A range query that can take advantage of the fact that the index is sorted to speed up execution.
@ -214,12 +221,172 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
    };
  }
  /**
   * Finds the first visited document of the given one-dimensional {@link PointValues} whose packed
   * value is strictly greater than {@code packedValue}, or greater than or equal to it when {@code
   * allowEqual} is {@code true}.
   *
   * @param values point values to search; asserted to have exactly one dimension
   * @param packedValue packed bound to compare against, in unsigned byte order
   * @param allowEqual whether a value equal to {@code packedValue} also qualifies
   * @return the doc ID that was found, or -1 if no packed value qualifies
   * @throws IOException if reading the point tree fails
   */
  public final int nextDoc(PointValues values, byte[] packedValue, boolean allowEqual)
      throws IOException {
    assert values.getNumDimensions() == 1;
    final ByteArrayComparator unsignedOrder =
        ArrayUtil.getUnsignedComparator(values.getBytesPerDimension());
    final Predicate<byte[]> qualifies =
        candidate -> {
          final int order = unsignedOrder.compare(candidate, 0, packedValue, 0);
          // Equality only counts when the caller asked for an inclusive bound.
          return allowEqual ? order >= 0 : order > 0;
        };
    return nextDoc(values.getPointTree(), qualifies);
  }
  /**
   * Recursively searches the subtree under the current position of {@code pointTree} for the first
   * visited document whose packed value satisfies {@code biggerThan}.
   *
   * <p>When a match is found the cursor is left where the match was located; when an inner node
   * has no match the cursor is moved back to that node before returning.
   *
   * @param pointTree tree cursor positioned on the node to search
   * @param biggerThan predicate applied to packed values
   * @return the first matching doc ID, or -1 if nothing under this node matches
   * @throws IOException if reading the point tree fails
   */
  private int nextDoc(PointValues.PointTree pointTree, Predicate<byte[]> biggerThan)
      throws IOException {
    if (biggerThan.test(pointTree.getMaxPackedValue()) == false) {
      // Even the largest value under this node fails the predicate: nothing to find here.
      return -1;
    }

    if (pointTree.moveToChild() == false) {
      // No child to descend into: scan the values of this node and remember the first match.
      final int[] firstMatch = {-1};
      pointTree.visitDocValues(
          new IntersectVisitor() {
            @Override
            public void visit(int docID) {
              throw new AssertionError("Invalid call to visit(docID)");
            }

            @Override
            public void visit(int docID, byte[] packedValue) {
              if (firstMatch[0] != -1) {
                // A match was already recorded; later values are ignored.
                return;
              }
              if (biggerThan.test(packedValue)) {
                firstMatch[0] = docID;
              }
            }

            @Override
            public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
              return Relation.CELL_CROSSES_QUERY;
            }
          });
      return firstMatch[0];
    }

    // Inner node: try each child in order and stop at the first one that yields a match.
    do {
      final int found = nextDoc(pointTree, biggerThan);
      if (found != -1) {
        return found;
      }
    } while (pointTree.moveToSibling());
    pointTree.moveToParent();
    return -1;
  }
  /**
   * Tells whether the query range cannot match any indexed point, judging only from the global
   * min/max packed values of {@code points}.
   *
   * @param points point values whose global bounds are inspected
   * @param queryLowerPoint packed inclusive lower bound of the query
   * @param queryUpperPoint packed inclusive upper bound of the query
   * @return {@code true} if, in at least one dimension, the smallest indexed value is above the
   *     query's upper bound or the largest indexed value is below the query's lower bound
   * @throws IOException if reading the min/max packed values fails
   */
  private boolean matchNone(PointValues points, byte[] queryLowerPoint, byte[] queryUpperPoint)
      throws IOException {
    // These do not change between iterations, so fetch them once instead of per comparison.
    final int bytesPerDim = points.getBytesPerDimension();
    final int numDims = points.getNumDimensions();
    final byte[] minPackedValue = points.getMinPackedValue();
    final byte[] maxPackedValue = points.getMaxPackedValue();
    final ByteArrayComparator comparator = ArrayUtil.getUnsignedComparator(bytesPerDim);
    for (int dim = 0; dim < numDims; dim++) {
      final int offset = dim * bytesPerDim;
      if (comparator.compare(minPackedValue, offset, queryUpperPoint, offset) > 0
          || comparator.compare(maxPackedValue, offset, queryLowerPoint, offset) < 0) {
        return true;
      }
    }
    return false;
  }
  /**
   * Tells whether every indexed point falls inside the query range, judging only from the global
   * min/max packed values of {@code points}.
   *
   * @param points point values whose global bounds are inspected
   * @param queryLowerPoint packed inclusive lower bound of the query
   * @param queryUpperPoint packed inclusive upper bound of the query
   * @return {@code true} if, in every dimension, the smallest indexed value is at or above the
   *     query's lower bound and the largest indexed value is at or below its upper bound
   * @throws IOException if reading the min/max packed values fails
   */
  private boolean matchAll(PointValues points, byte[] queryLowerPoint, byte[] queryUpperPoint)
      throws IOException {
    // These do not change between iterations, so fetch them once instead of per comparison.
    final int bytesPerDim = points.getBytesPerDimension();
    final int numDims = points.getNumDimensions();
    final byte[] minPackedValue = points.getMinPackedValue();
    final byte[] maxPackedValue = points.getMaxPackedValue();
    final ByteArrayComparator comparator = ArrayUtil.getUnsignedComparator(bytesPerDim);
    for (int dim = 0; dim < numDims; dim++) {
      final int offset = dim * bytesPerDim;
      // Containment check. Assuming min <= max in each dimension (NOTE(review): expected of
      // PointValues, confirm), lower <= min and max <= upper already imply that the two ranges
      // overlap, so no separate "min > upper" / "max < lower" tests are needed.
      if (comparator.compare(minPackedValue, offset, queryLowerPoint, offset) < 0
          || comparator.compare(maxPackedValue, offset, queryUpperPoint, offset) > 0) {
        return false;
      }
    }
    return true;
  }
  /**
   * Tries to derive the doc ID range matching {@code [lowerValue, upperValue]} from the points
   * index of {@code field}.
   *
   * <p>The shortcut is only attempted when the segment's primary index sort is on {@code field} in
   * non-reversed order, the field has one-dimensional points of {@code int} or {@code long} width,
   * and the number of points equals the number of documents that have points. In every other case
   * {@code null} is returned.
   *
   * @param context the leaf whose reader is inspected
   * @param delegate iterator handed to the returned {@link BoundedDocIdSetIterator} when not every
   *     document of the segment has a point; {@code null} is handed over otherwise
   * @return an iterator bounded to {@code [minDocId, maxDocId)} (an empty {@code (0, 0)} range when
   *     nothing can match), or {@code null} if the shortcut does not apply
   * @throws IOException if reading the points index fails
   */
  private BoundedDocIdSetIterator getDocIdSetIteratorOrNullFromBkd(
      LeafReaderContext context, DocIdSetIterator delegate) throws IOException {
    final LeafReader reader = context.reader();
    final Sort indexSort = reader.getMetaData().getSort();
    if (indexSort == null
        || indexSort.getSort().length == 0
        || indexSort.getSort()[0].getField().equals(field) == false
        || indexSort.getSort()[0].getReverse()) {
      return null;
    }

    final PointValues points = reader.getPointValues(field);
    if (points == null || points.getNumDimensions() != 1) {
      return null;
    }
    final int bytesPerDim = points.getBytesPerDimension();
    if (bytesPerDim != Long.BYTES && bytesPerDim != Integer.BYTES) {
      return null;
    }
    // Only proceed when each doc that has points has exactly one point.
    if (points.size() != points.getDocCount()) {
      return null;
    }

    if (lowerValue > upperValue) {
      return new BoundedDocIdSetIterator(0, 0, null);
    }

    final byte[] queryLowerPoint;
    final byte[] queryUpperPoint;
    if (bytesPerDim == Integer.BYTES) {
      // The query bounds are longs while the indexed points are ints. A plain (int) cast would
      // wrap around for bounds outside the int range (e.g. Long.MAX_VALUE becomes -1), so a range
      // lying entirely outside the int range matches nothing and other bounds are clamped.
      if (lowerValue > Integer.MAX_VALUE || upperValue < Integer.MIN_VALUE) {
        return new BoundedDocIdSetIterator(0, 0, null);
      }
      queryLowerPoint = IntPoint.pack((int) Math.max(lowerValue, Integer.MIN_VALUE)).bytes;
      queryUpperPoint = IntPoint.pack((int) Math.min(upperValue, Integer.MAX_VALUE)).bytes;
    } else {
      queryLowerPoint = LongPoint.pack(lowerValue).bytes;
      queryUpperPoint = LongPoint.pack(upperValue).bytes;
    }

    if (matchNone(points, queryLowerPoint, queryUpperPoint)) {
      return new BoundedDocIdSetIterator(0, 0, null);
    }

    final int maxDoc = reader.maxDoc();
    final int minDocId;
    int maxDocId;
    if (matchAll(points, queryLowerPoint, queryUpperPoint)) {
      minDocId = 0;
      maxDocId = maxDoc;
    } else {
      // First doc whose value is >= queryLowerPoint.
      minDocId = nextDoc(points, queryLowerPoint, true);
      if (minDocId == -1) {
        return new BoundedDocIdSetIterator(0, 0, null);
      }
      // First doc whose value is > queryUpperPoint; used as the exclusive end of the range.
      maxDocId = nextDoc(points, queryUpperPoint, false);
      if (maxDocId == -1) {
        maxDocId = maxDoc;
      }
    }

    // The delegate is only passed along when some documents of the segment have no point.
    final DocIdSetIterator filter = points.getDocCount() == maxDoc ? null : delegate;
    return new BoundedDocIdSetIterator(minDocId, maxDocId, filter);
  }
  private BoundedDocIdSetIterator getDocIdSetIteratorOrNull(LeafReaderContext context)
      throws IOException {
    SortedNumericDocValues sortedNumericValues =
        DocValues.getSortedNumeric(context.reader(), field);
    NumericDocValues numericValues = DocValues.unwrapSingleton(sortedNumericValues);
    if (numericValues != null) {
      BoundedDocIdSetIterator iterator = getDocIdSetIteratorOrNullFromBkd(context, numericValues);
      if (iterator != null) {
        return iterator;
      }
      Sort indexSort = context.reader().getMetaData().getSort();
      if (indexSort != null
          && indexSort.getSort().length > 0
--- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestIndexSortSortedNumericDocValuesRangeQuery.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestIndexSortSortedNumericDocValuesRangeQuery.java
@ -19,6 +19,7 @@ package org.apache.lucene.sandbox.search;
 import static org.hamcrest.CoreMatchers.instanceOf;
 import java.io.IOException;
 import java.util.Random;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.LongPoint;
@ -641,4 +642,115 @@ public class TestIndexSortSortedNumericDocValuesRangeQuery extends LuceneTestCas
    return new IndexSortSortedNumericDocValuesRangeQuery(
        field, lowerValue, upperValue, fallbackQuery);
  }
  /**
   * Indexes 500 documents for each of the values 5 to 9 (added out of order) into an index sorted
   * on the field, merges to a single segment, and checks {@code Weight#count} for several ranges,
   * including ranges that extend past or lie entirely outside the indexed values.
   */
  public void testCountWithBkd() throws IOException {
    final String fieldName = "field";
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
    iwc.setIndexSort(new Sort(new SortedNumericSortField(fieldName, SortField.Type.LONG, false)));
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
    for (long value : new long[] {6, 5, 8, 9, 7}) {
      addDocWithBkd(writer, fieldName, value, 500);
    }
    writer.flush();
    writer.forceMerge(1);
    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);

    // Each row is {lower bound, upper bound, expected count}.
    final long[][] cases = {
      {6, 8, 1500},
      {6, 10, 2000},
      {4, 6, 1000},
      {2, 10, 2500},
      {2, 3, 0},
      {10, 11, 0},
    };
    for (long[] rangeCase : cases) {
      final long lower = rangeCase[0];
      final long upper = rangeCase[1];
      final long expectedCount = rangeCase[2];
      Query fallbackQuery = LongPoint.newRangeQuery(fieldName, lower, upper);
      Query query =
          new IndexSortSortedNumericDocValuesRangeQuery(fieldName, lower, upper, fallbackQuery);
      Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
      for (LeafReaderContext context : searcher.getLeafContexts()) {
        assertEquals(expectedCount, weight.count(context));
      }
    }

    writer.close();
    reader.close();
    dir.close();
  }
  /**
   * Indexes 100 batches of documents, each with a random value below 1000 repeated a random number
   * of times, then checks for 100 random ranges that the per-leaf count of the index-sort query
   * equals the count reported by the plain {@link LongPoint} range query.
   */
  public void testRandomCountWithBkd() throws IOException {
    final String fieldName = "field";
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
    iwc.setIndexSort(new Sort(new SortedNumericSortField(fieldName, SortField.Type.LONG, false)));
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
    Random random = random();
    for (int batch = 0; batch < 100; batch++) {
      // Draw the value before the repeat count to keep the order of random draws.
      final int value = random.nextInt(1000);
      final int repeat = random.nextInt(1000);
      addDocWithBkd(writer, fieldName, value, repeat);
    }
    writer.flush();
    writer.forceMerge(1);
    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);

    for (int round = 0; round < 100; round++) {
      final int first = random.nextInt(1100);
      final int second = random.nextInt(1100);
      final int low = Math.min(first, second);
      final int upper = Math.max(first, second);
      Query rangeQuery = LongPoint.newRangeQuery(fieldName, low, upper);
      Query indexSortRangeQuery =
          new IndexSortSortedNumericDocValuesRangeQuery(fieldName, low, upper, rangeQuery);
      Weight indexSortRangeQueryWeight =
          indexSortRangeQuery.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
      Weight rangeQueryWeight = rangeQuery.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
      for (LeafReaderContext context : searcher.getLeafContexts()) {
        assertEquals(rangeQueryWeight.count(context), indexSortRangeQueryWeight.count(context));
      }
    }

    writer.close();
    reader.close();
    dir.close();
  }
  /**
   * Adds {@code repeat} documents to {@code indexWriter}, each carrying {@code value} under {@code
   * field} both as a {@link SortedNumericDocValuesField} and as a {@link LongPoint}.
   */
  private void addDocWithBkd(RandomIndexWriter indexWriter, String field, long value, int repeat)
      throws IOException {
    int remaining = repeat;
    while (remaining > 0) {
      // A fresh Document is built for every added doc.
      final Document document = new Document();
      document.add(new SortedNumericDocValuesField(field, value));
      document.add(new LongPoint(field, value));
      indexWriter.addDocument(document);
      remaining--;
    }
  }
 }