Merge branch 'apache:main' into bpv21_main

expani1729 2024-10-16 15:17:20 +05:30 committed by GitHub
commit 7312d91394
9 changed files with 96 additions and 64 deletions

View File

@@ -47,8 +47,16 @@ Improvements
Optimizations
---------------------
* GITHUB#13828: Reduce long[] array allocation for bitset in readBitSetIterator. (Zhang Chao)
+ * GITHUB#13800: MaxScoreBulkScorer now recomputes scorer partitions when the
+ minimum competitive score allows for a more favorable partitioning. (Adrien Grand)
+ * GITHUB#13904: BlockMaxConjunctionBulkScorer can now early exit when the
+ leading clause has a single impact block (e.g. ConstantScoreQuery).
+ (Adrien Grand)
Bug Fixes
---------------------
* GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended

View File

@@ -115,9 +115,6 @@ public abstract class BackwardsCompatibilityTestBase extends LuceneTestCase {
*/
protected BackwardsCompatibilityTestBase(
@Name("version") Version version, @Name("pattern") String indexPattern) {
- // TODO: add 10.0.0 bw indices after 10.0.0 has been released, see
- // https://github.com/apache/lucene/issues/13847
- assumeTrue("Can only test with 10.0.0 has been released", version.major < 10);
this.version = version;
this.indexPattern = indexPattern;
}

View File

@@ -832,7 +832,7 @@ public class TestBasicBackwardsCompatibility extends BackwardsCompatibilityTestBase {
expectThrows(IllegalArgumentException.class, () -> TestUtil.addIndexesSlowly(w, reader));
assertEquals(
e.getMessage(),
"Cannot merge a segment that has been created with major version 9 into this index which has been created by major version 10");
"Cannot merge a segment that has been created with major version 10 into this index which has been created by major version 11");
w.close();
targetDir2.close();

View File

@@ -85,9 +85,20 @@ final class BlockMaxConjunctionBulkScorer extends BulkScorer {
int windowMin = Math.max(lead1.docID(), min);
while (windowMin < max) {
- // Use impacts of the least costly scorer to compute windows
- // NOTE: windowMax is inclusive
- int windowMax = Math.min(scorers[0].advanceShallow(windowMin), max - 1);
+ // Use impacts of the least costly scorer to compute windows to keep the per-block overhead
+ // under control.
+ // NOTE: windowMax is inclusive.
+ int windowMax = scorer1.advanceShallow(windowMin);
+ if (windowMax == DocIdSetIterator.NO_MORE_DOCS) {
+ // If the query doesn't have impacts anymore, or has a single block for the whole doc ID
+ // space (e.g. ConstantScoreQuery), then we try to create a block that has ~128 docs of the
+ // leading clause. This gives us higher chances to exit early based on the maximum scores of
+ // other clauses.
+ long windowSize = 128L * maxDoc / Math.max(1, lead1.cost());
+ windowSize = Math.max(windowSize, 128L);
+ windowMax = (int) Math.min(Integer.MAX_VALUE, windowMin + windowSize);
+ }
+ windowMax = Math.min(windowMax, max - 1);
float maxWindowScore = Float.POSITIVE_INFINITY;
if (0 < scorable.minCompetitiveScore) {

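Note (illustrative sketch, not part of the patch; the method name and parameters below are made up): the fallback window added above is sized to span roughly 128 matches of the leading clause. If that clause matches leadCost docs out of maxDoc, its matches are about maxDoc / leadCost doc IDs apart on average, so 128 of them cover about 128 * maxDoc / leadCost doc IDs:

  static int fallbackWindowMax(int windowMin, int maxDoc, long leadCost) {
    // ~128 matches of the leading clause, assuming they are spread evenly over the doc ID space
    long windowSize = 128L * maxDoc / Math.max(1, leadCost);
    // never use a window smaller than 128 doc IDs
    windowSize = Math.max(windowSize, 128L);
    // windowMax is inclusive and must not overflow int
    return (int) Math.min(Integer.MAX_VALUE, windowMin + windowSize);
  }

For example, with maxDoc = 1,000,000 and a leading clause that matches 10,000 docs, the fallback window covers 12,800 doc IDs.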
View File

@@ -40,6 +40,8 @@ final class MaxScoreBulkScorer extends BulkScorer {
// Index of the first scorer that is required, this scorer and all following scorers are required
// for a document to match.
int firstRequiredScorer;
+ // The minimum value of minCompetitiveScore that would produce a more favorable partitioning.
+ float nextMinCompetitiveScore;
private final long cost;
float minCompetitiveScore;
private final Score scorable = new Score();
@@ -114,9 +116,14 @@ final class MaxScoreBulkScorer extends BulkScorer {
while (top.doc < outerWindowMax) {
scoreInnerWindow(collector, acceptDocs, outerWindowMax);
top = essentialQueue.top();
+ if (minCompetitiveScore >= nextMinCompetitiveScore) {
+ // The minimum competitive score increased substantially, so we can now partition scorers
+ // in a more favorable way.
+ break;
+ }
}
- outerWindowMin = outerWindowMax;
+ outerWindowMin = Math.min(top.doc, outerWindowMax);
}
return nextCandidate(max);
@@ -337,6 +344,7 @@ final class MaxScoreBulkScorer extends BulkScorer {
});
double maxScoreSum = 0;
firstEssentialScorer = 0;
+ nextMinCompetitiveScore = Float.POSITIVE_INFINITY;
for (int i = 0; i < allScorers.length; ++i) {
final DisiWrapper w = scratch[i];
double newMaxScoreSum = maxScoreSum + w.maxWindowScore;
@@ -349,6 +357,7 @@ final class MaxScoreBulkScorer extends BulkScorer {
firstEssentialScorer++;
} else {
allScorers[allScorers.length - 1 - (i - firstEssentialScorer)] = w;
+ nextMinCompetitiveScore = Math.min(maxScoreSumFloat, nextMinCompetitiveScore);
}
}

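Note (standalone sketch with simplified names, not the actual partitioning code, which also reorders the scorer array and adjusts for float rounding): nextMinCompetitiveScore records the smallest max-score sum at which an essential clause could become non-essential, so the outer loop above can break out and repartition as soon as minCompetitiveScore reaches it:

  static float nextRepartitionThreshold(float[] maxScoresAscending, float minCompetitiveScore) {
    double nonEssentialSum = 0;
    float next = Float.POSITIVE_INFINITY;
    for (float maxScore : maxScoresAscending) {
      float sumWithClause = (float) (nonEssentialSum + maxScore);
      if (sumWithClause < minCompetitiveScore) {
        nonEssentialSum += maxScore; // clause can stay non-essential
      } else {
        // clause is essential; it could flip to non-essential once minCompetitiveScore reaches this sum
        next = Math.min(next, sumWithClause);
      }
    }
    return next;
  }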
View File

@@ -181,7 +181,7 @@ public abstract class PointInSetQuery extends Query implements Accountable {
@Override
public Scorer get(long leadCost) throws IOException {
DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field);
- values.intersect(new MergePointVisitor(sortedPackedPoints, result));
+ values.intersect(new MergePointVisitor(sortedPackedPoints.iterator(), result));
DocIdSetIterator iterator = result.build().iterator();
return new ConstantScoreScorer(score(), scoreMode, iterator);
}
@@ -192,7 +192,9 @@ public abstract class PointInSetQuery extends Query implements Accountable {
if (cost == -1) {
// Computing the cost may be expensive, so only do it if necessary
DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field);
- cost = values.estimateDocCount(new MergePointVisitor(sortedPackedPoints, result));
+ cost =
+ values.estimateDocCount(
+ new MergePointVisitor(sortedPackedPoints.iterator(), result));
assert cost >= 0;
}
return cost;
@@ -260,18 +262,15 @@ public abstract class PointInSetQuery extends Query implements Accountable {
private class MergePointVisitor implements IntersectVisitor {
private final DocIdSetBuilder result;
- private TermIterator iterator;
+ private final TermIterator iterator;
private BytesRef nextQueryPoint;
private final ByteArrayComparator comparator;
- private final PrefixCodedTerms sortedPackedPoints;
private DocIdSetBuilder.BulkAdder adder;
- public MergePointVisitor(PrefixCodedTerms sortedPackedPoints, DocIdSetBuilder result)
- throws IOException {
+ public MergePointVisitor(TermIterator iterator, DocIdSetBuilder result) throws IOException {
this.result = result;
- this.sortedPackedPoints = sortedPackedPoints;
this.comparator = ArrayUtil.getUnsignedComparator(bytesPerDim);
- this.iterator = this.sortedPackedPoints.iterator();
+ this.iterator = iterator;
nextQueryPoint = iterator.next();
}

View File

@@ -775,10 +775,11 @@ public final class Util {
/** Just takes unsigned byte values from the BytesRef and converts into an IntsRef. */
public static IntsRef toIntsRef(BytesRef input, IntsRefBuilder scratch) {
scratch.clear();
+ scratch.growNoCopy(input.length);
for (int i = 0; i < input.length; i++) {
- scratch.append(input.bytes[i + input.offset] & 0xFF);
+ scratch.setIntAt(i, input.bytes[i + input.offset] & 0xFF);
}
+ scratch.setLength(input.length);
return scratch.get();
}

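Note (illustration only, using plain arrays instead of BytesRef/IntsRefBuilder; the helper below is not from the patch): the loop widens each byte to an unsigned int, and sizing the buffer once up front avoids the grow-and-copy checks that append() performs per element:

  static int[] toUnsignedInts(byte[] input) {
    int[] out = new int[input.length]; // size once, like growNoCopy(input.length)
    for (int i = 0; i < input.length; i++) {
      out[i] = input[i] & 0xFF; // write in place, like setIntAt(i, ...)
    }
    return out; // the builder then does setLength(input.length) and get()
  }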
View File

@@ -530,7 +530,29 @@ abstract class MemorySegmentIndexInput extends IndexInput
@Override
public final MemorySegmentIndexInput clone() {
- final MemorySegmentIndexInput clone = buildSlice((String) null, 0L, this.length);
+ ensureOpen();
+ ensureAccessible();
+ final MemorySegmentIndexInput clone;
+ if (segments.length == 1) {
+ clone =
+ new SingleSegmentImpl(
+ toString(),
+ null, // clones don't have an Arena, as they can't close)
+ segments[0],
+ length,
+ chunkSizePower,
+ confined);
+ } else {
+ clone =
+ new MultiSegmentImpl(
+ toString(),
+ null, // clones don't have an Arena, as they can't close)
+ segments,
+ ((MultiSegmentImpl) this).offset,
+ length,
+ chunkSizePower,
+ confined);
+ }
try {
clone.seek(getFilePointer());
} catch (IOException ioe) {
@@ -570,12 +592,20 @@ abstract class MemorySegmentIndexInput extends IndexInput
if (NATIVE_ACCESS.isPresent() && advice != ReadAdvice.NORMAL) {
// No need to madvise with a normal advice, since it's the OS' default.
final NativeAccess nativeAccess = NATIVE_ACCESS.get();
- slice.advise(
- 0,
- slice.length,
- segment -> {
- nativeAccess.madvise(segment, advice);
- });
+ if (length >= nativeAccess.getPageSize()) {
+ // Only set the read advice if the inner file is large enough. Otherwise the cons are likely
+ // outweighing the pros as we're:
+ // - potentially overriding the advice of other files that share the same pages,
+ // - paying the cost of a madvise system call for little value.
+ // We could align inner files with the page size to avoid the first issue, but again the
+ // pros don't clearly overweigh the cons.
+ slice.advise(
+ 0,
+ slice.length,
+ segment -> {
+ nativeAccess.madvise(segment, advice);
+ });
+ }
}
return slice;
}
@@ -584,26 +614,30 @@ abstract class MemorySegmentIndexInput extends IndexInput
MemorySegmentIndexInput buildSlice(String sliceDescription, long offset, long length) {
ensureOpen();
ensureAccessible();
- final long sliceEnd = offset + length;
- final int startIndex = (int) (offset >>> chunkSizePower);
- final int endIndex = (int) (sliceEnd >>> chunkSizePower);
- // we always allocate one more slice, the last one may be a 0 byte one after truncating with
- // asSlice():
- final MemorySegment slices[] = ArrayUtil.copyOfSubArray(segments, startIndex, endIndex + 1);
- // set the last segment's limit for the sliced view.
- slices[slices.length - 1] = slices[slices.length - 1].asSlice(0L, sliceEnd & chunkSizeMask);
- offset = offset & chunkSizeMask;
+ final MemorySegment[] slices;
+ final boolean isClone = offset == 0 && length == this.length;
+ if (isClone) {
+ slices = segments;
+ } else {
+ final long sliceEnd = offset + length;
+ final int startIndex = (int) (offset >>> chunkSizePower);
+ final int endIndex = (int) (sliceEnd >>> chunkSizePower);
+ // we always allocate one more slice, the last one may be a 0 byte one after truncating with
+ // asSlice():
+ slices = ArrayUtil.copyOfSubArray(segments, startIndex, endIndex + 1);
+ // set the last segment's limit for the sliced view.
+ slices[slices.length - 1] = slices[slices.length - 1].asSlice(0L, sliceEnd & chunkSizeMask);
+ offset = offset & chunkSizeMask;
+ }
final String newResourceDescription = getFullSliceDescription(sliceDescription);
if (slices.length == 1) {
return new SingleSegmentImpl(
newResourceDescription,
null, // clones don't have an Arena, as they can't close)
- slices[0].asSlice(offset, length),
+ isClone ? slices[0] : slices[0].asSlice(offset, length),
length,
chunkSizePower,
confined);

View File

@@ -38,23 +38,6 @@ import org.apache.lucene.util.Bits;
// These basic tests are similar to some of the tests in TestWANDScorer, and may not need to be kept
public class TestMaxScoreBulkScorer extends LuceneTestCase {
- private static class CapMaxScoreWindowAt2048Scorer extends FilterScorer {
- public CapMaxScoreWindowAt2048Scorer(Scorer in) {
- super(in);
- }
- @Override
- public int advanceShallow(int target) throws IOException {
- return Math.min(target | 0x7FF, in.advanceShallow(target));
- }
- @Override
- public float getMaxScore(int upTo) throws IOException {
- return in.getMaxScore(upTo);
- }
- }
private void writeDocuments(Directory dir) throws IOException {
try (IndexWriter w =
new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()))) {
@@ -96,12 +79,10 @@ public class TestMaxScoreBulkScorer extends LuceneTestCase {
searcher
.createWeight(searcher.rewrite(clause1), ScoreMode.TOP_SCORES, 1f)
.scorer(context);
- scorer1 = new CapMaxScoreWindowAt2048Scorer(scorer1);
Scorer scorer2 =
searcher
.createWeight(searcher.rewrite(clause2), ScoreMode.TOP_SCORES, 1f)
.scorer(context);
- scorer2 = new CapMaxScoreWindowAt2048Scorer(scorer2);
BulkScorer scorer =
new MaxScoreBulkScorer(context.reader().maxDoc(), Arrays.asList(scorer1, scorer2));
@@ -168,12 +149,10 @@ public class TestMaxScoreBulkScorer extends LuceneTestCase {
searcher
.createWeight(searcher.rewrite(clause1), ScoreMode.TOP_SCORES, 1f)
.scorer(context);
- scorer1 = new CapMaxScoreWindowAt2048Scorer(scorer1);
Scorer scorer2 =
searcher
.createWeight(searcher.rewrite(clause2), ScoreMode.TOP_SCORES, 1f)
.scorer(context);
- scorer2 = new CapMaxScoreWindowAt2048Scorer(scorer2);
BulkScorer scorer =
new MaxScoreBulkScorer(context.reader().maxDoc(), Arrays.asList(scorer1, scorer2));
@@ -237,17 +216,14 @@ public class TestMaxScoreBulkScorer extends LuceneTestCase {
searcher
.createWeight(searcher.rewrite(clause1), ScoreMode.TOP_SCORES, 1f)
.scorer(context);
- scorer1 = new CapMaxScoreWindowAt2048Scorer(scorer1);
Scorer scorer2 =
searcher
.createWeight(searcher.rewrite(clause2), ScoreMode.TOP_SCORES, 1f)
.scorer(context);
- scorer2 = new CapMaxScoreWindowAt2048Scorer(scorer2);
Scorer scorer3 =
searcher
.createWeight(searcher.rewrite(clause3), ScoreMode.TOP_SCORES, 1f)
.scorer(context);
- scorer3 = new CapMaxScoreWindowAt2048Scorer(scorer3);
BulkScorer scorer =
new MaxScoreBulkScorer(
@@ -317,17 +293,14 @@ public class TestMaxScoreBulkScorer extends LuceneTestCase {
searcher
.createWeight(searcher.rewrite(clause1), ScoreMode.TOP_SCORES, 1f)
.scorer(context);
- scorer1 = new CapMaxScoreWindowAt2048Scorer(scorer1);
Scorer scorer2 =
searcher
.createWeight(searcher.rewrite(clause2), ScoreMode.TOP_SCORES, 1f)
.scorer(context);
- scorer2 = new CapMaxScoreWindowAt2048Scorer(scorer2);
Scorer scorer3 =
searcher
.createWeight(searcher.rewrite(clause3), ScoreMode.TOP_SCORES, 1f)
.scorer(context);
- scorer3 = new CapMaxScoreWindowAt2048Scorer(scorer3);
BulkScorer scorer =
new MaxScoreBulkScorer(