Change BP reordering logic to help support document blocks later on. (#13123)

The current logic for reordering splits a slice of doc IDs into a left side and
a right side, and for each document it computes the expected gain of moving to
the other side. Then it swaps documents from both sides as long as the sum of
the gain of moving the left doc to the right and the right doc to the left is
positive.

This works well, but I would like to extend BP reordering to also work with
blocks, and the swapping logic is challenging to modify as two parent documents
may have different numbers of children.

One of the follow-up papers on BP suggested using a different logic, where one
would compute a bias for all documents that is negative when a document is
attracted to the left and positive otherwise. Then we only have to partition doc
IDs around the midpoint, e.g. with quickselect.

A benefit of this change is that it will make it easier to generalize BP
reordering to indexes that have blocks, e.g. by using a stable sort on biases.
This commit is contained in:
Adrien Grand 2024-02-23 09:09:19 +01:00 committed by GitHub
parent 17cbedccfc
commit 61f322905a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 84 additions and 103 deletions

View File

@ -49,7 +49,7 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CloseableThreadLocal; import org.apache.lucene.util.CloseableThreadLocal;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntroSorter; import org.apache.lucene.util.IntroSelector;
import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PackedInts;
@ -251,17 +251,17 @@ public final class BPIndexReorderer {
private class IndexReorderingTask extends BaseRecursiveAction { private class IndexReorderingTask extends BaseRecursiveAction {
private final IntsRef docIDs; private final IntsRef docIDs;
private final float[] gains; private final float[] biases;
private final CloseableThreadLocal<PerThreadState> threadLocal; private final CloseableThreadLocal<PerThreadState> threadLocal;
IndexReorderingTask( IndexReorderingTask(
IntsRef docIDs, IntsRef docIDs,
float[] gains, float[] biases,
CloseableThreadLocal<PerThreadState> threadLocal, CloseableThreadLocal<PerThreadState> threadLocal,
int depth) { int depth) {
super(depth); super(depth);
this.docIDs = docIDs; this.docIDs = docIDs;
this.gains = gains; this.biases = biases;
this.threadLocal = threadLocal; this.threadLocal = threadLocal;
} }
@ -293,14 +293,14 @@ public final class BPIndexReorderer {
assert sorted(docIDs); assert sorted(docIDs);
} }
int leftSize = docIDs.length / 2; int halfLength = docIDs.length / 2;
if (leftSize < minPartitionSize) { if (halfLength < minPartitionSize) {
return; return;
} }
int rightSize = docIDs.length - leftSize; IntsRef left = new IntsRef(docIDs.ints, docIDs.offset, halfLength);
IntsRef left = new IntsRef(docIDs.ints, docIDs.offset, leftSize); IntsRef right =
IntsRef right = new IntsRef(docIDs.ints, docIDs.offset + leftSize, rightSize); new IntsRef(docIDs.ints, docIDs.offset + halfLength, docIDs.length - halfLength);
PerThreadState state = threadLocal.get(); PerThreadState state = threadLocal.get();
ForwardIndex forwardIndex = state.forwardIndex; ForwardIndex forwardIndex = state.forwardIndex;
@ -313,7 +313,9 @@ public final class BPIndexReorderer {
for (int iter = 0; iter < maxIters; ++iter) { for (int iter = 0; iter < maxIters; ++iter) {
boolean moved; boolean moved;
try { try {
moved = shuffle(forwardIndex, left, right, leftDocFreqs, rightDocFreqs, gains, iter); moved =
shuffle(
forwardIndex, docIDs, right.offset, leftDocFreqs, rightDocFreqs, biases, iter);
} catch (IOException e) { } catch (IOException e) {
throw new UncheckedIOException(e); throw new UncheckedIOException(e);
} }
@ -322,10 +324,11 @@ public final class BPIndexReorderer {
} }
} }
// It is fine for all tasks to share the same docs / gains array since they all work on // It is fine for all tasks to share the same docs / biases array since they all work on
// different slices of the array at a given point in time. // different slices of the array at a given point in time.
IndexReorderingTask leftTask = new IndexReorderingTask(left, gains, threadLocal, depth + 1); IndexReorderingTask leftTask = new IndexReorderingTask(left, biases, threadLocal, depth + 1);
IndexReorderingTask rightTask = new IndexReorderingTask(right, gains, threadLocal, depth + 1); IndexReorderingTask rightTask =
new IndexReorderingTask(right, biases, threadLocal, depth + 1);
if (shouldFork(docIDs.length, docIDs.ints.length)) { if (shouldFork(docIDs.length, docIDs.ints.length)) {
invokeAll(leftTask, rightTask); invokeAll(leftTask, rightTask);
@ -341,116 +344,94 @@ public final class BPIndexReorderer {
*/ */
private boolean shuffle( private boolean shuffle(
ForwardIndex forwardIndex, ForwardIndex forwardIndex,
IntsRef left, IntsRef docIDs,
IntsRef right, int midPoint,
int[] leftDocFreqs, int[] leftDocFreqs,
int[] rightDocFreqs, int[] rightDocFreqs,
float[] gains, float[] biases,
int iter) int iter)
throws IOException { throws IOException {
assert left.ints == right.ints;
assert left.offset + left.length == right.offset;
// Computing gains is typically a bottleneck, because each iteration needs to iterate over all // Computing biases is typically a bottleneck, because each iteration needs to iterate over
// postings to recompute gains, and the total number of postings is usually one order of // all postings to recompute biases, and the total number of postings is usually one order of
// magnitude or more larger than the number of docs. So we try to parallelize it. // magnitude or more larger than the number of docs. So we try to parallelize it.
ComputeGainsTask leftGainsTask = new ComputeBiasTask(
new ComputeGainsTask( docIDs.ints,
left.ints, biases,
gains, docIDs.offset,
left.offset, docIDs.offset + docIDs.length,
left.offset + left.length,
leftDocFreqs, leftDocFreqs,
rightDocFreqs, rightDocFreqs,
threadLocal, threadLocal,
depth); depth)
ComputeGainsTask rightGainsTask = .compute();
new ComputeGainsTask(
right.ints, float maxLeftBias = Float.NEGATIVE_INFINITY;
gains, for (int i = docIDs.offset; i < midPoint; ++i) {
right.offset, maxLeftBias = Math.max(maxLeftBias, biases[i]);
right.offset + right.length, }
rightDocFreqs, float minRightBias = Float.POSITIVE_INFINITY;
leftDocFreqs, for (int i = midPoint, end = docIDs.offset + docIDs.length; i < end; ++i) {
threadLocal, minRightBias = Math.min(minRightBias, biases[i]);
depth); }
if (shouldFork(docIDs.length, docIDs.ints.length)) { float gain = maxLeftBias - minRightBias;
invokeAll(leftGainsTask, rightGainsTask); // This uses the simulated annealing proposed by Mackenzie et al in "Tradeoff Options for
} else { // Bipartite Graph Partitioning" by comparing the gain of swapping the doc from the left side
leftGainsTask.compute(); // that is most attracted to the right and the doc from the right side that is most attracted
rightGainsTask.compute(); // to the left against `iter` rather than zero.
if (gain <= iter) {
return false;
} }
class ByDescendingGainSorter extends IntroSorter { new IntroSelector() {
int pivotDoc; int pivotDoc;
float pivotGain; float pivotBias;
@Override @Override
protected void setPivot(int i) { protected void setPivot(int i) {
pivotDoc = left.ints[i]; pivotDoc = docIDs.ints[i];
pivotGain = gains[i]; pivotBias = biases[i];
} }
@Override @Override
protected int comparePivot(int j) { protected int comparePivot(int j) {
// Compare in reverse order to get a descending sort int cmp = Float.compare(pivotBias, biases[j]);
int cmp = Float.compare(gains[j], pivotGain);
if (cmp == 0) { if (cmp == 0) {
// Tie break on the doc ID to preserve doc ID ordering as much as possible // Tie break on the doc ID to preserve doc ID ordering as much as possible
cmp = pivotDoc - left.ints[j]; cmp = pivotDoc - docIDs.ints[j];
} }
return cmp; return cmp;
} }
@Override @Override
protected void swap(int i, int j) { protected void swap(int i, int j) {
int tmpDoc = left.ints[i]; float tmpBias = biases[i];
left.ints[i] = left.ints[j]; biases[i] = biases[j];
left.ints[j] = tmpDoc; biases[j] = tmpBias;
float tmpGain = gains[i]; if (i < midPoint == j < midPoint) {
gains[i] = gains[j]; int tmpDoc = docIDs.ints[i];
gains[j] = tmpGain; docIDs.ints[i] = docIDs.ints[j];
} docIDs.ints[j] = tmpDoc;
} } else {
// If we're swapping docs across the left and right sides, we need to keep doc freqs
Runnable leftSorter = // up-to-date.
() -> new ByDescendingGainSorter().sort(left.offset, left.offset + left.length); int left = Math.min(i, j);
Runnable rightSorter = int right = Math.max(i, j);
() -> new ByDescendingGainSorter().sort(right.offset, right.offset + right.length); try {
swapDocsAndFreqs(docIDs.ints, left, right, forwardIndex, leftDocFreqs, rightDocFreqs);
if (shouldFork(docIDs.length, docIDs.ints.length)) { } catch (IOException e) {
// TODO: run it on more than 2 threads at most throw new UncheckedIOException(e);
invokeAll(adapt(leftSorter), adapt(rightSorter)); }
} else {
leftSorter.run();
rightSorter.run();
}
for (int i = 0; i < left.length; ++i) {
// This uses the simulated annealing proposed by Mackenzie et al in "Tradeoff Options for
// Bipartite Graph Partitioning" by comparing the gain against `iter` rather than zero.
if (gains[left.offset + i] + gains[right.offset + i] <= iter) {
if (i == 0) {
return false;
} }
break;
} }
}.select(docIDs.offset, docIDs.offset + docIDs.length, midPoint);
swap(
left.ints,
left.offset + i,
right.offset + i,
forwardIndex,
leftDocFreqs,
rightDocFreqs);
}
return true; return true;
} }
private static void swap( private static void swapDocsAndFreqs(
int[] docs, int[] docs,
int left, int left,
int right, int right,
@ -492,19 +473,19 @@ public final class BPIndexReorderer {
} }
} }
private class ComputeGainsTask extends BaseRecursiveAction { private class ComputeBiasTask extends BaseRecursiveAction {
private final int[] docs; private final int[] docs;
private final float[] gains; private final float[] biases;
private final int from; private final int from;
private final int to; private final int to;
private final int[] fromDocFreqs; private final int[] fromDocFreqs;
private final int[] toDocFreqs; private final int[] toDocFreqs;
private final CloseableThreadLocal<PerThreadState> threadLocal; private final CloseableThreadLocal<PerThreadState> threadLocal;
ComputeGainsTask( ComputeBiasTask(
int[] docs, int[] docs,
float[] gains, float[] biases,
int from, int from,
int to, int to,
int[] fromDocFreqs, int[] fromDocFreqs,
@ -513,7 +494,7 @@ public final class BPIndexReorderer {
int depth) { int depth) {
super(depth); super(depth);
this.docs = docs; this.docs = docs;
this.gains = gains; this.biases = biases;
this.from = from; this.from = from;
this.to = to; this.to = to;
this.fromDocFreqs = fromDocFreqs; this.fromDocFreqs = fromDocFreqs;
@ -527,15 +508,15 @@ public final class BPIndexReorderer {
if (problemSize > 1 && shouldFork(problemSize, docs.length)) { if (problemSize > 1 && shouldFork(problemSize, docs.length)) {
final int mid = (from + to) >>> 1; final int mid = (from + to) >>> 1;
invokeAll( invokeAll(
new ComputeGainsTask( new ComputeBiasTask(
docs, gains, from, mid, fromDocFreqs, toDocFreqs, threadLocal, depth), docs, biases, from, mid, fromDocFreqs, toDocFreqs, threadLocal, depth),
new ComputeGainsTask( new ComputeBiasTask(
docs, gains, mid, to, fromDocFreqs, toDocFreqs, threadLocal, depth)); docs, biases, mid, to, fromDocFreqs, toDocFreqs, threadLocal, depth));
} else { } else {
ForwardIndex forwardIndex = threadLocal.get().forwardIndex; ForwardIndex forwardIndex = threadLocal.get().forwardIndex;
try { try {
for (int i = from; i < to; ++i) { for (int i = from; i < to; ++i) {
gains[i] = computeGain(docs[i], forwardIndex, fromDocFreqs, toDocFreqs); biases[i] = computeBias(docs[i], forwardIndex, fromDocFreqs, toDocFreqs);
} }
} catch (IOException e) { } catch (IOException e) {
throw new UncheckedIOException(e); throw new UncheckedIOException(e);
@ -547,11 +528,11 @@ public final class BPIndexReorderer {
* Compute a float that is negative when a document is attracted to the left and positive * Compute a float that is negative when a document is attracted to the left and positive
* otherwise. * otherwise.
*/ */
private static float computeGain( private static float computeBias(
int docID, ForwardIndex forwardIndex, int[] fromDocFreqs, int[] toDocFreqs) int docID, ForwardIndex forwardIndex, int[] fromDocFreqs, int[] toDocFreqs)
throws IOException { throws IOException {
forwardIndex.seek(docID); forwardIndex.seek(docID);
double gain = 0; double bias = 0;
for (IntsRef terms = forwardIndex.nextTerms(); for (IntsRef terms = forwardIndex.nextTerms();
terms.length != 0; terms.length != 0;
terms = forwardIndex.nextTerms()) { terms = forwardIndex.nextTerms()) {
@ -561,12 +542,12 @@ public final class BPIndexReorderer {
final int toDocFreq = toDocFreqs[termID]; final int toDocFreq = toDocFreqs[termID];
assert fromDocFreq >= 0; assert fromDocFreq >= 0;
assert toDocFreq >= 0; assert toDocFreq >= 0;
gain += bias +=
(toDocFreq == 0 ? 0 : fastLog2(toDocFreq)) (toDocFreq == 0 ? 0 : fastLog2(toDocFreq))
- (fromDocFreq == 0 ? 0 : fastLog2(fromDocFreq)); - (fromDocFreq == 0 ? 0 : fastLog2(fromDocFreq));
} }
} }
return (float) gain; return (float) bias;
} }
} }
@ -869,7 +850,7 @@ public final class BPIndexReorderer {
} }
private static long docRAMRequirements(int maxDoc) { private static long docRAMRequirements(int maxDoc) {
// We need one int per doc for the doc map, plus one float to store the gain associated with // We need one int per doc for the doc map, plus one float to store the bias associated with
// this doc. // this doc.
return 2L * Integer.BYTES * maxDoc; return 2L * Integer.BYTES * maxDoc;
} }