Optimize outputs accumulating for SegmentTermsEnum and IntersectTermsEnum (#12699)

2023-11-28 13:04:41 +08:00 · 2023-11-28 13:04:41 +08:00 · d92efa3b5c
parent a6d788e113
commit d92efa3b5c
4 changed files with 131 additions and 78 deletions
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnum.java
@ -30,9 +30,7 @@ import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.RunAutomaton;
 import org.apache.lucene.util.automaton.Transition;
-import org.apache.lucene.util.fst.ByteSequenceOutputs;
 import org.apache.lucene.util.fst.FST;
-import org.apache.lucene.util.fst.Outputs;

 /**
 * This is used to implement efficient {@link Terms#intersect} for block-tree. Note that it cannot
@ -46,7 +44,6 @@ final class IntersectTermsEnum extends BaseTermsEnum {
  // static boolean DEBUG = BlockTreeTermsWriter.DEBUG;

  final IndexInput in;
-  static final Outputs<BytesRef> fstOutputs = ByteSequenceOutputs.getSingleton();

  IntersectTermsEnumFrame[] stack;

@ -68,6 +65,9 @@ final class IntersectTermsEnum extends BaseTermsEnum {

  private BytesRef savedStartTerm;

+  private final SegmentTermsEnum.OutputAccumulator outputAccumulator =
+      new SegmentTermsEnum.OutputAccumulator();
+
  // TODO: in some cases we can filter by length?  eg
  // regexp foo*bar must be at least length 6 bytes
  public IntersectTermsEnum(
@ -114,7 +114,6 @@ final class IntersectTermsEnum extends BaseTermsEnum {
    f.prefix = 0;
    f.setState(0);
    f.arc = arc;
-    f.outputPrefix = arc.output();
    f.load(fr.rootCode);

    // for assert:
@ -184,7 +183,9 @@ final class IntersectTermsEnum extends BaseTermsEnum {
    FST.Arc<BytesRef> arc = currentFrame.arc;
    int idx = currentFrame.prefix;
    assert currentFrame.suffix > 0;
-    BytesRef output = currentFrame.outputPrefix;
+
+    outputAccumulator.reset();
+    outputAccumulator.push(arc.output());
    while (idx < f.prefix) {
      final int target = term.bytes[idx] & 0xff;
      // TODO: we could be more efficient for the next()
@ -192,14 +193,14 @@ final class IntersectTermsEnum extends BaseTermsEnum {
      // passed to findTargetArc
      arc = fr.index.findTargetArc(target, arc, getArc(1 + idx), fstReader);
      assert arc != null;
-      output = fstOutputs.add(output, arc.output());
+      outputAccumulator.push(arc.output());
      idx++;
    }

    f.arc = arc;
-    f.outputPrefix = output;
    assert arc.isFinal();
-    f.load(fstOutputs.add(output, arc.nextFinalOutput()));
+    outputAccumulator.push(arc.nextFinalOutput());
+    f.load(outputAccumulator);
    return f;
  }

--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java
@ -55,7 +55,6 @@ final class IntersectTermsEnumFrame {
  int statsSingletonRunLength = 0;
  final ByteArrayDataInput statsReader = new ByteArrayDataInput();

-  byte[] floorData = new byte[32];
  final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();

  // Length of prefix shared by all terms in this block
@ -90,9 +89,6 @@ final class IntersectTermsEnumFrame {

  final ByteArrayDataInput bytesReader = new ByteArrayDataInput();

-  // Cumulative output so far
-  BytesRef outputPrefix;
-
  int startBytePos;
  int suffix;

@ -120,7 +116,7 @@ final class IntersectTermsEnumFrame {
      }
    } while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min);

-    load(null);
+    load((Long) null);
  }

  public void setState(int state) {
@ -142,12 +138,22 @@ final class IntersectTermsEnumFrame {
  }

  void load(BytesRef frameIndexData) throws IOException {
-    if (frameIndexData != null) {
-      floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
-      // Skip first long -- has redundant fp, hasTerms
-      // flag, isFloor flag
-      final long code = ite.fr.readVLongOutput(floorDataReader);
-      if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
+    floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
+    load(ite.fr.readVLongOutput(floorDataReader));
+  }
+
+  void load(SegmentTermsEnum.OutputAccumulator outputAccumulator) throws IOException {
+    outputAccumulator.prepareRead();
+    long code = ite.fr.readVLongOutput(outputAccumulator);
+    outputAccumulator.setFloorData(floorDataReader);
+    load(code);
+  }
+
+  void load(Long blockCode) throws IOException {
+    if (blockCode != null) {
+      // This block is the first one in a possible sequence of floor blocks corresponding to a
+      // single seek point from the FST terms index
+      if ((blockCode & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
        // Floor frame
        numFollowFloorBlocks = floorDataReader.readVInt();
        nextFloorLabel = floorDataReader.readByte() & 0xff;
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java
@ -24,6 +24,7 @@ import org.apache.lucene.index.ImpactsEnum;
 import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.TermState;
 import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
@ -48,7 +49,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {

  // static boolean DEBUG = BlockTreeTermsWriter.DEBUG;

-  private final ByteArrayDataInput scratchReader = new ByteArrayDataInput();
+  private final OutputAccumulator outputAccumulator = new OutputAccumulator();

  // What prefix of the current term was present in the index; when we only next() through the
  // index, this stays at 0.  It's only set when
@ -232,18 +233,24 @@ final class SegmentTermsEnum extends BaseTermsEnum {
    return arcs[ord];
  }

-  // Pushes a frame we seek'd to
  SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length)
      throws IOException {
-    scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
-    final long code = fr.readVLongOutput(scratchReader);
+    outputAccumulator.reset();
+    outputAccumulator.push(frameData);
+    return pushFrame(arc, length);
+  }
+
+  // Pushes a frame we seek'd to
+  SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, int length) throws IOException {
+    outputAccumulator.prepareRead();
+    final long code = fr.readVLongOutput(outputAccumulator);
    final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
    final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord);
    f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0;
    f.hasTermsOrig = f.hasTerms;
    f.isFloor = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0;
    if (f.isFloor) {
-      f.setFloorData(scratchReader, frameData);
+      f.setFloorData(outputAccumulator);
    }
    pushFrame(arc, fpSeek, length);

@ -344,9 +351,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {

    FST.Arc<BytesRef> arc;
    int targetUpto;
-    BytesRef output;

    targetBeforeCurrentLength = currentFrame.ord;
+    outputAccumulator.reset();

    if (currentFrame != staticFrame) {

@ -363,7 +370,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {

      arc = arcs[0];
      assert arc.isFinal();
-      output = arc.output();
+      outputAccumulator.push(arc.output());
      targetUpto = 0;

      SegmentTermsEnumFrame lastFrame = stack[0];
@ -373,9 +380,6 @@ final class SegmentTermsEnum extends BaseTermsEnum {

      int cmp = 0;

-      // TODO: reverse vLong byte order for better FST
-      // prefix output sharing
-
      // First compare up to valid seek frames:
      while (targetUpto < targetLimit) {
        cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
@ -394,9 +398,8 @@ final class SegmentTermsEnum extends BaseTermsEnum {
                + (char) arc.label()
                + " targetLabel="
                + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
-        if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
-          output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
-        }
+        outputAccumulator.push(arc.output());
+
        if (arc.isFinal()) {
          lastFrame = stack[1 + lastFrame.ord];
        }
@ -484,15 +487,15 @@ final class SegmentTermsEnum extends BaseTermsEnum {
      //   System.out.println("    no seek state; push root frame");
      // }

-      output = arc.output();
+      outputAccumulator.push(arc.output());

      currentFrame = staticFrame;

      // term.length = 0;
      targetUpto = 0;
-      currentFrame =
-          pushFrame(
-              arc, Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0);
+      outputAccumulator.push(arc.nextFinalOutput());
+      currentFrame = pushFrame(arc, 0);
+      outputAccumulator.pop();
    }

    // if (DEBUG) {
@ -554,9 +557,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
        term.setByteAt(targetUpto, (byte) targetLabel);
        // Aggregate output as we go:
        assert arc.output() != null;
-        if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
-          output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
-        }
+        outputAccumulator.push(arc.output());

        // if (DEBUG) {
        //   System.out.println("    index: follow label=" + toHex(target.bytes[target.offset +
@ -566,11 +567,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {

        if (arc.isFinal()) {
          // if (DEBUG) System.out.println("    arc is final!");
-          currentFrame =
-              pushFrame(
-                  arc,
-                  Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()),
-                  targetUpto);
+          outputAccumulator.push(arc.nextFinalOutput());
+          currentFrame = pushFrame(arc, targetUpto);
+          outputAccumulator.pop();
          // if (DEBUG) System.out.println("    curFrame.ord=" + currentFrame.ord + " hasTerms=" +
          // currentFrame.hasTerms);
        }
@ -630,9 +629,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {

    FST.Arc<BytesRef> arc;
    int targetUpto;
-    BytesRef output;

    targetBeforeCurrentLength = currentFrame.ord;
+    outputAccumulator.reset();

    if (currentFrame != staticFrame) {

@ -649,7 +648,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {

      arc = arcs[0];
      assert arc.isFinal();
-      output = arc.output();
+      outputAccumulator.push(arc.output());
      targetUpto = 0;

      SegmentTermsEnumFrame lastFrame = stack[0];
@ -659,9 +658,6 @@ final class SegmentTermsEnum extends BaseTermsEnum {

      int cmp = 0;

-      // TODO: we should write our vLong backwards (MSB
-      // first) to get better sharing from the FST
-
      // First compare up to valid seek frames:
      while (targetUpto < targetLimit) {
        cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
@ -680,14 +676,8 @@ final class SegmentTermsEnum extends BaseTermsEnum {
                + (char) arc.label()
                + " targetLabel="
                + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
-        // TODO: we could save the outputs in local
-        // byte[][] instead of making new objs ever
-        // seek; but, often the FST doesn't have any
-        // shared bytes (but this could change if we
-        // reverse vLong byte order)
-        if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
-          output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
-        }
+
+        outputAccumulator.push(arc.output());
        if (arc.isFinal()) {
          lastFrame = stack[1 + lastFrame.ord];
        }
@ -769,15 +759,15 @@ final class SegmentTermsEnum extends BaseTermsEnum {
      // System.out.println("    no seek state; push root frame");
      // }

-      output = arc.output();
+      outputAccumulator.push(arc.output());

      currentFrame = staticFrame;

      // term.length = 0;
      targetUpto = 0;
-      currentFrame =
-          pushFrame(
-              arc, Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0);
+      outputAccumulator.push(arc.nextFinalOutput());
+      currentFrame = pushFrame(arc, 0);
+      outputAccumulator.pop();
    }

    // if (DEBUG) {
@ -839,9 +829,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
        arc = nextArc;
        // Aggregate output as we go:
        assert arc.output() != null;
-        if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
-          output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
-        }
+        outputAccumulator.push(arc.output());

        // if (DEBUG) {
        // System.out.println("    index: follow label=" + (target.bytes[target.offset +
@ -851,11 +839,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {

        if (arc.isFinal()) {
          // if (DEBUG) System.out.println("    arc is final!");
-          currentFrame =
-              pushFrame(
-                  arc,
-                  Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()),
-                  targetUpto);
+          outputAccumulator.push(arc.nextFinalOutput());
+          currentFrame = pushFrame(arc, targetUpto);
+          outputAccumulator.pop();
          // if (DEBUG) System.out.println("    curFrame.ord=" + currentFrame.ord + " hasTerms=" +
          // currentFrame.hasTerms);
        }
@ -1190,4 +1176,68 @@ final class SegmentTermsEnum extends BaseTermsEnum {
  public long ord() {
    throw new UnsupportedOperationException();
  }
+
+  /**
+   * Collects the FST outputs gathered while walking the terms index and exposes them as one
+   * continuous {@link DataInput}, so the per-arc {@link BytesRef}s never need to be concatenated.
+   *
+   * <p>Usage is stack-like: {@link #reset()} to start over, {@link #push(BytesRef)} for every
+   * output along the path, {@link #prepareRead()} before decoding, and {@link #pop()} to undo the
+   * most recent push. Only {@link #readByte()} is supported for reading.
+   */
+  static class OutputAccumulator extends DataInput {
+
+    /** Stack of the non-empty outputs pushed so far; only the first {@link #num} slots are live. */
+    BytesRef[] outputs = new BytesRef[16];
+
+    /**
+     * {@code ignoredPushes[i]} counts the {@code NO_OUTPUT} pushes that were made while exactly
+     * {@code i} real outputs were stacked and that have not been popped yet. Without it {@link
+     * #pop()} could not tell an ignored push from a recorded one and would drop a real output.
+     */
+    private int[] ignoredPushes = new int[17];
+
+    /** Output currently being consumed by {@link #readByte()}. */
+    BytesRef current;
+
+    /** Number of live entries in {@link #outputs}. */
+    int num;
+
+    /** Position of {@link #current} inside {@link #outputs}. */
+    int outputIndex;
+
+    /** Read position inside {@link #current}, relative to its offset. */
+    int index;
+
+    /**
+     * Pushes one output. {@code NO_OUTPUT} carries no bytes and is not stored, but the push is still
+     * remembered so that a following {@link #pop()} stays balanced.
+     */
+    void push(BytesRef output) {
+      if (output != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
+        outputs = ArrayUtil.grow(outputs, num + 1);
+        outputs[num++] = output;
+        // New stack level: its counter slot may still hold a value from an earlier, deeper walk.
+        ignoredPushes = ArrayUtil.grow(ignoredPushes, num + 1);
+        ignoredPushes[num] = 0;
+      } else {
+        ignoredPushes[num]++;
+      }
+    }
+
+    /**
+     * Undoes the most recent {@link #push(BytesRef)}. If that push was an ignored {@code NO_OUTPUT}
+     * the stored outputs are left untouched; otherwise the top output is removed.
+     */
+    void pop() {
+      if (ignoredPushes[num] > 0) {
+        ignoredPushes[num]--;
+      } else {
+        assert num > 0;
+        num--;
+      }
+    }
+
+    /** Discards everything pushed so far. */
+    void reset() {
+      num = 0;
+      ignoredPushes[0] = 0;
+    }
+
+    /** Moves the read position back to the first byte of the first pushed output. */
+    void prepareRead() {
+      assert num > 0 : "nothing was pushed, there is no output to read";
+      index = 0;
+      outputIndex = 0;
+      current = outputs[0];
+    }
+
+    /**
+     * Set the last arc as the source of the floorData. This won't change the reading position of
+     * this {@link OutputAccumulator}
+     *
+     * <p>The given reader is pointed at the bytes of the current output that have not been read
+     * yet; the backing array is shared, not copied.
+     */
+    void setFloorData(ByteArrayDataInput floorData) {
+      assert outputIndex == num - 1
+          : "floor data should be stored in last arc, get outputIndex: "
+              + outputIndex
+              + ", num: "
+              + num;
+      BytesRef output = outputs[outputIndex];
+      floorData.reset(output.bytes, output.offset + index, output.length - index);
+    }
+
+    /** Returns the next byte, moving on to the following output once the current one is used up. */
+    @Override
+    public byte readByte() throws IOException {
+      while (index >= current.length) {
+        assert outputIndex + 1 < num : "read past the end of the accumulated outputs";
+        current = outputs[++outputIndex];
+        index = 0;
+      }
+      return current.bytes[current.offset + index++];
+    }
+
+    /** Not supported; callers only decode byte by byte. */
+    @Override
+    public void readBytes(byte[] b, int offset, int len) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    /** Not supported; callers only decode byte by byte. */
+    @Override
+    public void skipBytes(long numBytes) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+  }
 }
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java
@ -55,7 +55,7 @@ final class SegmentTermsEnumFrame {
  int statsSingletonRunLength = 0;
  final ByteArrayDataInput statsReader = new ByteArrayDataInput();

-  byte[] floorData = new byte[32];
+  int rewindPos;
  final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();

  // Length of prefix shared by all terms in this block
@ -104,13 +104,9 @@ final class SegmentTermsEnumFrame {
    suffixLengthsReader = new ByteArrayDataInput();
  }

-  public void setFloorData(ByteArrayDataInput in, BytesRef source) {
-    final int numBytes = source.length - (in.getPosition() - source.offset);
-    if (numBytes > floorData.length) {
-      floorData = new byte[ArrayUtil.oversize(numBytes, 1)];
-    }
-    System.arraycopy(source.bytes, source.offset + in.getPosition(), floorData, 0, numBytes);
-    floorDataReader.reset(floorData, 0, numBytes);
+  public void setFloorData(SegmentTermsEnum.OutputAccumulator outputAccumulator) {
+    outputAccumulator.setFloorData(floorDataReader);
+    rewindPos = floorDataReader.getPosition();
    numFollowFloorBlocks = floorDataReader.readVInt();
    nextFloorLabel = floorDataReader.readByte() & 0xff;
    // if (DEBUG) {
@ -247,7 +243,7 @@ final class SegmentTermsEnumFrame {
    nextEnt = -1;
    hasTerms = hasTermsOrig;
    if (isFloor) {
-      floorDataReader.rewind();
+      floorDataReader.setPosition(rewindPos);
      numFollowFloorBlocks = floorDataReader.readVInt();
      assert numFollowFloorBlocks > 0;
      nextFloorLabel = floorDataReader.readByte() & 0xff;