From 9574cbd1f18dc751c98e09cc1cfc3da5ecacaca8 Mon Sep 17 00:00:00 2001 From: gf2121 <52390227+gf2121@users.noreply.github.com> Date: Tue, 28 Nov 2023 13:04:41 +0800 Subject: [PATCH] Optimize outputs accumulating for SegmentTermsEnum and IntersectTermsEnum (#12699) --- .../blocktree/IntersectTermsEnum.java | 17 +- .../blocktree/IntersectTermsEnumFrame.java | 28 ++-- .../lucene90/blocktree/SegmentTermsEnum.java | 150 ++++++++++++------ .../blocktree/SegmentTermsEnumFrame.java | 14 +- 4 files changed, 131 insertions(+), 78 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnum.java index 5773e4a5f0f..9475c0de5f8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnum.java @@ -30,9 +30,7 @@ import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.automaton.ByteRunnable; import org.apache.lucene.util.automaton.Transition; import org.apache.lucene.util.automaton.TransitionAccessor; -import org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.Outputs; /** * This is used to implement efficient {@link Terms#intersect} for block-tree. Note that it cannot @@ -46,7 +44,6 @@ final class IntersectTermsEnum extends BaseTermsEnum { // static boolean DEBUG = BlockTreeTermsWriter.DEBUG; final IndexInput in; - static final Outputs fstOutputs = ByteSequenceOutputs.getSingleton(); IntersectTermsEnumFrame[] stack; @@ -68,6 +65,9 @@ final class IntersectTermsEnum extends BaseTermsEnum { private BytesRef savedStartTerm; + private final SegmentTermsEnum.OutputAccumulator outputAccumulator = + new SegmentTermsEnum.OutputAccumulator(); + // TODO: in some cases we can filter by length? eg // regexp foo*bar must be at least length 6 bytes public IntersectTermsEnum( @@ -114,7 +114,6 @@ final class IntersectTermsEnum extends BaseTermsEnum { f.prefix = 0; f.setState(0); f.arc = arc; - f.outputPrefix = arc.output(); f.load(fr.rootCode); // for assert: @@ -184,7 +183,9 @@ final class IntersectTermsEnum extends BaseTermsEnum { FST.Arc arc = currentFrame.arc; int idx = currentFrame.prefix; assert currentFrame.suffix > 0; - BytesRef output = currentFrame.outputPrefix; + + outputAccumulator.reset(); + outputAccumulator.push(arc.output()); while (idx < f.prefix) { final int target = term.bytes[idx] & 0xff; // TODO: we could be more efficient for the next() @@ -192,14 +193,14 @@ final class IntersectTermsEnum extends BaseTermsEnum { // passed to findTargetArc arc = fr.index.findTargetArc(target, arc, getArc(1 + idx), fstReader); assert arc != null; - output = fstOutputs.add(output, arc.output()); + outputAccumulator.push(arc.output()); idx++; } f.arc = arc; - f.outputPrefix = output; assert arc.isFinal(); - f.load(fstOutputs.add(output, arc.nextFinalOutput())); + outputAccumulator.push(arc.nextFinalOutput()); + f.load(outputAccumulator); return f; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java index d9ca7a9bbd8..2b0e05a0b09 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java @@ -55,7 +55,6 @@ final class IntersectTermsEnumFrame { int statsSingletonRunLength = 0; final ByteArrayDataInput statsReader = new ByteArrayDataInput(); - byte[] floorData = new byte[32]; final ByteArrayDataInput floorDataReader = new ByteArrayDataInput(); // Length of prefix shared by all terms in this block @@ -90,9 +89,6 @@ final class IntersectTermsEnumFrame { final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); - // Cumulative output so far - BytesRef outputPrefix; - int startBytePos; int suffix; @@ -120,7 +116,7 @@ final class IntersectTermsEnumFrame { } } while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min); - load(null); + load((Long) null); } public void setState(int state) { @@ -142,12 +138,22 @@ final class IntersectTermsEnumFrame { } void load(BytesRef frameIndexData) throws IOException { - if (frameIndexData != null) { - floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length); - // Skip first long -- has redundant fp, hasTerms - // flag, isFloor flag - final long code = ite.fr.readVLongOutput(floorDataReader); - if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) { + floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length); + load(ite.fr.readVLongOutput(floorDataReader)); + } + + void load(SegmentTermsEnum.OutputAccumulator outputAccumulator) throws IOException { + outputAccumulator.prepareRead(); + long code = ite.fr.readVLongOutput(outputAccumulator); + outputAccumulator.setFloorData(floorDataReader); + load(code); + } + + void load(Long blockCode) throws IOException { + if (blockCode != null) { + // This block is the first one in a possible sequence of floor blocks corresponding to a + // single seek point from the FST terms index + if ((blockCode & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) { // Floor frame numFollowFloorBlocks = floorDataReader.readVInt(); nextFloorLabel = floorDataReader.readByte() & 0xff; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java index cb5577d8d6c..30a4529c5da 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java @@ -24,6 +24,7 @@ import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.TermState; import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.DataInput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; @@ -48,7 +49,7 @@ final class SegmentTermsEnum extends BaseTermsEnum { // static boolean DEBUG = BlockTreeTermsWriter.DEBUG; - private final ByteArrayDataInput scratchReader = new ByteArrayDataInput(); + private final OutputAccumulator outputAccumulator = new OutputAccumulator(); // What prefix of the current term was present in the index; when we only next() through the // index, this stays at 0. It's only set when @@ -232,18 +233,24 @@ final class SegmentTermsEnum extends BaseTermsEnum { return arcs[ord]; } - // Pushes a frame we seek'd to SegmentTermsEnumFrame pushFrame(FST.Arc arc, BytesRef frameData, int length) throws IOException { - scratchReader.reset(frameData.bytes, frameData.offset, frameData.length); - final long code = fr.readVLongOutput(scratchReader); + outputAccumulator.reset(); + outputAccumulator.push(frameData); + return pushFrame(arc, length); + } + + // Pushes a frame we seek'd to + SegmentTermsEnumFrame pushFrame(FST.Arc arc, int length) throws IOException { + outputAccumulator.prepareRead(); + final long code = fr.readVLongOutput(outputAccumulator); final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS; final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord); f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0; f.hasTermsOrig = f.hasTerms; f.isFloor = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0; if (f.isFloor) { - f.setFloorData(scratchReader, frameData); + f.setFloorData(outputAccumulator); } pushFrame(arc, fpSeek, length); @@ -344,9 +351,9 @@ final class SegmentTermsEnum extends BaseTermsEnum { FST.Arc arc; int targetUpto; - BytesRef output; targetBeforeCurrentLength = currentFrame.ord; + outputAccumulator.reset(); if (currentFrame != staticFrame) { @@ -363,7 +370,7 @@ final class SegmentTermsEnum extends BaseTermsEnum { arc = arcs[0]; assert arc.isFinal(); - output = arc.output(); + outputAccumulator.push(arc.output()); targetUpto = 0; SegmentTermsEnumFrame lastFrame = stack[0]; @@ -373,9 +380,6 @@ final class SegmentTermsEnum extends BaseTermsEnum { int cmp = 0; - // TODO: reverse vLong byte order for better FST - // prefix output sharing - // First compare up to valid seek frames: while (targetUpto < targetLimit) { cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF); @@ -394,9 +398,8 @@ final class SegmentTermsEnum extends BaseTermsEnum { + (char) arc.label() + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF); - if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) { - output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output()); - } + outputAccumulator.push(arc.output()); + if (arc.isFinal()) { lastFrame = stack[1 + lastFrame.ord]; } @@ -484,15 +487,15 @@ final class SegmentTermsEnum extends BaseTermsEnum { // System.out.println(" no seek state; push root frame"); // } - output = arc.output(); + outputAccumulator.push(arc.output()); currentFrame = staticFrame; // term.length = 0; targetUpto = 0; - currentFrame = - pushFrame( - arc, Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0); + outputAccumulator.push(arc.nextFinalOutput()); + currentFrame = pushFrame(arc, 0); + outputAccumulator.pop(); } // if (DEBUG) { @@ -554,9 +557,7 @@ final class SegmentTermsEnum extends BaseTermsEnum { term.setByteAt(targetUpto, (byte) targetLabel); // Aggregate output as we go: assert arc.output() != null; - if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) { - output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output()); - } + outputAccumulator.push(arc.output()); // if (DEBUG) { // System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + @@ -566,11 +567,9 @@ final class SegmentTermsEnum extends BaseTermsEnum { if (arc.isFinal()) { // if (DEBUG) System.out.println(" arc is final!"); - currentFrame = - pushFrame( - arc, - Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), - targetUpto); + outputAccumulator.push(arc.nextFinalOutput()); + currentFrame = pushFrame(arc, targetUpto); + outputAccumulator.pop(); // if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + // currentFrame.hasTerms); } @@ -630,9 +629,9 @@ final class SegmentTermsEnum extends BaseTermsEnum { FST.Arc arc; int targetUpto; - BytesRef output; targetBeforeCurrentLength = currentFrame.ord; + outputAccumulator.reset(); if (currentFrame != staticFrame) { @@ -649,7 +648,7 @@ final class SegmentTermsEnum extends BaseTermsEnum { arc = arcs[0]; assert arc.isFinal(); - output = arc.output(); + outputAccumulator.push(arc.output()); targetUpto = 0; SegmentTermsEnumFrame lastFrame = stack[0]; @@ -659,9 +658,6 @@ final class SegmentTermsEnum extends BaseTermsEnum { int cmp = 0; - // TODO: we should write our vLong backwards (MSB - // first) to get better sharing from the FST - // First compare up to valid seek frames: while (targetUpto < targetLimit) { cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF); @@ -680,14 +676,8 @@ final class SegmentTermsEnum extends BaseTermsEnum { + (char) arc.label() + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF); - // TODO: we could save the outputs in local - // byte[][] instead of making new objs ever - // seek; but, often the FST doesn't have any - // shared bytes (but this could change if we - // reverse vLong byte order) - if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) { - output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output()); - } + + outputAccumulator.push(arc.output()); if (arc.isFinal()) { lastFrame = stack[1 + lastFrame.ord]; } @@ -769,15 +759,15 @@ final class SegmentTermsEnum extends BaseTermsEnum { // System.out.println(" no seek state; push root frame"); // } - output = arc.output(); + outputAccumulator.push(arc.output()); currentFrame = staticFrame; // term.length = 0; targetUpto = 0; - currentFrame = - pushFrame( - arc, Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0); + outputAccumulator.push(arc.nextFinalOutput()); + currentFrame = pushFrame(arc, 0); + outputAccumulator.pop(); } // if (DEBUG) { @@ -839,9 +829,7 @@ final class SegmentTermsEnum extends BaseTermsEnum { arc = nextArc; // Aggregate output as we go: assert arc.output() != null; - if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) { - output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output()); - } + outputAccumulator.push(arc.output()); // if (DEBUG) { // System.out.println(" index: follow label=" + (target.bytes[target.offset + @@ -851,11 +839,9 @@ final class SegmentTermsEnum extends BaseTermsEnum { if (arc.isFinal()) { // if (DEBUG) System.out.println(" arc is final!"); - currentFrame = - pushFrame( - arc, - Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), - targetUpto); + outputAccumulator.push(arc.nextFinalOutput()); + currentFrame = pushFrame(arc, targetUpto); + outputAccumulator.pop(); // if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + // currentFrame.hasTerms); } @@ -1190,4 +1176,68 @@ final class SegmentTermsEnum extends BaseTermsEnum { public long ord() { throw new UnsupportedOperationException(); } + + static class OutputAccumulator extends DataInput { + + BytesRef[] outputs = new BytesRef[16]; + BytesRef current; + int num; + int outputIndex; + int index; + + void push(BytesRef output) { + if (output != Lucene90BlockTreeTermsReader.NO_OUTPUT) { + outputs = ArrayUtil.grow(outputs, num + 1); + outputs[num++] = output; + } + } + + void pop() { + assert num > 0; + num--; + } + + void reset() { + num = 0; + } + + void prepareRead() { + index = 0; + outputIndex = 0; + current = outputs[0]; + } + + /** + * Set the last arc as the source of the floorData. This won't change the reading position of + * this {@link OutputAccumulator} + */ + void setFloorData(ByteArrayDataInput floorData) { + assert outputIndex == num - 1 + : "floor data should be stored in last arc, get outputIndex: " + + outputIndex + + ", num: " + + num; + BytesRef output = outputs[outputIndex]; + floorData.reset(output.bytes, output.offset + index, output.length - index); + } + + @Override + public byte readByte() throws IOException { + if (index >= current.length) { + current = outputs[++outputIndex]; + index = 0; + } + return current.bytes[current.offset + index++]; + } + + @Override + public void readBytes(byte[] b, int offset, int len) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void skipBytes(long numBytes) throws IOException { + throw new UnsupportedOperationException(); + } + } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java index 48c4fd0a6d4..4016b5c784d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java @@ -55,7 +55,7 @@ final class SegmentTermsEnumFrame { int statsSingletonRunLength = 0; final ByteArrayDataInput statsReader = new ByteArrayDataInput(); - byte[] floorData = new byte[32]; + int rewindPos; final ByteArrayDataInput floorDataReader = new ByteArrayDataInput(); // Length of prefix shared by all terms in this block @@ -104,13 +104,9 @@ final class SegmentTermsEnumFrame { suffixLengthsReader = new ByteArrayDataInput(); } - public void setFloorData(ByteArrayDataInput in, BytesRef source) { - final int numBytes = source.length - (in.getPosition() - source.offset); - if (numBytes > floorData.length) { - floorData = new byte[ArrayUtil.oversize(numBytes, 1)]; - } - System.arraycopy(source.bytes, source.offset + in.getPosition(), floorData, 0, numBytes); - floorDataReader.reset(floorData, 0, numBytes); + public void setFloorData(SegmentTermsEnum.OutputAccumulator outputAccumulator) { + outputAccumulator.setFloorData(floorDataReader); + rewindPos = floorDataReader.getPosition(); numFollowFloorBlocks = floorDataReader.readVInt(); nextFloorLabel = floorDataReader.readByte() & 0xff; // if (DEBUG) { @@ -247,7 +243,7 @@ final class SegmentTermsEnumFrame { nextEnt = -1; hasTerms = hasTermsOrig; if (isFloor) { - floorDataReader.rewind(); + floorDataReader.setPosition(rewindPos); numFollowFloorBlocks = floorDataReader.readVInt(); assert numFollowFloorBlocks > 0; nextFloorLabel = floorDataReader.readByte() & 0xff;