Optimize outputs accumulating for SegmentTermsEnum and IntersectTermsEnum (#12699)

This commit is contained in:
gf2121 2023-11-28 13:04:41 +08:00 committed by GitHub
parent 38ca8d3e42
commit 9574cbd1f1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 131 additions and 78 deletions

View File

@ -30,9 +30,7 @@ import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.automaton.ByteRunnable;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.automaton.TransitionAccessor;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
/**
* This is used to implement efficient {@link Terms#intersect} for block-tree. Note that it cannot
@ -46,7 +44,6 @@ final class IntersectTermsEnum extends BaseTermsEnum {
// static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
final IndexInput in;
static final Outputs<BytesRef> fstOutputs = ByteSequenceOutputs.getSingleton();
IntersectTermsEnumFrame[] stack;
@ -68,6 +65,9 @@ final class IntersectTermsEnum extends BaseTermsEnum {
private BytesRef savedStartTerm;
private final SegmentTermsEnum.OutputAccumulator outputAccumulator =
new SegmentTermsEnum.OutputAccumulator();
// TODO: in some cases we can filter by length? eg
// regexp foo*bar must be at least length 6 bytes
public IntersectTermsEnum(
@ -114,7 +114,6 @@ final class IntersectTermsEnum extends BaseTermsEnum {
f.prefix = 0;
f.setState(0);
f.arc = arc;
f.outputPrefix = arc.output();
f.load(fr.rootCode);
// for assert:
@ -184,7 +183,9 @@ final class IntersectTermsEnum extends BaseTermsEnum {
FST.Arc<BytesRef> arc = currentFrame.arc;
int idx = currentFrame.prefix;
assert currentFrame.suffix > 0;
BytesRef output = currentFrame.outputPrefix;
outputAccumulator.reset();
outputAccumulator.push(arc.output());
while (idx < f.prefix) {
final int target = term.bytes[idx] & 0xff;
// TODO: we could be more efficient for the next()
@ -192,14 +193,14 @@ final class IntersectTermsEnum extends BaseTermsEnum {
// passed to findTargetArc
arc = fr.index.findTargetArc(target, arc, getArc(1 + idx), fstReader);
assert arc != null;
output = fstOutputs.add(output, arc.output());
outputAccumulator.push(arc.output());
idx++;
}
f.arc = arc;
f.outputPrefix = output;
assert arc.isFinal();
f.load(fstOutputs.add(output, arc.nextFinalOutput()));
outputAccumulator.push(arc.nextFinalOutput());
f.load(outputAccumulator);
return f;
}

View File

@ -55,7 +55,6 @@ final class IntersectTermsEnumFrame {
int statsSingletonRunLength = 0;
final ByteArrayDataInput statsReader = new ByteArrayDataInput();
byte[] floorData = new byte[32];
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
// Length of prefix shared by all terms in this block
@ -90,9 +89,6 @@ final class IntersectTermsEnumFrame {
final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
// Cumulative output so far
BytesRef outputPrefix;
int startBytePos;
int suffix;
@ -120,7 +116,7 @@ final class IntersectTermsEnumFrame {
}
} while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min);
load(null);
load((Long) null);
}
public void setState(int state) {
@ -142,12 +138,22 @@ final class IntersectTermsEnumFrame {
}
void load(BytesRef frameIndexData) throws IOException {
if (frameIndexData != null) {
floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
// Skip first long -- has redundant fp, hasTerms
// flag, isFloor flag
final long code = ite.fr.readVLongOutput(floorDataReader);
if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
load(ite.fr.readVLongOutput(floorDataReader));
}
/**
 * Loads this frame from the outputs collected in {@code outputAccumulator}.
 *
 * <p>The accumulator is first rewound, then the leading vLong block code is decoded from it. The
 * bytes that follow the code in the accumulator's current output are handed to {@code
 * floorDataReader}, and the decoded code is passed on to {@code load(Long)}.
 *
 * @param outputAccumulator accumulated outputs to decode the block code and floor data from
 * @throws IOException if decoding or loading the block fails
 */
void load(SegmentTermsEnum.OutputAccumulator outputAccumulator) throws IOException {
  // Start decoding from the first accumulated output.
  outputAccumulator.prepareRead();
  final long blockCode = ite.fr.readVLongOutput(outputAccumulator);
  // Must happen after the code was consumed: the reader is positioned right behind it.
  outputAccumulator.setFloorData(floorDataReader);
  load(blockCode);
}
void load(Long blockCode) throws IOException {
if (blockCode != null) {
// This block is the first one in a possible sequence of floor blocks corresponding to a
// single seek point from the FST terms index
if ((blockCode & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
// Floor frame
numFollowFloorBlocks = floorDataReader.readVInt();
nextFloorLabel = floorDataReader.readByte() & 0xff;

View File

@ -24,6 +24,7 @@ import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermState;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
@ -48,7 +49,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
// static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
private final ByteArrayDataInput scratchReader = new ByteArrayDataInput();
private final OutputAccumulator outputAccumulator = new OutputAccumulator();
// What prefix of the current term was present in the index; when we only next() through the
// index, this stays at 0. It's only set when
@ -232,18 +233,24 @@ final class SegmentTermsEnum extends BaseTermsEnum {
return arcs[ord];
}
// Pushes a frame we seek'd to
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length)
throws IOException {
scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
final long code = fr.readVLongOutput(scratchReader);
outputAccumulator.reset();
outputAccumulator.push(frameData);
return pushFrame(arc, length);
}
// Pushes a frame we seek'd to
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, int length) throws IOException {
outputAccumulator.prepareRead();
final long code = fr.readVLongOutput(outputAccumulator);
final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord);
f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0;
f.hasTermsOrig = f.hasTerms;
f.isFloor = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0;
if (f.isFloor) {
f.setFloorData(scratchReader, frameData);
f.setFloorData(outputAccumulator);
}
pushFrame(arc, fpSeek, length);
@ -344,9 +351,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
FST.Arc<BytesRef> arc;
int targetUpto;
BytesRef output;
targetBeforeCurrentLength = currentFrame.ord;
outputAccumulator.reset();
if (currentFrame != staticFrame) {
@ -363,7 +370,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
arc = arcs[0];
assert arc.isFinal();
output = arc.output();
outputAccumulator.push(arc.output());
targetUpto = 0;
SegmentTermsEnumFrame lastFrame = stack[0];
@ -373,9 +380,6 @@ final class SegmentTermsEnum extends BaseTermsEnum {
int cmp = 0;
// TODO: reverse vLong byte order for better FST
// prefix output sharing
// First compare up to valid seek frames:
while (targetUpto < targetLimit) {
cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
@ -394,9 +398,8 @@ final class SegmentTermsEnum extends BaseTermsEnum {
+ (char) arc.label()
+ " targetLabel="
+ (char) (target.bytes[target.offset + targetUpto] & 0xFF);
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
}
outputAccumulator.push(arc.output());
if (arc.isFinal()) {
lastFrame = stack[1 + lastFrame.ord];
}
@ -484,15 +487,15 @@ final class SegmentTermsEnum extends BaseTermsEnum {
// System.out.println(" no seek state; push root frame");
// }
output = arc.output();
outputAccumulator.push(arc.output());
currentFrame = staticFrame;
// term.length = 0;
targetUpto = 0;
currentFrame =
pushFrame(
arc, Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0);
outputAccumulator.push(arc.nextFinalOutput());
currentFrame = pushFrame(arc, 0);
outputAccumulator.pop();
}
// if (DEBUG) {
@ -554,9 +557,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
term.setByteAt(targetUpto, (byte) targetLabel);
// Aggregate output as we go:
assert arc.output() != null;
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
}
outputAccumulator.push(arc.output());
// if (DEBUG) {
// System.out.println(" index: follow label=" + toHex(target.bytes[target.offset +
@ -566,11 +567,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
if (arc.isFinal()) {
// if (DEBUG) System.out.println(" arc is final!");
currentFrame =
pushFrame(
arc,
Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()),
targetUpto);
outputAccumulator.push(arc.nextFinalOutput());
currentFrame = pushFrame(arc, targetUpto);
outputAccumulator.pop();
// if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" +
// currentFrame.hasTerms);
}
@ -630,9 +629,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
FST.Arc<BytesRef> arc;
int targetUpto;
BytesRef output;
targetBeforeCurrentLength = currentFrame.ord;
outputAccumulator.reset();
if (currentFrame != staticFrame) {
@ -649,7 +648,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
arc = arcs[0];
assert arc.isFinal();
output = arc.output();
outputAccumulator.push(arc.output());
targetUpto = 0;
SegmentTermsEnumFrame lastFrame = stack[0];
@ -659,9 +658,6 @@ final class SegmentTermsEnum extends BaseTermsEnum {
int cmp = 0;
// TODO: we should write our vLong backwards (MSB
// first) to get better sharing from the FST
// First compare up to valid seek frames:
while (targetUpto < targetLimit) {
cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
@ -680,14 +676,8 @@ final class SegmentTermsEnum extends BaseTermsEnum {
+ (char) arc.label()
+ " targetLabel="
+ (char) (target.bytes[target.offset + targetUpto] & 0xFF);
// TODO: we could save the outputs in local
// byte[][] instead of making new objs ever
// seek; but, often the FST doesn't have any
// shared bytes (but this could change if we
// reverse vLong byte order)
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
}
outputAccumulator.push(arc.output());
if (arc.isFinal()) {
lastFrame = stack[1 + lastFrame.ord];
}
@ -769,15 +759,15 @@ final class SegmentTermsEnum extends BaseTermsEnum {
// System.out.println(" no seek state; push root frame");
// }
output = arc.output();
outputAccumulator.push(arc.output());
currentFrame = staticFrame;
// term.length = 0;
targetUpto = 0;
currentFrame =
pushFrame(
arc, Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0);
outputAccumulator.push(arc.nextFinalOutput());
currentFrame = pushFrame(arc, 0);
outputAccumulator.pop();
}
// if (DEBUG) {
@ -839,9 +829,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
arc = nextArc;
// Aggregate output as we go:
assert arc.output() != null;
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
}
outputAccumulator.push(arc.output());
// if (DEBUG) {
// System.out.println(" index: follow label=" + (target.bytes[target.offset +
@ -851,11 +839,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
if (arc.isFinal()) {
// if (DEBUG) System.out.println(" arc is final!");
currentFrame =
pushFrame(
arc,
Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()),
targetUpto);
outputAccumulator.push(arc.nextFinalOutput());
currentFrame = pushFrame(arc, targetUpto);
outputAccumulator.pop();
// if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" +
// currentFrame.hasTerms);
}
@ -1190,4 +1176,68 @@ final class SegmentTermsEnum extends BaseTermsEnum {
public long ord() {
// Term ordinals are not provided here: every call fails with UnsupportedOperationException.
throw new UnsupportedOperationException();
}
/**
 * Collects outputs as a sequence of {@link BytesRef}s and exposes them as a single {@link
 * DataInput}, so the individual outputs never need to be concatenated into a new object.
 *
 * <p>Usage is two-phase: {@link #push} / {@link #pop} / {@link #reset} build the sequence, then
 * {@link #prepareRead} rewinds the read position and {@link #readByte} walks the pushed outputs in
 * order. Only single-byte reads are supported.
 */
static class OutputAccumulator extends DataInput {

  /** Pushed outputs; only the entries in {@code [0, num)} are live. */
  BytesRef[] outputs = new BytesRef[16];

  /** Output currently being read. */
  BytesRef current;

  /** Number of live entries in {@link #outputs}. */
  int num;

  /** Position of {@link #current} within {@link #outputs}. */
  int outputIndex;

  /** Read position relative to the start of {@link #current}. */
  int index;

  /**
   * Whether the most recent {@link #push} was dropped because it was {@code NO_OUTPUT}. Needed so
   * that the matching {@link #pop} does not remove an unrelated, earlier output.
   */
  private boolean lastPushSkipped;

  /**
   * Appends {@code output} to the sequence. {@code NO_OUTPUT} is not stored, but the fact that it
   * was skipped is remembered for the next {@link #pop}.
   */
  void push(BytesRef output) {
    if (output != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
      outputs = ArrayUtil.grow(outputs, num + 1);
      outputs[num++] = output;
      lastPushSkipped = false;
    } else {
      lastPushSkipped = true;
    }
  }

  /**
   * Undoes the most recent {@link #push}. If that push was a skipped {@code NO_OUTPUT}, nothing
   * was stored for it and therefore nothing is removed; previously this case dropped the preceding
   * output instead.
   */
  void pop() {
    if (lastPushSkipped) {
      // The push being undone never stored anything.
      lastPushSkipped = false;
      return;
    }
    assert num > 0;
    num--;
  }

  /** Discards all pushed outputs. */
  void reset() {
    num = 0;
    lastPushSkipped = false;
  }

  /**
   * Rewinds the read position to the first byte of the first output.
   *
   * <p>NOTE(review): reads {@code outputs[0]} unconditionally, so this presumably expects at
   * least one output to have been pushed -- confirm against callers.
   */
  void prepareRead() {
    index = 0;
    outputIndex = 0;
    current = outputs[0];
  }

  /**
   * Set the last arc as the source of the floorData. This won't change the reading position of
   * this {@link OutputAccumulator}
   */
  void setFloorData(ByteArrayDataInput floorData) {
    assert outputIndex == num - 1
        : "floor data should be stored in last arc, get outputIndex: "
            + outputIndex
            + ", num: "
            + num;
    BytesRef output = outputs[outputIndex];
    // Expose only the bytes that have not been consumed from the last output yet.
    floorData.reset(output.bytes, output.offset + index, output.length - index);
  }

  /** Returns the next byte, moving on to the following output once the current one is used up. */
  @Override
  public byte readByte() throws IOException {
    if (index >= current.length) {
      current = outputs[++outputIndex];
      index = 0;
    }
    return current.bytes[current.offset + index++];
  }

  /** Not supported; always throws {@link UnsupportedOperationException}. */
  @Override
  public void readBytes(byte[] b, int offset, int len) throws IOException {
    throw new UnsupportedOperationException();
  }

  /** Not supported; always throws {@link UnsupportedOperationException}. */
  @Override
  public void skipBytes(long numBytes) throws IOException {
    throw new UnsupportedOperationException();
  }
}
}

View File

@ -55,7 +55,7 @@ final class SegmentTermsEnumFrame {
int statsSingletonRunLength = 0;
final ByteArrayDataInput statsReader = new ByteArrayDataInput();
byte[] floorData = new byte[32];
int rewindPos;
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
// Length of prefix shared by all terms in this block
@ -104,13 +104,9 @@ final class SegmentTermsEnumFrame {
suffixLengthsReader = new ByteArrayDataInput();
}
public void setFloorData(ByteArrayDataInput in, BytesRef source) {
final int numBytes = source.length - (in.getPosition() - source.offset);
if (numBytes > floorData.length) {
floorData = new byte[ArrayUtil.oversize(numBytes, 1)];
}
System.arraycopy(source.bytes, source.offset + in.getPosition(), floorData, 0, numBytes);
floorDataReader.reset(floorData, 0, numBytes);
public void setFloorData(SegmentTermsEnum.OutputAccumulator outputAccumulator) {
outputAccumulator.setFloorData(floorDataReader);
rewindPos = floorDataReader.getPosition();
numFollowFloorBlocks = floorDataReader.readVInt();
nextFloorLabel = floorDataReader.readByte() & 0xff;
// if (DEBUG) {
@ -247,7 +243,7 @@ final class SegmentTermsEnumFrame {
nextEnt = -1;
hasTerms = hasTermsOrig;
if (isFloor) {
floorDataReader.rewind();
floorDataReader.setPosition(rewindPos);
numFollowFloorBlocks = floorDataReader.readVInt();
assert numFollowFloorBlocks > 0;
nextFloorLabel = floorDataReader.readByte() & 0xff;