mirror of https://github.com/apache/lucene.git
Optimize outputs accumulating for SegmentTermsEnum and IntersectTermsEnum (#12699)
This commit is contained in:
parent
a6d788e113
commit
d92efa3b5c
|
@ -30,9 +30,7 @@ import org.apache.lucene.util.StringHelper;
|
|||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.RunAutomaton;
|
||||
import org.apache.lucene.util.automaton.Transition;
|
||||
import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.Outputs;
|
||||
|
||||
/**
|
||||
* This is used to implement efficient {@link Terms#intersect} for block-tree. Note that it cannot
|
||||
|
@ -46,7 +44,6 @@ final class IntersectTermsEnum extends BaseTermsEnum {
|
|||
// static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
|
||||
|
||||
final IndexInput in;
|
||||
static final Outputs<BytesRef> fstOutputs = ByteSequenceOutputs.getSingleton();
|
||||
|
||||
IntersectTermsEnumFrame[] stack;
|
||||
|
||||
|
@ -68,6 +65,9 @@ final class IntersectTermsEnum extends BaseTermsEnum {
|
|||
|
||||
private BytesRef savedStartTerm;
|
||||
|
||||
private final SegmentTermsEnum.OutputAccumulator outputAccumulator =
|
||||
new SegmentTermsEnum.OutputAccumulator();
|
||||
|
||||
// TODO: in some cases we can filter by length? eg
|
||||
// regexp foo*bar must be at least length 6 bytes
|
||||
public IntersectTermsEnum(
|
||||
|
@ -114,7 +114,6 @@ final class IntersectTermsEnum extends BaseTermsEnum {
|
|||
f.prefix = 0;
|
||||
f.setState(0);
|
||||
f.arc = arc;
|
||||
f.outputPrefix = arc.output();
|
||||
f.load(fr.rootCode);
|
||||
|
||||
// for assert:
|
||||
|
@ -184,7 +183,9 @@ final class IntersectTermsEnum extends BaseTermsEnum {
|
|||
FST.Arc<BytesRef> arc = currentFrame.arc;
|
||||
int idx = currentFrame.prefix;
|
||||
assert currentFrame.suffix > 0;
|
||||
BytesRef output = currentFrame.outputPrefix;
|
||||
|
||||
outputAccumulator.reset();
|
||||
outputAccumulator.push(arc.output());
|
||||
while (idx < f.prefix) {
|
||||
final int target = term.bytes[idx] & 0xff;
|
||||
// TODO: we could be more efficient for the next()
|
||||
|
@ -192,14 +193,14 @@ final class IntersectTermsEnum extends BaseTermsEnum {
|
|||
// passed to findTargetArc
|
||||
arc = fr.index.findTargetArc(target, arc, getArc(1 + idx), fstReader);
|
||||
assert arc != null;
|
||||
output = fstOutputs.add(output, arc.output());
|
||||
outputAccumulator.push(arc.output());
|
||||
idx++;
|
||||
}
|
||||
|
||||
f.arc = arc;
|
||||
f.outputPrefix = output;
|
||||
assert arc.isFinal();
|
||||
f.load(fstOutputs.add(output, arc.nextFinalOutput()));
|
||||
outputAccumulator.push(arc.nextFinalOutput());
|
||||
f.load(outputAccumulator);
|
||||
return f;
|
||||
}
|
||||
|
||||
|
|
|
@ -55,7 +55,6 @@ final class IntersectTermsEnumFrame {
|
|||
int statsSingletonRunLength = 0;
|
||||
final ByteArrayDataInput statsReader = new ByteArrayDataInput();
|
||||
|
||||
byte[] floorData = new byte[32];
|
||||
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
|
||||
|
||||
// Length of prefix shared by all terms in this block
|
||||
|
@ -90,9 +89,6 @@ final class IntersectTermsEnumFrame {
|
|||
|
||||
final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
|
||||
|
||||
// Cumulative output so far
|
||||
BytesRef outputPrefix;
|
||||
|
||||
int startBytePos;
|
||||
int suffix;
|
||||
|
||||
|
@ -120,7 +116,7 @@ final class IntersectTermsEnumFrame {
|
|||
}
|
||||
} while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min);
|
||||
|
||||
load(null);
|
||||
load((Long) null);
|
||||
}
|
||||
|
||||
public void setState(int state) {
|
||||
|
@ -142,12 +138,22 @@ final class IntersectTermsEnumFrame {
|
|||
}
|
||||
|
||||
void load(BytesRef frameIndexData) throws IOException {
|
||||
if (frameIndexData != null) {
|
||||
floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
|
||||
// Skip first long -- has redundant fp, hasTerms
|
||||
// flag, isFloor flag
|
||||
final long code = ite.fr.readVLongOutput(floorDataReader);
|
||||
if ((code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
|
||||
load(ite.fr.readVLongOutput(floorDataReader));
|
||||
}
|
||||
|
||||
void load(SegmentTermsEnum.OutputAccumulator outputAccumulator) throws IOException {
|
||||
outputAccumulator.prepareRead();
|
||||
long code = ite.fr.readVLongOutput(outputAccumulator);
|
||||
outputAccumulator.setFloorData(floorDataReader);
|
||||
load(code);
|
||||
}
|
||||
|
||||
void load(Long blockCode) throws IOException {
|
||||
if (blockCode != null) {
|
||||
// This block is the first one in a possible sequence of floor blocks corresponding to a
|
||||
// single seek point from the FST terms index
|
||||
if ((blockCode & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
|
||||
// Floor frame
|
||||
numFollowFloorBlocks = floorDataReader.readVInt();
|
||||
nextFloorLabel = floorDataReader.readByte() & 0xff;
|
||||
|
|
|
@ -24,6 +24,7 @@ import org.apache.lucene.index.ImpactsEnum;
|
|||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
@ -48,7 +49,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
|
||||
// static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
|
||||
|
||||
private final ByteArrayDataInput scratchReader = new ByteArrayDataInput();
|
||||
private final OutputAccumulator outputAccumulator = new OutputAccumulator();
|
||||
|
||||
// What prefix of the current term was present in the index; when we only next() through the
|
||||
// index, this stays at 0. It's only set when
|
||||
|
@ -232,18 +233,24 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
return arcs[ord];
|
||||
}
|
||||
|
||||
// Pushes a frame we seek'd to
|
||||
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length)
|
||||
throws IOException {
|
||||
scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
|
||||
final long code = fr.readVLongOutput(scratchReader);
|
||||
outputAccumulator.reset();
|
||||
outputAccumulator.push(frameData);
|
||||
return pushFrame(arc, length);
|
||||
}
|
||||
|
||||
// Pushes a frame we seek'd to
|
||||
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, int length) throws IOException {
|
||||
outputAccumulator.prepareRead();
|
||||
final long code = fr.readVLongOutput(outputAccumulator);
|
||||
final long fpSeek = code >>> Lucene90BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
|
||||
final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord);
|
||||
f.hasTerms = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0;
|
||||
f.hasTermsOrig = f.hasTerms;
|
||||
f.isFloor = (code & Lucene90BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0;
|
||||
if (f.isFloor) {
|
||||
f.setFloorData(scratchReader, frameData);
|
||||
f.setFloorData(outputAccumulator);
|
||||
}
|
||||
pushFrame(arc, fpSeek, length);
|
||||
|
||||
|
@ -344,9 +351,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
|
||||
FST.Arc<BytesRef> arc;
|
||||
int targetUpto;
|
||||
BytesRef output;
|
||||
|
||||
targetBeforeCurrentLength = currentFrame.ord;
|
||||
outputAccumulator.reset();
|
||||
|
||||
if (currentFrame != staticFrame) {
|
||||
|
||||
|
@ -363,7 +370,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
|
||||
arc = arcs[0];
|
||||
assert arc.isFinal();
|
||||
output = arc.output();
|
||||
outputAccumulator.push(arc.output());
|
||||
targetUpto = 0;
|
||||
|
||||
SegmentTermsEnumFrame lastFrame = stack[0];
|
||||
|
@ -373,9 +380,6 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
|
||||
int cmp = 0;
|
||||
|
||||
// TODO: reverse vLong byte order for better FST
|
||||
// prefix output sharing
|
||||
|
||||
// First compare up to valid seek frames:
|
||||
while (targetUpto < targetLimit) {
|
||||
cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
|
||||
|
@ -394,9 +398,8 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
+ (char) arc.label()
|
||||
+ " targetLabel="
|
||||
+ (char) (target.bytes[target.offset + targetUpto] & 0xFF);
|
||||
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
|
||||
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
|
||||
}
|
||||
outputAccumulator.push(arc.output());
|
||||
|
||||
if (arc.isFinal()) {
|
||||
lastFrame = stack[1 + lastFrame.ord];
|
||||
}
|
||||
|
@ -484,15 +487,15 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
// System.out.println(" no seek state; push root frame");
|
||||
// }
|
||||
|
||||
output = arc.output();
|
||||
outputAccumulator.push(arc.output());
|
||||
|
||||
currentFrame = staticFrame;
|
||||
|
||||
// term.length = 0;
|
||||
targetUpto = 0;
|
||||
currentFrame =
|
||||
pushFrame(
|
||||
arc, Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0);
|
||||
outputAccumulator.push(arc.nextFinalOutput());
|
||||
currentFrame = pushFrame(arc, 0);
|
||||
outputAccumulator.pop();
|
||||
}
|
||||
|
||||
// if (DEBUG) {
|
||||
|
@ -554,9 +557,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
term.setByteAt(targetUpto, (byte) targetLabel);
|
||||
// Aggregate output as we go:
|
||||
assert arc.output() != null;
|
||||
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
|
||||
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
|
||||
}
|
||||
outputAccumulator.push(arc.output());
|
||||
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" index: follow label=" + toHex(target.bytes[target.offset +
|
||||
|
@ -566,11 +567,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
|
||||
if (arc.isFinal()) {
|
||||
// if (DEBUG) System.out.println(" arc is final!");
|
||||
currentFrame =
|
||||
pushFrame(
|
||||
arc,
|
||||
Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()),
|
||||
targetUpto);
|
||||
outputAccumulator.push(arc.nextFinalOutput());
|
||||
currentFrame = pushFrame(arc, targetUpto);
|
||||
outputAccumulator.pop();
|
||||
// if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" +
|
||||
// currentFrame.hasTerms);
|
||||
}
|
||||
|
@ -630,9 +629,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
|
||||
FST.Arc<BytesRef> arc;
|
||||
int targetUpto;
|
||||
BytesRef output;
|
||||
|
||||
targetBeforeCurrentLength = currentFrame.ord;
|
||||
outputAccumulator.reset();
|
||||
|
||||
if (currentFrame != staticFrame) {
|
||||
|
||||
|
@ -649,7 +648,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
|
||||
arc = arcs[0];
|
||||
assert arc.isFinal();
|
||||
output = arc.output();
|
||||
outputAccumulator.push(arc.output());
|
||||
targetUpto = 0;
|
||||
|
||||
SegmentTermsEnumFrame lastFrame = stack[0];
|
||||
|
@ -659,9 +658,6 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
|
||||
int cmp = 0;
|
||||
|
||||
// TODO: we should write our vLong backwards (MSB
|
||||
// first) to get better sharing from the FST
|
||||
|
||||
// First compare up to valid seek frames:
|
||||
while (targetUpto < targetLimit) {
|
||||
cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
|
||||
|
@ -680,14 +676,8 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
+ (char) arc.label()
|
||||
+ " targetLabel="
|
||||
+ (char) (target.bytes[target.offset + targetUpto] & 0xFF);
|
||||
// TODO: we could save the outputs in local
|
||||
// byte[][] instead of making new objs ever
|
||||
// seek; but, often the FST doesn't have any
|
||||
// shared bytes (but this could change if we
|
||||
// reverse vLong byte order)
|
||||
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
|
||||
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
|
||||
}
|
||||
|
||||
outputAccumulator.push(arc.output());
|
||||
if (arc.isFinal()) {
|
||||
lastFrame = stack[1 + lastFrame.ord];
|
||||
}
|
||||
|
@ -769,15 +759,15 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
// System.out.println(" no seek state; push root frame");
|
||||
// }
|
||||
|
||||
output = arc.output();
|
||||
outputAccumulator.push(arc.output());
|
||||
|
||||
currentFrame = staticFrame;
|
||||
|
||||
// term.length = 0;
|
||||
targetUpto = 0;
|
||||
currentFrame =
|
||||
pushFrame(
|
||||
arc, Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()), 0);
|
||||
outputAccumulator.push(arc.nextFinalOutput());
|
||||
currentFrame = pushFrame(arc, 0);
|
||||
outputAccumulator.pop();
|
||||
}
|
||||
|
||||
// if (DEBUG) {
|
||||
|
@ -839,9 +829,7 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
arc = nextArc;
|
||||
// Aggregate output as we go:
|
||||
assert arc.output() != null;
|
||||
if (arc.output() != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
|
||||
output = Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.output());
|
||||
}
|
||||
outputAccumulator.push(arc.output());
|
||||
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" index: follow label=" + (target.bytes[target.offset +
|
||||
|
@ -851,11 +839,9 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
|
||||
if (arc.isFinal()) {
|
||||
// if (DEBUG) System.out.println(" arc is final!");
|
||||
currentFrame =
|
||||
pushFrame(
|
||||
arc,
|
||||
Lucene90BlockTreeTermsReader.FST_OUTPUTS.add(output, arc.nextFinalOutput()),
|
||||
targetUpto);
|
||||
outputAccumulator.push(arc.nextFinalOutput());
|
||||
currentFrame = pushFrame(arc, targetUpto);
|
||||
outputAccumulator.pop();
|
||||
// if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" +
|
||||
// currentFrame.hasTerms);
|
||||
}
|
||||
|
@ -1190,4 +1176,68 @@ final class SegmentTermsEnum extends BaseTermsEnum {
|
|||
public long ord() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
static class OutputAccumulator extends DataInput {
|
||||
|
||||
BytesRef[] outputs = new BytesRef[16];
|
||||
BytesRef current;
|
||||
int num;
|
||||
int outputIndex;
|
||||
int index;
|
||||
|
||||
void push(BytesRef output) {
|
||||
if (output != Lucene90BlockTreeTermsReader.NO_OUTPUT) {
|
||||
outputs = ArrayUtil.grow(outputs, num + 1);
|
||||
outputs[num++] = output;
|
||||
}
|
||||
}
|
||||
|
||||
void pop() {
|
||||
assert num > 0;
|
||||
num--;
|
||||
}
|
||||
|
||||
void reset() {
|
||||
num = 0;
|
||||
}
|
||||
|
||||
void prepareRead() {
|
||||
index = 0;
|
||||
outputIndex = 0;
|
||||
current = outputs[0];
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the last arc as the source of the floorData. This won't change the reading position of
|
||||
* this {@link OutputAccumulator}
|
||||
*/
|
||||
void setFloorData(ByteArrayDataInput floorData) {
|
||||
assert outputIndex == num - 1
|
||||
: "floor data should be stored in last arc, get outputIndex: "
|
||||
+ outputIndex
|
||||
+ ", num: "
|
||||
+ num;
|
||||
BytesRef output = outputs[outputIndex];
|
||||
floorData.reset(output.bytes, output.offset + index, output.length - index);
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte readByte() throws IOException {
|
||||
if (index >= current.length) {
|
||||
current = outputs[++outputIndex];
|
||||
index = 0;
|
||||
}
|
||||
return current.bytes[current.offset + index++];
|
||||
}
|
||||
|
||||
@Override
|
||||
public void readBytes(byte[] b, int offset, int len) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void skipBytes(long numBytes) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -55,7 +55,7 @@ final class SegmentTermsEnumFrame {
|
|||
int statsSingletonRunLength = 0;
|
||||
final ByteArrayDataInput statsReader = new ByteArrayDataInput();
|
||||
|
||||
byte[] floorData = new byte[32];
|
||||
int rewindPos;
|
||||
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
|
||||
|
||||
// Length of prefix shared by all terms in this block
|
||||
|
@ -104,13 +104,9 @@ final class SegmentTermsEnumFrame {
|
|||
suffixLengthsReader = new ByteArrayDataInput();
|
||||
}
|
||||
|
||||
public void setFloorData(ByteArrayDataInput in, BytesRef source) {
|
||||
final int numBytes = source.length - (in.getPosition() - source.offset);
|
||||
if (numBytes > floorData.length) {
|
||||
floorData = new byte[ArrayUtil.oversize(numBytes, 1)];
|
||||
}
|
||||
System.arraycopy(source.bytes, source.offset + in.getPosition(), floorData, 0, numBytes);
|
||||
floorDataReader.reset(floorData, 0, numBytes);
|
||||
public void setFloorData(SegmentTermsEnum.OutputAccumulator outputAccumulator) {
|
||||
outputAccumulator.setFloorData(floorDataReader);
|
||||
rewindPos = floorDataReader.getPosition();
|
||||
numFollowFloorBlocks = floorDataReader.readVInt();
|
||||
nextFloorLabel = floorDataReader.readByte() & 0xff;
|
||||
// if (DEBUG) {
|
||||
|
@ -247,7 +243,7 @@ final class SegmentTermsEnumFrame {
|
|||
nextEnt = -1;
|
||||
hasTerms = hasTermsOrig;
|
||||
if (isFloor) {
|
||||
floorDataReader.rewind();
|
||||
floorDataReader.setPosition(rewindPos);
|
||||
numFollowFloorBlocks = floorDataReader.readVInt();
|
||||
assert numFollowFloorBlocks > 0;
|
||||
nextFloorLabel = floorDataReader.readByte() & 0xff;
|
||||
|
|
Loading…
Reference in New Issue