LUCENE-8920: Disable direct addressing of arcs. (#950)

This commit is contained in:
Adrien Grand 2019-10-14 18:30:36 +02:00 committed by GitHub
parent b6ea7d60b7
commit 81f598c2e6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 10 additions and 99 deletions

View File

@ -50,9 +50,6 @@ import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
public class Builder<T> {
// The amount of Arc array oversizing used to enable direct addressing of Arcs by their labels
static final int DIRECT_ARC_LOAD_FACTOR = 4;
private final NodeHash<T> dedupHash;
final FST<T> fst;
private final T NO_OUTPUT;

View File

@ -88,8 +88,6 @@ public final class FST<T> implements Accountable {
// this means either of these things in different contexts
// in the midst of a direct array:
private static final byte BIT_MISSING_ARC = 1 << 6;
// at the start of a direct array:
private static final byte ARCS_AS_ARRAY_WITH_GAPS = BIT_MISSING_ARC;
/**
* @see #shouldExpand(Builder, Builder.UnCompiledNode)
@ -109,7 +107,7 @@ public final class FST<T> implements Accountable {
// Increment version to change it
private static final String FILE_FORMAT_NAME = "FST";
private static final int VERSION_START = 6;
private static final int VERSION_CURRENT = 7;
private static final int VERSION_CURRENT = VERSION_START;
// Never serialized; just used to represent the virtual
// final node w/ no arcs:
@ -682,34 +680,19 @@ public final class FST<T> implements Accountable {
assert maxBytesPerArc > 0;
// 2nd pass just "expands" all arcs to take up a fixed byte size
// If more than (1 / DIRECT_ARC_LOAD_FACTOR) of the "slots" would be occupied, write an arc
// array that may have holes in it so that we can address the arcs directly by label without
// binary search
int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
boolean writeDirectly = labelRange > 0 && labelRange < Builder.DIRECT_ARC_LOAD_FACTOR * nodeIn.numArcs;
// create the header
// TODO: clean this up: or just rewind+reuse and deal with it
byte[] header = new byte[MAX_HEADER_SIZE];
ByteArrayDataOutput bad = new ByteArrayDataOutput(header);
// write a "false" first arc:
if (writeDirectly) {
bad.writeByte(ARCS_AS_ARRAY_WITH_GAPS);
bad.writeVInt(labelRange);
} else {
bad.writeByte(ARCS_AS_ARRAY_PACKED);
bad.writeVInt(nodeIn.numArcs);
}
bad.writeVInt(maxBytesPerArc);
int headerLen = bad.getPosition();
final long fixedArrayStart = startAddress + headerLen;
if (writeDirectly) {
writeArrayWithGaps(builder, nodeIn, fixedArrayStart, maxBytesPerArc, labelRange);
} else {
writeArrayPacked(builder, nodeIn, fixedArrayStart, maxBytesPerArc);
}
// now write the header
builder.bytes.writeBytes(startAddress, header, 0, headerLen);
@ -743,43 +726,6 @@ public final class FST<T> implements Accountable {
}
}
private void writeArrayWithGaps(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long fixedArrayStart, int maxBytesPerArc, int labelRange) {
// expand the arcs in place, backwards
long srcPos = builder.bytes.getPosition();
long destPos = fixedArrayStart + labelRange * maxBytesPerArc;
// if destPos == srcPos it means all the arcs were the same length, and the array of them is *already* direct
assert destPos >= srcPos;
if (destPos > srcPos) {
builder.bytes.skipBytes((int) (destPos - srcPos));
int arcIdx = nodeIn.numArcs - 1;
int firstLabel = nodeIn.arcs[0].label;
int nextLabel = nodeIn.arcs[arcIdx].label;
for (int directArcIdx = labelRange - 1; directArcIdx >= 0; directArcIdx--) {
destPos -= maxBytesPerArc;
if (directArcIdx == nextLabel - firstLabel) {
int arcLen = builder.reusedBytesPerArc[arcIdx];
srcPos -= arcLen;
//System.out.println(" direct pack idx=" + directArcIdx + " arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos + " label=" + nextLabel);
if (srcPos != destPos) {
//System.out.println(" copy len=" + builder.reusedBytesPerArc[arcIdx]);
assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " reusedBytesPerArc[arcIdx]=" + builder.reusedBytesPerArc[arcIdx] + " nodeIn.numArcs=" + nodeIn.numArcs;
builder.bytes.copyBytes(srcPos, destPos, arcLen);
if (arcIdx == 0) {
break;
}
}
--arcIdx;
nextLabel = nodeIn.arcs[arcIdx].label;
} else {
assert directArcIdx > arcIdx;
// mark this as a missing arc
//System.out.println(" direct pack idx=" + directArcIdx + " no arc");
builder.bytes.writeByte(destPos, BIT_MISSING_ARC);
}
}
}
}
/** Fills virtual 'start' arc, ie, an empty incoming arc to the FST's start node */
public Arc<T> getFirstArc(Arc<T> arc) {
T NO_OUTPUT = outputs.getNoOutput();
@ -821,18 +767,13 @@ public final class FST<T> implements Accountable {
} else {
in.setPosition(follow.target());
final byte b = in.readByte();
if (b == ARCS_AS_ARRAY_PACKED || b == ARCS_AS_ARRAY_WITH_GAPS) {
if (b == ARCS_AS_ARRAY_PACKED) {
// array: jump straight to end
arc.numArcs = in.readVInt();
arc.bytesPerArc = in.readVInt();
//System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc);
arc.posArcsStart = in.getPosition();
if (b == ARCS_AS_ARRAY_WITH_GAPS) {
arc.arcIdx = Integer.MIN_VALUE;
arc.nextArc = arc.posArcsStart() - (arc.numArcs() - 1) * arc.bytesPerArc();
} else {
arc.arcIdx = arc.numArcs() - 2;
}
} else {
arc.flags = b;
// non-array: linear scan
@ -902,7 +843,7 @@ public final class FST<T> implements Accountable {
//System.out.println(" flags=" + arc.flags);
byte flags = in.readByte();
if (flags == ARCS_AS_ARRAY_PACKED || flags == ARCS_AS_ARRAY_WITH_GAPS) {
if (flags == ARCS_AS_ARRAY_PACKED) {
//System.out.println(" fixedArray");
// this is first arc in a fixed-array
arc.numArcs = in.readVInt();
@ -935,7 +876,7 @@ public final class FST<T> implements Accountable {
} else {
in.setPosition(follow.target());
byte flags = in.readByte();
return flags == ARCS_AS_ARRAY_PACKED || flags == ARCS_AS_ARRAY_WITH_GAPS;
return flags == ARCS_AS_ARRAY_PACKED;
}
}
@ -965,7 +906,7 @@ public final class FST<T> implements Accountable {
in.setPosition(pos);
final byte flags = in.readByte();
if (flags == ARCS_AS_ARRAY_PACKED || flags == ARCS_AS_ARRAY_WITH_GAPS) {
if (flags == ARCS_AS_ARRAY_PACKED) {
//System.out.println(" nextArc fixed array");
in.readVInt();
@ -1213,34 +1154,7 @@ public final class FST<T> implements Accountable {
// System.out.println("fta label=" + (char) labelToMatch);
byte flags = in.readByte();
if (flags == ARCS_AS_ARRAY_WITH_GAPS) {
arc.numArcs = in.readVInt();
arc.bytesPerArc = in.readVInt();
arc.posArcsStart = in.getPosition();
// Array is direct; address by label
in.skipBytes(1);
int firstLabel = readLabel(in);
int arcPos = labelToMatch - firstLabel;
if (arcPos == 0) {
arc.nextArc = arc.posArcsStart();
} else if (arcPos > 0) {
if (arcPos >= arc.numArcs()) {
return null;
}
in.setPosition(arc.posArcsStart() - arc.bytesPerArc() * arcPos);
flags = in.readByte();
if (flag(flags, BIT_MISSING_ARC)) {
return null;
}
// point to flags that we just read
arc.nextArc = in.getPosition() + 1;
} else {
return null;
}
arc.arcIdx = Integer.MIN_VALUE;
return readNextRealArc(arc, in);
} else if (flags == ARCS_AS_ARRAY_PACKED) {
if (flags == ARCS_AS_ARRAY_PACKED) {
arc.numArcs = in.readVInt();
arc.bytesPerArc = in.readVInt();
arc.posArcsStart = in.getPosition();