mirror of https://github.com/apache/lucene.git
LUCENE-8920: Disable direct addressing of arcs. (#950)
This commit is contained in:
parent
b6ea7d60b7
commit
81f598c2e6
Binary file not shown.
Binary file not shown.
|
@ -50,9 +50,6 @@ import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
|
||||||
|
|
||||||
public class Builder<T> {
|
public class Builder<T> {
|
||||||
|
|
||||||
// The amount of Arc array oversizing used to enable direct addressing of Arcs by their labels
|
|
||||||
static final int DIRECT_ARC_LOAD_FACTOR = 4;
|
|
||||||
|
|
||||||
private final NodeHash<T> dedupHash;
|
private final NodeHash<T> dedupHash;
|
||||||
final FST<T> fst;
|
final FST<T> fst;
|
||||||
private final T NO_OUTPUT;
|
private final T NO_OUTPUT;
|
||||||
|
|
|
@ -88,8 +88,6 @@ public final class FST<T> implements Accountable {
|
||||||
// this means either of these things in different contexts
|
// this means either of these things in different contexts
|
||||||
// in the midst of a direct array:
|
// in the midst of a direct array:
|
||||||
private static final byte BIT_MISSING_ARC = 1 << 6;
|
private static final byte BIT_MISSING_ARC = 1 << 6;
|
||||||
// at the start of a direct array:
|
|
||||||
private static final byte ARCS_AS_ARRAY_WITH_GAPS = BIT_MISSING_ARC;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @see #shouldExpand(Builder, Builder.UnCompiledNode)
|
* @see #shouldExpand(Builder, Builder.UnCompiledNode)
|
||||||
|
@ -109,7 +107,7 @@ public final class FST<T> implements Accountable {
|
||||||
// Increment version to change it
|
// Increment version to change it
|
||||||
private static final String FILE_FORMAT_NAME = "FST";
|
private static final String FILE_FORMAT_NAME = "FST";
|
||||||
private static final int VERSION_START = 6;
|
private static final int VERSION_START = 6;
|
||||||
private static final int VERSION_CURRENT = 7;
|
private static final int VERSION_CURRENT = VERSION_START;
|
||||||
|
|
||||||
// Never serialized; just used to represent the virtual
|
// Never serialized; just used to represent the virtual
|
||||||
// final node w/ no arcs:
|
// final node w/ no arcs:
|
||||||
|
@ -682,34 +680,19 @@ public final class FST<T> implements Accountable {
|
||||||
assert maxBytesPerArc > 0;
|
assert maxBytesPerArc > 0;
|
||||||
// 2nd pass just "expands" all arcs to take up a fixed byte size
|
// 2nd pass just "expands" all arcs to take up a fixed byte size
|
||||||
|
|
||||||
// If more than (1 / DIRECT_ARC_LOAD_FACTOR) of the "slots" would be occupied, write an arc
|
|
||||||
// array that may have holes in it so that we can address the arcs directly by label without
|
|
||||||
// binary search
|
|
||||||
int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
|
|
||||||
boolean writeDirectly = labelRange > 0 && labelRange < Builder.DIRECT_ARC_LOAD_FACTOR * nodeIn.numArcs;
|
|
||||||
|
|
||||||
// create the header
|
// create the header
|
||||||
// TODO: clean this up: or just rewind+reuse and deal with it
|
// TODO: clean this up: or just rewind+reuse and deal with it
|
||||||
byte[] header = new byte[MAX_HEADER_SIZE];
|
byte[] header = new byte[MAX_HEADER_SIZE];
|
||||||
ByteArrayDataOutput bad = new ByteArrayDataOutput(header);
|
ByteArrayDataOutput bad = new ByteArrayDataOutput(header);
|
||||||
// write a "false" first arc:
|
// write a "false" first arc:
|
||||||
if (writeDirectly) {
|
|
||||||
bad.writeByte(ARCS_AS_ARRAY_WITH_GAPS);
|
|
||||||
bad.writeVInt(labelRange);
|
|
||||||
} else {
|
|
||||||
bad.writeByte(ARCS_AS_ARRAY_PACKED);
|
bad.writeByte(ARCS_AS_ARRAY_PACKED);
|
||||||
bad.writeVInt(nodeIn.numArcs);
|
bad.writeVInt(nodeIn.numArcs);
|
||||||
}
|
|
||||||
bad.writeVInt(maxBytesPerArc);
|
bad.writeVInt(maxBytesPerArc);
|
||||||
int headerLen = bad.getPosition();
|
int headerLen = bad.getPosition();
|
||||||
|
|
||||||
final long fixedArrayStart = startAddress + headerLen;
|
final long fixedArrayStart = startAddress + headerLen;
|
||||||
|
|
||||||
if (writeDirectly) {
|
|
||||||
writeArrayWithGaps(builder, nodeIn, fixedArrayStart, maxBytesPerArc, labelRange);
|
|
||||||
} else {
|
|
||||||
writeArrayPacked(builder, nodeIn, fixedArrayStart, maxBytesPerArc);
|
writeArrayPacked(builder, nodeIn, fixedArrayStart, maxBytesPerArc);
|
||||||
}
|
|
||||||
|
|
||||||
// now write the header
|
// now write the header
|
||||||
builder.bytes.writeBytes(startAddress, header, 0, headerLen);
|
builder.bytes.writeBytes(startAddress, header, 0, headerLen);
|
||||||
|
@ -743,43 +726,6 @@ public final class FST<T> implements Accountable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void writeArrayWithGaps(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long fixedArrayStart, int maxBytesPerArc, int labelRange) {
|
|
||||||
// expand the arcs in place, backwards
|
|
||||||
long srcPos = builder.bytes.getPosition();
|
|
||||||
long destPos = fixedArrayStart + labelRange * maxBytesPerArc;
|
|
||||||
// if destPos == srcPos it means all the arcs were the same length, and the array of them is *already* direct
|
|
||||||
assert destPos >= srcPos;
|
|
||||||
if (destPos > srcPos) {
|
|
||||||
builder.bytes.skipBytes((int) (destPos - srcPos));
|
|
||||||
int arcIdx = nodeIn.numArcs - 1;
|
|
||||||
int firstLabel = nodeIn.arcs[0].label;
|
|
||||||
int nextLabel = nodeIn.arcs[arcIdx].label;
|
|
||||||
for (int directArcIdx = labelRange - 1; directArcIdx >= 0; directArcIdx--) {
|
|
||||||
destPos -= maxBytesPerArc;
|
|
||||||
if (directArcIdx == nextLabel - firstLabel) {
|
|
||||||
int arcLen = builder.reusedBytesPerArc[arcIdx];
|
|
||||||
srcPos -= arcLen;
|
|
||||||
//System.out.println(" direct pack idx=" + directArcIdx + " arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos + " label=" + nextLabel);
|
|
||||||
if (srcPos != destPos) {
|
|
||||||
//System.out.println(" copy len=" + builder.reusedBytesPerArc[arcIdx]);
|
|
||||||
assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " reusedBytesPerArc[arcIdx]=" + builder.reusedBytesPerArc[arcIdx] + " nodeIn.numArcs=" + nodeIn.numArcs;
|
|
||||||
builder.bytes.copyBytes(srcPos, destPos, arcLen);
|
|
||||||
if (arcIdx == 0) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
--arcIdx;
|
|
||||||
nextLabel = nodeIn.arcs[arcIdx].label;
|
|
||||||
} else {
|
|
||||||
assert directArcIdx > arcIdx;
|
|
||||||
// mark this as a missing arc
|
|
||||||
//System.out.println(" direct pack idx=" + directArcIdx + " no arc");
|
|
||||||
builder.bytes.writeByte(destPos, BIT_MISSING_ARC);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Fills virtual 'start' arc, ie, an empty incoming arc to the FST's start node */
|
/** Fills virtual 'start' arc, ie, an empty incoming arc to the FST's start node */
|
||||||
public Arc<T> getFirstArc(Arc<T> arc) {
|
public Arc<T> getFirstArc(Arc<T> arc) {
|
||||||
T NO_OUTPUT = outputs.getNoOutput();
|
T NO_OUTPUT = outputs.getNoOutput();
|
||||||
|
@ -821,18 +767,13 @@ public final class FST<T> implements Accountable {
|
||||||
} else {
|
} else {
|
||||||
in.setPosition(follow.target());
|
in.setPosition(follow.target());
|
||||||
final byte b = in.readByte();
|
final byte b = in.readByte();
|
||||||
if (b == ARCS_AS_ARRAY_PACKED || b == ARCS_AS_ARRAY_WITH_GAPS) {
|
if (b == ARCS_AS_ARRAY_PACKED) {
|
||||||
// array: jump straight to end
|
// array: jump straight to end
|
||||||
arc.numArcs = in.readVInt();
|
arc.numArcs = in.readVInt();
|
||||||
arc.bytesPerArc = in.readVInt();
|
arc.bytesPerArc = in.readVInt();
|
||||||
//System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc);
|
//System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc);
|
||||||
arc.posArcsStart = in.getPosition();
|
arc.posArcsStart = in.getPosition();
|
||||||
if (b == ARCS_AS_ARRAY_WITH_GAPS) {
|
|
||||||
arc.arcIdx = Integer.MIN_VALUE;
|
|
||||||
arc.nextArc = arc.posArcsStart() - (arc.numArcs() - 1) * arc.bytesPerArc();
|
|
||||||
} else {
|
|
||||||
arc.arcIdx = arc.numArcs() - 2;
|
arc.arcIdx = arc.numArcs() - 2;
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
arc.flags = b;
|
arc.flags = b;
|
||||||
// non-array: linear scan
|
// non-array: linear scan
|
||||||
|
@ -902,7 +843,7 @@ public final class FST<T> implements Accountable {
|
||||||
//System.out.println(" flags=" + arc.flags);
|
//System.out.println(" flags=" + arc.flags);
|
||||||
|
|
||||||
byte flags = in.readByte();
|
byte flags = in.readByte();
|
||||||
if (flags == ARCS_AS_ARRAY_PACKED || flags == ARCS_AS_ARRAY_WITH_GAPS) {
|
if (flags == ARCS_AS_ARRAY_PACKED) {
|
||||||
//System.out.println(" fixedArray");
|
//System.out.println(" fixedArray");
|
||||||
// this is first arc in a fixed-array
|
// this is first arc in a fixed-array
|
||||||
arc.numArcs = in.readVInt();
|
arc.numArcs = in.readVInt();
|
||||||
|
@ -935,7 +876,7 @@ public final class FST<T> implements Accountable {
|
||||||
} else {
|
} else {
|
||||||
in.setPosition(follow.target());
|
in.setPosition(follow.target());
|
||||||
byte flags = in.readByte();
|
byte flags = in.readByte();
|
||||||
return flags == ARCS_AS_ARRAY_PACKED || flags == ARCS_AS_ARRAY_WITH_GAPS;
|
return flags == ARCS_AS_ARRAY_PACKED;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -965,7 +906,7 @@ public final class FST<T> implements Accountable {
|
||||||
in.setPosition(pos);
|
in.setPosition(pos);
|
||||||
|
|
||||||
final byte flags = in.readByte();
|
final byte flags = in.readByte();
|
||||||
if (flags == ARCS_AS_ARRAY_PACKED || flags == ARCS_AS_ARRAY_WITH_GAPS) {
|
if (flags == ARCS_AS_ARRAY_PACKED) {
|
||||||
//System.out.println(" nextArc fixed array");
|
//System.out.println(" nextArc fixed array");
|
||||||
in.readVInt();
|
in.readVInt();
|
||||||
|
|
||||||
|
@ -1213,34 +1154,7 @@ public final class FST<T> implements Accountable {
|
||||||
// System.out.println("fta label=" + (char) labelToMatch);
|
// System.out.println("fta label=" + (char) labelToMatch);
|
||||||
|
|
||||||
byte flags = in.readByte();
|
byte flags = in.readByte();
|
||||||
if (flags == ARCS_AS_ARRAY_WITH_GAPS) {
|
if (flags == ARCS_AS_ARRAY_PACKED) {
|
||||||
arc.numArcs = in.readVInt();
|
|
||||||
arc.bytesPerArc = in.readVInt();
|
|
||||||
arc.posArcsStart = in.getPosition();
|
|
||||||
|
|
||||||
// Array is direct; address by label
|
|
||||||
in.skipBytes(1);
|
|
||||||
int firstLabel = readLabel(in);
|
|
||||||
int arcPos = labelToMatch - firstLabel;
|
|
||||||
if (arcPos == 0) {
|
|
||||||
arc.nextArc = arc.posArcsStart();
|
|
||||||
} else if (arcPos > 0) {
|
|
||||||
if (arcPos >= arc.numArcs()) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
in.setPosition(arc.posArcsStart() - arc.bytesPerArc() * arcPos);
|
|
||||||
flags = in.readByte();
|
|
||||||
if (flag(flags, BIT_MISSING_ARC)) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
// point to flags that we just read
|
|
||||||
arc.nextArc = in.getPosition() + 1;
|
|
||||||
} else {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
arc.arcIdx = Integer.MIN_VALUE;
|
|
||||||
return readNextRealArc(arc, in);
|
|
||||||
} else if (flags == ARCS_AS_ARRAY_PACKED) {
|
|
||||||
arc.numArcs = in.readVInt();
|
arc.numArcs = in.readVInt();
|
||||||
arc.bytesPerArc = in.readVInt();
|
arc.bytesPerArc = in.readVInt();
|
||||||
arc.posArcsStart = in.getPosition();
|
arc.posArcsStart = in.getPosition();
|
||||||
|
|
Loading…
Reference in New Issue