mirror of https://github.com/apache/lucene.git
Specialize arc store for continuous label in FST (#12748)
* init * review fix and reuse duplicate code * rebase * tidy * CHANGES.txt * bump version * rebase * CHANGES.txt
This commit is contained in:
parent
a71d64a598
commit
570832eb74
|
@ -264,6 +264,8 @@ Optimizations
|
|||
|
||||
* GITHUB#12381: Skip docs with DocValues in NumericLeafComparator. (Lu Xugang, Adrien Grand)
|
||||
|
||||
* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Chao Zhang)
|
||||
|
||||
Changes in runtime behavior
|
||||
---------------------
|
||||
|
||||
|
|
|
@ -86,8 +86,11 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
|
|||
*/
|
||||
public static final int VERSION_MSB_VLONG_OUTPUT = 1;
|
||||
|
||||
/** The version that specializes arc storage for continuous labels in FST. */
|
||||
public static final int VERSION_FST_CONTINUOUS_ARCS = 2;
|
||||
|
||||
/** Current terms format. */
|
||||
public static final int VERSION_CURRENT = VERSION_MSB_VLONG_OUTPUT;
|
||||
public static final int VERSION_CURRENT = VERSION_FST_CONTINUOUS_ARCS;
|
||||
|
||||
/** Extension of terms index file */
|
||||
static final String TERMS_INDEX_EXTENSION = "tip";
|
||||
|
|
|
@ -98,11 +98,19 @@ public final class FST<T> implements Accountable {
|
|||
*/
|
||||
static final byte ARCS_FOR_DIRECT_ADDRESSING = 1 << 6;
|
||||
|
||||
/**
|
||||
* Value of the arc flags to declare a node with continuous arcs, designed for locating the arc directly
|
||||
* with labelToPos - firstLabel. Like {@link #ARCS_FOR_BINARY_SEARCH}, we use flag combinations
|
||||
* that will not occur at the same time.
|
||||
*/
|
||||
static final byte ARCS_FOR_CONTINUOUS = ARCS_FOR_DIRECT_ADDRESSING + ARCS_FOR_BINARY_SEARCH;
|
||||
|
||||
// Increment version to change it
|
||||
private static final String FILE_FORMAT_NAME = "FST";
|
||||
private static final int VERSION_START = 6;
|
||||
private static final int VERSION_LITTLE_ENDIAN = 8;
|
||||
static final int VERSION_CURRENT = VERSION_LITTLE_ENDIAN;
|
||||
private static final int VERSION_CONTINUOUS_ARCS = 9;
|
||||
static final int VERSION_CURRENT = VERSION_CONTINUOUS_ARCS;
|
||||
|
||||
// Never serialized; just used to represent the virtual
|
||||
// final node w/ no arcs:
|
||||
|
@ -243,7 +251,10 @@ public final class FST<T> implements Accountable {
|
|||
.append(numArcs())
|
||||
.append(")")
|
||||
.append("(")
|
||||
.append(nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING ? "da" : "bs")
|
||||
.append(
|
||||
nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING
|
||||
? "da"
|
||||
: nodeFlags() == ARCS_FOR_CONTINUOUS ? "cs" : "bs")
|
||||
.append(")");
|
||||
}
|
||||
return b.toString();
|
||||
|
@ -285,8 +296,8 @@ public final class FST<T> implements Accountable {
|
|||
|
||||
/**
|
||||
* Node header flags. Only meaningful to check if the value is either {@link
|
||||
* #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} (other value when bytesPerArc
|
||||
* == 0).
|
||||
* #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} or {@link
|
||||
* #ARCS_FOR_CONTINUOUS} (other value when bytesPerArc == 0).
|
||||
*/
|
||||
public byte nodeFlags() {
|
||||
return nodeFlags;
|
||||
|
@ -318,7 +329,7 @@ public final class FST<T> implements Accountable {
|
|||
|
||||
/**
|
||||
* First label of a direct addressing node. Only valid if nodeFlags == {@link
|
||||
* #ARCS_FOR_DIRECT_ADDRESSING}.
|
||||
* #ARCS_FOR_DIRECT_ADDRESSING} or {@link #ARCS_FOR_CONTINUOUS}.
|
||||
*/
|
||||
int firstLabel() {
|
||||
return firstLabel;
|
||||
|
@ -653,7 +664,9 @@ public final class FST<T> implements Accountable {
|
|||
} else {
|
||||
in.setPosition(follow.target());
|
||||
byte flags = arc.nodeFlags = in.readByte();
|
||||
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) {
|
||||
if (flags == ARCS_FOR_BINARY_SEARCH
|
||||
|| flags == ARCS_FOR_DIRECT_ADDRESSING
|
||||
|| flags == ARCS_FOR_CONTINUOUS) {
|
||||
// Special arc which is actually a node header for fixed length arcs.
|
||||
// Jump straight to end to find the last arc.
|
||||
arc.numArcs = in.readVInt();
|
||||
|
@ -664,10 +677,14 @@ public final class FST<T> implements Accountable {
|
|||
arc.firstLabel = readLabel(in);
|
||||
arc.posArcsStart = in.getPosition();
|
||||
readLastArcByDirectAddressing(arc, in);
|
||||
} else {
|
||||
} else if (flags == ARCS_FOR_BINARY_SEARCH) {
|
||||
arc.arcIdx = arc.numArcs() - 2;
|
||||
arc.posArcsStart = in.getPosition();
|
||||
readNextRealArc(arc, in);
|
||||
} else {
|
||||
arc.firstLabel = readLabel(in);
|
||||
arc.posArcsStart = in.getPosition();
|
||||
readLastArcByContinuous(arc, in);
|
||||
}
|
||||
} else {
|
||||
arc.flags = flags;
|
||||
|
@ -740,7 +757,9 @@ public final class FST<T> implements Accountable {
|
|||
in.setPosition(nodeAddress);
|
||||
|
||||
byte flags = arc.nodeFlags = in.readByte();
|
||||
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) {
|
||||
if (flags == ARCS_FOR_BINARY_SEARCH
|
||||
|| flags == ARCS_FOR_DIRECT_ADDRESSING
|
||||
|| flags == ARCS_FOR_CONTINUOUS) {
|
||||
// Special arc which is actually a node header for fixed length arcs.
|
||||
arc.numArcs = in.readVInt();
|
||||
arc.bytesPerArc = in.readVInt();
|
||||
|
@ -749,6 +768,8 @@ public final class FST<T> implements Accountable {
|
|||
readPresenceBytes(arc, in);
|
||||
arc.firstLabel = readLabel(in);
|
||||
arc.presenceIndex = -1;
|
||||
} else if (flags == ARCS_FOR_CONTINUOUS) {
|
||||
arc.firstLabel = readLabel(in);
|
||||
}
|
||||
arc.posArcsStart = in.getPosition();
|
||||
} else {
|
||||
|
@ -773,7 +794,9 @@ public final class FST<T> implements Accountable {
|
|||
} else {
|
||||
in.setPosition(follow.target());
|
||||
byte flags = in.readByte();
|
||||
return flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING;
|
||||
return flags == ARCS_FOR_BINARY_SEARCH
|
||||
|| flags == ARCS_FOR_DIRECT_ADDRESSING
|
||||
|| flags == ARCS_FOR_CONTINUOUS;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -801,16 +824,18 @@ public final class FST<T> implements Accountable {
|
|||
|
||||
in.setPosition(arc.nextArc());
|
||||
byte flags = in.readByte();
|
||||
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) {
|
||||
if (flags == ARCS_FOR_BINARY_SEARCH
|
||||
|| flags == ARCS_FOR_DIRECT_ADDRESSING
|
||||
|| flags == ARCS_FOR_CONTINUOUS) {
|
||||
// System.out.println(" nextArc fixed length arc");
|
||||
// Special arc which is actually a node header for fixed length arcs.
|
||||
int numArcs = in.readVInt();
|
||||
in.readVInt(); // Skip bytesPerArc.
|
||||
if (flags == ARCS_FOR_BINARY_SEARCH) {
|
||||
in.readByte(); // Skip arc flags.
|
||||
} else {
|
||||
} else if (flags == ARCS_FOR_DIRECT_ADDRESSING) {
|
||||
in.skipBytes(getNumPresenceBytes(numArcs));
|
||||
}
|
||||
} // Nothing to do for ARCS_FOR_CONTINUOUS
|
||||
}
|
||||
} else {
|
||||
switch (arc.nodeFlags()) {
|
||||
|
@ -826,6 +851,8 @@ public final class FST<T> implements Accountable {
|
|||
int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in);
|
||||
assert nextIndex != -1;
|
||||
return arc.firstLabel() + nextIndex;
|
||||
case ARCS_FOR_CONTINUOUS:
|
||||
return arc.firstLabel() + arc.arcIdx() + 1;
|
||||
default:
|
||||
// Variable length arcs - linear search.
|
||||
assert arc.bytesPerArc() == 0;
|
||||
|
@ -849,6 +876,20 @@ public final class FST<T> implements Accountable {
|
|||
return readArc(arc, in);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a Continuous node arc, with the provided index in the label range.
|
||||
*
|
||||
* @param rangeIndex The index of the arc in the label range. It must be within the label range.
|
||||
*/
|
||||
public Arc<T> readArcByContinuous(Arc<T> arc, final BytesReader in, int rangeIndex)
|
||||
throws IOException {
|
||||
assert rangeIndex >= 0 && rangeIndex < arc.numArcs();
|
||||
in.setPosition(arc.posArcsStart() - rangeIndex * (long) arc.bytesPerArc());
|
||||
arc.arcIdx = rangeIndex;
|
||||
arc.flags = in.readByte();
|
||||
return readArc(arc, in);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a present direct addressing node arc, with the provided index in the label range.
|
||||
*
|
||||
|
@ -888,6 +929,11 @@ public final class FST<T> implements Accountable {
|
|||
return readArcByDirectAddressing(arc, in, arc.numArcs() - 1, presenceIndex);
|
||||
}
|
||||
|
||||
/** Reads the last arc of a continuous node. */
|
||||
public Arc<T> readLastArcByContinuous(Arc<T> arc, final BytesReader in) throws IOException {
|
||||
return readArcByContinuous(arc, in, arc.numArcs() - 1);
|
||||
}
|
||||
|
||||
/** Never returns null, but you should never call this if arc.isLast() is true. */
|
||||
public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException {
|
||||
|
||||
|
@ -896,6 +942,7 @@ public final class FST<T> implements Accountable {
|
|||
|
||||
switch (arc.nodeFlags()) {
|
||||
case ARCS_FOR_BINARY_SEARCH:
|
||||
case ARCS_FOR_CONTINUOUS:
|
||||
assert arc.bytesPerArc() > 0;
|
||||
arc.arcIdx++;
|
||||
assert arc.arcIdx() >= 0 && arc.arcIdx() < arc.numArcs();
|
||||
|
@ -924,7 +971,7 @@ public final class FST<T> implements Accountable {
|
|||
* positioned just after the arc flags byte.
|
||||
*/
|
||||
private Arc<T> readArc(Arc<T> arc, BytesReader in) throws IOException {
|
||||
if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING) {
|
||||
if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING || arc.nodeFlags() == ARCS_FOR_CONTINUOUS) {
|
||||
arc.label = arc.firstLabel() + arc.arcIdx();
|
||||
} else {
|
||||
arc.label = readLabel(in);
|
||||
|
@ -1067,6 +1114,17 @@ public final class FST<T> implements Accountable {
|
|||
}
|
||||
}
|
||||
return null;
|
||||
} else if (flags == ARCS_FOR_CONTINUOUS) {
|
||||
arc.numArcs = in.readVInt();
|
||||
arc.bytesPerArc = in.readVInt();
|
||||
arc.firstLabel = readLabel(in);
|
||||
arc.posArcsStart = in.getPosition();
|
||||
int arcIndex = labelToMatch - arc.firstLabel();
|
||||
if (arcIndex < 0 || arcIndex >= arc.numArcs()) {
|
||||
return null; // Before or after label range.
|
||||
}
|
||||
arc.arcIdx = arcIndex - 1;
|
||||
return readNextRealArc(arc, in);
|
||||
}
|
||||
|
||||
// Linear scan
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
package org.apache.lucene.util.fst;
|
||||
|
||||
import static org.apache.lucene.util.fst.FST.ARCS_FOR_BINARY_SEARCH;
|
||||
import static org.apache.lucene.util.fst.FST.ARCS_FOR_CONTINUOUS;
|
||||
import static org.apache.lucene.util.fst.FST.ARCS_FOR_DIRECT_ADDRESSING;
|
||||
import static org.apache.lucene.util.fst.FST.BIT_ARC_HAS_FINAL_OUTPUT;
|
||||
import static org.apache.lucene.util.fst.FST.BIT_ARC_HAS_OUTPUT;
|
||||
|
@ -113,6 +114,7 @@ public class FSTCompiler<T> {
|
|||
long nodeCount;
|
||||
long binarySearchNodeCount;
|
||||
long directAddressingNodeCount;
|
||||
long continuousNodeCount;
|
||||
|
||||
final boolean allowFixedLengthArcs;
|
||||
final float directAddressingMaxOversizingFactor;
|
||||
|
@ -445,9 +447,15 @@ public class FSTCompiler<T> {
|
|||
|
||||
int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
|
||||
assert labelRange > 0;
|
||||
if (shouldExpandNodeWithDirectAddressing(
|
||||
boolean continuousLabel = labelRange == nodeIn.numArcs;
|
||||
if (continuousLabel) {
|
||||
writeNodeForDirectAddressingOrContinuous(
|
||||
nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange, true);
|
||||
continuousNodeCount++;
|
||||
} else if (shouldExpandNodeWithDirectAddressing(
|
||||
nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) {
|
||||
writeNodeForDirectAddressing(nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange);
|
||||
writeNodeForDirectAddressingOrContinuous(
|
||||
nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange, false);
|
||||
directAddressingNodeCount++;
|
||||
} else {
|
||||
writeNodeForBinarySearch(nodeIn, startAddress, maxBytesPerArc);
|
||||
|
@ -578,18 +586,19 @@ public class FSTCompiler<T> {
|
|||
bytes.writeBytes(startAddress, fixedLengthArcsBuffer.getBytes(), 0, headerLen);
|
||||
}
|
||||
|
||||
private void writeNodeForDirectAddressing(
|
||||
private void writeNodeForDirectAddressingOrContinuous(
|
||||
FSTCompiler.UnCompiledNode<T> nodeIn,
|
||||
long startAddress,
|
||||
int maxBytesPerArcWithoutLabel,
|
||||
int labelRange) {
|
||||
int labelRange,
|
||||
boolean continuous) {
|
||||
// Expand the arcs backwards in a buffer because we remove the labels.
|
||||
// So the obtained arcs might occupy less space. This is the reason why this
|
||||
// whole method is more complex.
|
||||
// Drop the label bytes since we can infer the label based on the arc index,
|
||||
// the presence bits, and the first label. Keep the first label.
|
||||
int headerMaxLen = 11;
|
||||
int numPresenceBytes = getNumPresenceBytes(labelRange);
|
||||
int numPresenceBytes = continuous ? 0 : getNumPresenceBytes(labelRange);
|
||||
long srcPos = bytes.getPosition();
|
||||
int totalArcBytes = numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel;
|
||||
int bufferOffset = headerMaxLen + numPresenceBytes + totalArcBytes;
|
||||
|
@ -620,7 +629,7 @@ public class FSTCompiler<T> {
|
|||
// metadata.
|
||||
fixedLengthArcsBuffer
|
||||
.resetPosition()
|
||||
.writeByte(ARCS_FOR_DIRECT_ADDRESSING)
|
||||
.writeByte(continuous ? ARCS_FOR_CONTINUOUS : ARCS_FOR_DIRECT_ADDRESSING)
|
||||
.writeVInt(labelRange) // labelRange instead of numArcs.
|
||||
.writeVInt(
|
||||
maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc.
|
||||
|
@ -642,8 +651,10 @@ public class FSTCompiler<T> {
|
|||
writeOffset += headerLen;
|
||||
|
||||
// Write the presence bits
|
||||
if (continuous == false) {
|
||||
writePresenceBits(nodeIn, writeOffset, numPresenceBytes);
|
||||
writeOffset += numPresenceBytes;
|
||||
}
|
||||
|
||||
// Write the first label and the arcs.
|
||||
bytes.writeBytes(writeOffset, fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes);
|
||||
|
|
|
@ -149,9 +149,11 @@ abstract class FSTEnum<T> {
|
|||
final FST.BytesReader in = fst.getBytesReader();
|
||||
if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
|
||||
arc = doSeekCeilArrayDirectAddressing(arc, targetLabel, in);
|
||||
} else {
|
||||
assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH;
|
||||
} else if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
|
||||
arc = doSeekCeilArrayPacked(arc, targetLabel, in);
|
||||
} else {
|
||||
assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS;
|
||||
arc = doSeekCeilArrayContinuous(arc, targetLabel, in);
|
||||
}
|
||||
} else {
|
||||
arc = doSeekCeilList(arc, targetLabel);
|
||||
|
@ -159,6 +161,33 @@ abstract class FSTEnum<T> {
|
|||
}
|
||||
}
|
||||
|
||||
private FST.Arc<T> doSeekCeilArrayContinuous(
|
||||
final FST.Arc<T> arc, final int targetLabel, final FST.BytesReader in) throws IOException {
|
||||
int targetIndex = targetLabel - arc.firstLabel();
|
||||
if (targetIndex >= arc.numArcs()) {
|
||||
rollbackToLastForkThenPush();
|
||||
return null;
|
||||
} else {
|
||||
if (targetIndex < 0) {
|
||||
fst.readArcByContinuous(arc, in, 0);
|
||||
assert arc.label() > targetLabel;
|
||||
pushFirst();
|
||||
return null;
|
||||
} else {
|
||||
fst.readArcByContinuous(arc, in, targetIndex);
|
||||
assert arc.label() == targetLabel;
|
||||
// found -- copy pasta from below
|
||||
output[upto] = fst.outputs.add(output[upto - 1], arc.output());
|
||||
if (targetLabel == FST.END_LABEL) {
|
||||
return null;
|
||||
}
|
||||
setCurrentLabel(arc.label());
|
||||
incr();
|
||||
return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private FST.Arc<T> doSeekCeilArrayDirectAddressing(
|
||||
final FST.Arc<T> arc, final int targetLabel, final FST.BytesReader in) throws IOException {
|
||||
// The array is addressed directly by label, with presence bits to compute the actual arc
|
||||
|
@ -166,24 +195,8 @@ abstract class FSTEnum<T> {
|
|||
|
||||
int targetIndex = targetLabel - arc.firstLabel();
|
||||
if (targetIndex >= arc.numArcs()) {
|
||||
// Target is beyond the last arc, out of label range.
|
||||
// Dead end (target is after the last arc);
|
||||
// rollback to last fork then push
|
||||
upto--;
|
||||
while (true) {
|
||||
if (upto == 0) {
|
||||
rollbackToLastForkThenPush();
|
||||
return null;
|
||||
}
|
||||
final FST.Arc<T> prevArc = getArc(upto);
|
||||
// System.out.println(" rollback upto=" + upto + " arc.label=" + prevArc.label + "
|
||||
// isLast?=" + prevArc.isLast());
|
||||
if (!prevArc.isLast()) {
|
||||
fst.readNextArc(prevArc, fstReader);
|
||||
pushFirst();
|
||||
return null;
|
||||
}
|
||||
upto--;
|
||||
}
|
||||
} else {
|
||||
if (targetIndex < 0) {
|
||||
targetIndex = -1;
|
||||
|
@ -332,9 +345,11 @@ abstract class FSTEnum<T> {
|
|||
final FST.BytesReader in = fst.getBytesReader();
|
||||
if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
|
||||
arc = doSeekFloorArrayDirectAddressing(arc, targetLabel, in);
|
||||
} else {
|
||||
assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH;
|
||||
} else if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
|
||||
arc = doSeekFloorArrayPacked(arc, targetLabel, in);
|
||||
} else {
|
||||
assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS;
|
||||
arc = doSeekFloorContinuous(arc, targetLabel, in);
|
||||
}
|
||||
} else {
|
||||
arc = doSeekFloorList(arc, targetLabel);
|
||||
|
@ -342,6 +357,34 @@ abstract class FSTEnum<T> {
|
|||
}
|
||||
}
|
||||
|
||||
private FST.Arc<T> doSeekFloorContinuous(FST.Arc<T> arc, int targetLabel, FST.BytesReader in)
|
||||
throws IOException {
|
||||
int targetIndex = targetLabel - arc.firstLabel();
|
||||
if (targetIndex < 0) {
|
||||
// Before first arc.
|
||||
return backtrackToFloorArc(arc, targetLabel, in);
|
||||
} else if (targetIndex >= arc.numArcs()) {
|
||||
// After last arc.
|
||||
fst.readLastArcByContinuous(arc, in);
|
||||
assert arc.label() < targetLabel;
|
||||
assert arc.isLast();
|
||||
pushLast();
|
||||
return null;
|
||||
} else {
|
||||
// Within label range.
|
||||
fst.readArcByContinuous(arc, in, targetIndex);
|
||||
assert arc.label() == targetLabel;
|
||||
// found -- copy pasta from below
|
||||
output[upto] = fst.outputs.add(output[upto - 1], arc.output());
|
||||
if (targetLabel == FST.END_LABEL) {
|
||||
return null;
|
||||
}
|
||||
setCurrentLabel(arc.label());
|
||||
incr();
|
||||
return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
|
||||
}
|
||||
}
|
||||
|
||||
private FST.Arc<T> doSeekFloorArrayDirectAddressing(
|
||||
FST.Arc<T> arc, int targetLabel, FST.BytesReader in) throws IOException {
|
||||
// The array is addressed directly by label, with presence bits to compute the actual arc
|
||||
|
@ -383,6 +426,28 @@ abstract class FSTEnum<T> {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Target is beyond the last arc, out of label range. Dead end (target is after the last arc);
|
||||
* rollback to last fork then push
|
||||
*/
|
||||
private void rollbackToLastForkThenPush() throws IOException {
|
||||
upto--;
|
||||
while (true) {
|
||||
if (upto == 0) {
|
||||
return;
|
||||
}
|
||||
final FST.Arc<T> prevArc = getArc(upto);
|
||||
// System.out.println(" rollback upto=" + upto + " arc.label=" + prevArc.label + "
|
||||
// isLast?=" + prevArc.isLast());
|
||||
if (!prevArc.isLast()) {
|
||||
fst.readNextArc(prevArc, fstReader);
|
||||
pushFirst();
|
||||
return;
|
||||
}
|
||||
upto--;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Backtracks until it finds a node whose first arc is before our target label. Then on the node,
|
||||
* finds the arc just before the targetLabel.
|
||||
|
@ -400,9 +465,11 @@ abstract class FSTEnum<T> {
|
|||
if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) {
|
||||
if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
|
||||
findNextFloorArcBinarySearch(arc, targetLabel, in);
|
||||
} else {
|
||||
assert arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING;
|
||||
} else if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
|
||||
findNextFloorArcDirectAddressing(arc, targetLabel, in);
|
||||
} else {
|
||||
assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS;
|
||||
findNextFloorArcContinuous(arc, targetLabel, in);
|
||||
}
|
||||
} else {
|
||||
while (!arc.isLast() && fst.readNextArcLabel(arc, in) < targetLabel) {
|
||||
|
@ -452,6 +519,24 @@ abstract class FSTEnum<T> {
|
|||
}
|
||||
}
|
||||
|
||||
/** Same as {@link #findNextFloorArcDirectAddressing} for continuous node. */
|
||||
private void findNextFloorArcContinuous(FST.Arc<T> arc, int targetLabel, final FST.BytesReader in)
|
||||
throws IOException {
|
||||
assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS;
|
||||
assert arc.label() != FST.END_LABEL;
|
||||
assert arc.label() == arc.firstLabel();
|
||||
if (arc.numArcs() > 1) {
|
||||
int targetIndex = targetLabel - arc.firstLabel();
|
||||
assert targetIndex >= 0;
|
||||
if (targetIndex >= arc.numArcs()) {
|
||||
// Beyond last arc. Take last arc.
|
||||
fst.readLastArcByContinuous(arc, in);
|
||||
} else {
|
||||
fst.readArcByContinuous(arc, in, targetIndex - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Same as {@link #findNextFloorArcDirectAddressing} for binary search node. */
|
||||
private void findNextFloorArcBinarySearch(FST.Arc<T> arc, int targetLabel, FST.BytesReader in)
|
||||
throws IOException {
|
||||
|
|
|
@ -408,6 +408,12 @@ final class NodeHash<T> {
|
|||
return -1;
|
||||
}
|
||||
break;
|
||||
case FST.ARCS_FOR_CONTINUOUS:
|
||||
if ((node.arcs[node.numArcs - 1].label - node.arcs[0].label + 1)
|
||||
!= scratchArc.numArcs()) {
|
||||
return -1;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw new AssertionError("unhandled scratchArc.nodeFlag() " + scratchArc.nodeFlags());
|
||||
}
|
||||
|
|
|
@ -854,6 +854,17 @@ public final class Util {
|
|||
}
|
||||
return arc;
|
||||
}
|
||||
} else if (arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS) {
|
||||
int targetIndex = label - arc.label();
|
||||
if (targetIndex >= arc.numArcs()) {
|
||||
return null;
|
||||
} else if (targetIndex < 0) {
|
||||
return arc;
|
||||
} else {
|
||||
fst.readArcByContinuous(arc, in, targetIndex);
|
||||
assert arc.label() == label;
|
||||
return arc;
|
||||
}
|
||||
}
|
||||
// Fixed length arcs in a binary search node.
|
||||
int idx = binarySearch(fst, arc, label);
|
||||
|
|
|
@ -137,7 +137,9 @@ public class TestFSTDirectAddressing extends LuceneTestCase {
|
|||
directAddressingMemoryIncreasePercent));
|
||||
System.out.println("num nodes = " + fstCompiler.nodeCount);
|
||||
long fixedLengthArcNodeCount =
|
||||
fstCompiler.directAddressingNodeCount + fstCompiler.binarySearchNodeCount;
|
||||
fstCompiler.directAddressingNodeCount
|
||||
+ fstCompiler.binarySearchNodeCount
|
||||
+ fstCompiler.continuousNodeCount;
|
||||
System.out.println(
|
||||
"num fixed-length-arc nodes = "
|
||||
+ fixedLengthArcNodeCount
|
||||
|
@ -161,6 +163,13 @@ public class TestFSTDirectAddressing extends LuceneTestCase {
|
|||
((double) (fstCompiler.directAddressingNodeCount)
|
||||
/ fixedLengthArcNodeCount
|
||||
* 100)));
|
||||
System.out.println(
|
||||
"num continuous-arcs nodes = "
|
||||
+ (fstCompiler.continuousNodeCount)
|
||||
+ String.format(
|
||||
Locale.ENGLISH,
|
||||
" (%.2f %% of fixed-length-arc nodes)",
|
||||
((double) (fstCompiler.continuousNodeCount) / fixedLengthArcNodeCount * 100)));
|
||||
}
|
||||
|
||||
private static FSTCompiler<Object> createFSTCompiler(float directAddressingMaxOversizingFactor) {
|
||||
|
@ -211,18 +220,25 @@ public class TestFSTDirectAddressing extends LuceneTestCase {
|
|||
DataInput in = new ByteArrayDataInput(buf);
|
||||
FST<BytesRef> fst = new FST<>(in, in, ByteSequenceOutputs.getSingleton());
|
||||
BytesRefFSTEnum<BytesRef> fstEnum = new BytesRefFSTEnum<>(fst);
|
||||
int binarySearchArcCount = 0, directAddressingArcCount = 0, listArcCount = 0;
|
||||
int binarySearchArcCount = 0,
|
||||
directAddressingArcCount = 0,
|
||||
listArcCount = 0,
|
||||
continuousArcCount = 0;
|
||||
while (fstEnum.next() != null) {
|
||||
if (fstEnum.arcs[fstEnum.upto].bytesPerArc() == 0) {
|
||||
listArcCount++;
|
||||
} else if (fstEnum.arcs[fstEnum.upto].nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
|
||||
directAddressingArcCount++;
|
||||
} else if (fstEnum.arcs[fstEnum.upto].nodeFlags() == FST.ARCS_FOR_CONTINUOUS) {
|
||||
continuousArcCount++;
|
||||
} else {
|
||||
binarySearchArcCount++;
|
||||
}
|
||||
}
|
||||
System.out.println(
|
||||
"direct addressing arcs = "
|
||||
"continuous arcs = "
|
||||
+ continuousArcCount
|
||||
+ ", direct addressing arcs = "
|
||||
+ directAddressingArcCount
|
||||
+ ", binary search arcs = "
|
||||
+ binarySearchArcCount
|
||||
|
|
|
@ -43,6 +43,26 @@ public class TestUtil extends LuceneTestCase {
|
|||
assertEquals(-7, Util.binarySearch(fst, arc, 'P'));
|
||||
}
|
||||
|
||||
public void testContinuous() throws Exception {
|
||||
List<String> letters = Arrays.asList("A", "B", "C", "D", "E", "F", "G", "H");
|
||||
FST<Object> fst = buildFST(letters, true, false);
|
||||
FST.Arc<Object> first = fst.getFirstArc(new FST.Arc<>());
|
||||
FST.Arc<Object> arc = new FST.Arc<>();
|
||||
FST.BytesReader in = fst.getBytesReader();
|
||||
|
||||
for (String letter : letters) {
|
||||
char c = letter.charAt(0);
|
||||
arc = Util.readCeilArc(c, fst, first, arc, in);
|
||||
assertNotNull(arc);
|
||||
assertEquals(c, arc.label());
|
||||
}
|
||||
|
||||
// in the middle
|
||||
assertEquals('F', Util.readCeilArc('F', fst, first, arc, in).label());
|
||||
// no following arcs
|
||||
assertNull(Util.readCeilArc('A', fst, arc, arc, in));
|
||||
}
|
||||
|
||||
public void testReadCeilArcPackedArray() throws Exception {
|
||||
List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T", "z");
|
||||
verifyReadCeilArc(letters, true, false);
|
||||
|
|
Loading…
Reference in New Issue