Specialize arc store for continuous label in FST (#12748)

* init

* review fix and reuse duplicate code

* rebase

* tidy

* CHANGES.txt

* bump version

* rebase

* CHANGES.txt
This commit is contained in:
Zhang Chao 2023-11-09 19:01:32 +08:00 committed by GitHub
parent a71d64a598
commit 570832eb74
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 261 additions and 49 deletions

View File

@ -264,6 +264,8 @@ Optimizations
* GITHUB#12381: Skip docs with DocValues in NumericLeafComparator. (Lu Xugang, Adrien Grand) * GITHUB#12381: Skip docs with DocValues in NumericLeafComparator. (Lu Xugang, Adrien Grand)
* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Chao Zhang)
Changes in runtime behavior Changes in runtime behavior
--------------------- ---------------------

View File

@ -86,8 +86,11 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
*/ */
public static final int VERSION_MSB_VLONG_OUTPUT = 1; public static final int VERSION_MSB_VLONG_OUTPUT = 1;
/** The version that specializes arc storage for continuous labels in FST. */
public static final int VERSION_FST_CONTINUOUS_ARCS = 2;
/** Current terms format. */ /** Current terms format. */
public static final int VERSION_CURRENT = VERSION_MSB_VLONG_OUTPUT; public static final int VERSION_CURRENT = VERSION_FST_CONTINUOUS_ARCS;
/** Extension of terms index file */ /** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tip"; static final String TERMS_INDEX_EXTENSION = "tip";

View File

@ -98,11 +98,19 @@ public final class FST<T> implements Accountable {
*/ */
static final byte ARCS_FOR_DIRECT_ADDRESSING = 1 << 6; static final byte ARCS_FOR_DIRECT_ADDRESSING = 1 << 6;
/**
* Value of the arc flags to declare a node with continuous arcs, designed so that an arc can be
* positioned directly at index (label - firstLabel). Like {@link #ARCS_FOR_BINARY_SEARCH}, we use
* flag combinations that will not occur at the same time.
*/
static final byte ARCS_FOR_CONTINUOUS = ARCS_FOR_DIRECT_ADDRESSING + ARCS_FOR_BINARY_SEARCH;
// Increment version to change it // Increment version to change it
private static final String FILE_FORMAT_NAME = "FST"; private static final String FILE_FORMAT_NAME = "FST";
private static final int VERSION_START = 6; private static final int VERSION_START = 6;
private static final int VERSION_LITTLE_ENDIAN = 8; private static final int VERSION_LITTLE_ENDIAN = 8;
static final int VERSION_CURRENT = VERSION_LITTLE_ENDIAN; private static final int VERSION_CONTINUOUS_ARCS = 9;
static final int VERSION_CURRENT = VERSION_CONTINUOUS_ARCS;
// Never serialized; just used to represent the virtual // Never serialized; just used to represent the virtual
// final node w/ no arcs: // final node w/ no arcs:
@ -243,7 +251,10 @@ public final class FST<T> implements Accountable {
.append(numArcs()) .append(numArcs())
.append(")") .append(")")
.append("(") .append("(")
.append(nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING ? "da" : "bs") .append(
nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING
? "da"
: nodeFlags() == ARCS_FOR_CONTINUOUS ? "cs" : "bs")
.append(")"); .append(")");
} }
return b.toString(); return b.toString();
@ -285,8 +296,8 @@ public final class FST<T> implements Accountable {
/** /**
* Node header flags. Only meaningful to check if the value is either {@link * Node header flags. Only meaningful to check if the value is either {@link
* #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} (other value when bytesPerArc * #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} or {@link
* == 0). * #ARCS_FOR_CONTINUOUS} (other value when bytesPerArc == 0).
*/ */
public byte nodeFlags() { public byte nodeFlags() {
return nodeFlags; return nodeFlags;
@ -318,7 +329,7 @@ public final class FST<T> implements Accountable {
/** /**
* First label of a direct addressing node. Only valid if nodeFlags == {@link * First label of a direct addressing node. Only valid if nodeFlags == {@link
* #ARCS_FOR_DIRECT_ADDRESSING}. * #ARCS_FOR_DIRECT_ADDRESSING} or {@link #ARCS_FOR_CONTINUOUS}.
*/ */
int firstLabel() { int firstLabel() {
return firstLabel; return firstLabel;
@ -653,7 +664,9 @@ public final class FST<T> implements Accountable {
} else { } else {
in.setPosition(follow.target()); in.setPosition(follow.target());
byte flags = arc.nodeFlags = in.readByte(); byte flags = arc.nodeFlags = in.readByte();
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) { if (flags == ARCS_FOR_BINARY_SEARCH
|| flags == ARCS_FOR_DIRECT_ADDRESSING
|| flags == ARCS_FOR_CONTINUOUS) {
// Special arc which is actually a node header for fixed length arcs. // Special arc which is actually a node header for fixed length arcs.
// Jump straight to end to find the last arc. // Jump straight to end to find the last arc.
arc.numArcs = in.readVInt(); arc.numArcs = in.readVInt();
@ -664,10 +677,14 @@ public final class FST<T> implements Accountable {
arc.firstLabel = readLabel(in); arc.firstLabel = readLabel(in);
arc.posArcsStart = in.getPosition(); arc.posArcsStart = in.getPosition();
readLastArcByDirectAddressing(arc, in); readLastArcByDirectAddressing(arc, in);
} else { } else if (flags == ARCS_FOR_BINARY_SEARCH) {
arc.arcIdx = arc.numArcs() - 2; arc.arcIdx = arc.numArcs() - 2;
arc.posArcsStart = in.getPosition(); arc.posArcsStart = in.getPosition();
readNextRealArc(arc, in); readNextRealArc(arc, in);
} else {
arc.firstLabel = readLabel(in);
arc.posArcsStart = in.getPosition();
readLastArcByContinuous(arc, in);
} }
} else { } else {
arc.flags = flags; arc.flags = flags;
@ -740,7 +757,9 @@ public final class FST<T> implements Accountable {
in.setPosition(nodeAddress); in.setPosition(nodeAddress);
byte flags = arc.nodeFlags = in.readByte(); byte flags = arc.nodeFlags = in.readByte();
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) { if (flags == ARCS_FOR_BINARY_SEARCH
|| flags == ARCS_FOR_DIRECT_ADDRESSING
|| flags == ARCS_FOR_CONTINUOUS) {
// Special arc which is actually a node header for fixed length arcs. // Special arc which is actually a node header for fixed length arcs.
arc.numArcs = in.readVInt(); arc.numArcs = in.readVInt();
arc.bytesPerArc = in.readVInt(); arc.bytesPerArc = in.readVInt();
@ -749,6 +768,8 @@ public final class FST<T> implements Accountable {
readPresenceBytes(arc, in); readPresenceBytes(arc, in);
arc.firstLabel = readLabel(in); arc.firstLabel = readLabel(in);
arc.presenceIndex = -1; arc.presenceIndex = -1;
} else if (flags == ARCS_FOR_CONTINUOUS) {
arc.firstLabel = readLabel(in);
} }
arc.posArcsStart = in.getPosition(); arc.posArcsStart = in.getPosition();
} else { } else {
@ -773,7 +794,9 @@ public final class FST<T> implements Accountable {
} else { } else {
in.setPosition(follow.target()); in.setPosition(follow.target());
byte flags = in.readByte(); byte flags = in.readByte();
return flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING; return flags == ARCS_FOR_BINARY_SEARCH
|| flags == ARCS_FOR_DIRECT_ADDRESSING
|| flags == ARCS_FOR_CONTINUOUS;
} }
} }
@ -801,16 +824,18 @@ public final class FST<T> implements Accountable {
in.setPosition(arc.nextArc()); in.setPosition(arc.nextArc());
byte flags = in.readByte(); byte flags = in.readByte();
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) { if (flags == ARCS_FOR_BINARY_SEARCH
|| flags == ARCS_FOR_DIRECT_ADDRESSING
|| flags == ARCS_FOR_CONTINUOUS) {
// System.out.println(" nextArc fixed length arc"); // System.out.println(" nextArc fixed length arc");
// Special arc which is actually a node header for fixed length arcs. // Special arc which is actually a node header for fixed length arcs.
int numArcs = in.readVInt(); int numArcs = in.readVInt();
in.readVInt(); // Skip bytesPerArc. in.readVInt(); // Skip bytesPerArc.
if (flags == ARCS_FOR_BINARY_SEARCH) { if (flags == ARCS_FOR_BINARY_SEARCH) {
in.readByte(); // Skip arc flags. in.readByte(); // Skip arc flags.
} else { } else if (flags == ARCS_FOR_DIRECT_ADDRESSING) {
in.skipBytes(getNumPresenceBytes(numArcs)); in.skipBytes(getNumPresenceBytes(numArcs));
} } // Nothing to do for ARCS_FOR_CONTINUOUS
} }
} else { } else {
switch (arc.nodeFlags()) { switch (arc.nodeFlags()) {
@ -826,6 +851,8 @@ public final class FST<T> implements Accountable {
int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in); int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in);
assert nextIndex != -1; assert nextIndex != -1;
return arc.firstLabel() + nextIndex; return arc.firstLabel() + nextIndex;
case ARCS_FOR_CONTINUOUS:
return arc.firstLabel() + arc.arcIdx() + 1;
default: default:
// Variable length arcs - linear search. // Variable length arcs - linear search.
assert arc.bytesPerArc() == 0; assert arc.bytesPerArc() == 0;
@ -849,6 +876,20 @@ public final class FST<T> implements Accountable {
return readArc(arc, in); return readArc(arc, in);
} }
/**
 * Jumps to one arc of a continuous node and decodes it.
 *
 * <p>The arc position is computed as {@code posArcsStart - rangeIndex * bytesPerArc}; the flags
 * byte found there is stored on the arc before the rest is decoded by {@code readArc}.
 *
 * @param arc the arc to fill in; its node header fields must already be set
 * @param in the reader used to access the FST bytes
 * @param rangeIndex The index of the arc in the label range. It must be within the label range.
 * @return the arc as returned by {@code readArc}
 */
public Arc<T> readArcByContinuous(Arc<T> arc, final BytesReader in, int rangeIndex)
    throws IOException {
  assert rangeIndex >= 0;
  assert rangeIndex < arc.numArcs();
  // Widen before multiplying so large nodes cannot overflow an int.
  final long offsetFromStart = (long) rangeIndex * arc.bytesPerArc();
  in.setPosition(arc.posArcsStart() - offsetFromStart);
  arc.arcIdx = rangeIndex;
  arc.flags = in.readByte();
  return readArc(arc, in);
}
/** /**
* Reads a present direct addressing node arc, with the provided index in the label range. * Reads a present direct addressing node arc, with the provided index in the label range.
* *
@ -888,6 +929,11 @@ public final class FST<T> implements Accountable {
return readArcByDirectAddressing(arc, in, arc.numArcs() - 1, presenceIndex); return readArcByDirectAddressing(arc, in, arc.numArcs() - 1, presenceIndex);
} }
/**
 * Reads the last arc of a continuous node, i.e. the arc at index {@code numArcs - 1}.
 *
 * @param arc the arc to fill in; its node header fields must already be set
 * @param in the reader used to access the FST bytes
 * @return the arc as returned by {@link #readArcByContinuous}
 */
public Arc<T> readLastArcByContinuous(Arc<T> arc, final BytesReader in) throws IOException {
  final int lastIndex = arc.numArcs() - 1;
  return readArcByContinuous(arc, in, lastIndex);
}
/** Never returns null, but you should never call this if arc.isLast() is true. */ /** Never returns null, but you should never call this if arc.isLast() is true. */
public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException { public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException {
@ -896,6 +942,7 @@ public final class FST<T> implements Accountable {
switch (arc.nodeFlags()) { switch (arc.nodeFlags()) {
case ARCS_FOR_BINARY_SEARCH: case ARCS_FOR_BINARY_SEARCH:
case ARCS_FOR_CONTINUOUS:
assert arc.bytesPerArc() > 0; assert arc.bytesPerArc() > 0;
arc.arcIdx++; arc.arcIdx++;
assert arc.arcIdx() >= 0 && arc.arcIdx() < arc.numArcs(); assert arc.arcIdx() >= 0 && arc.arcIdx() < arc.numArcs();
@ -924,7 +971,7 @@ public final class FST<T> implements Accountable {
* positioned just after the arc flags byte. * positioned just after the arc flags byte.
*/ */
private Arc<T> readArc(Arc<T> arc, BytesReader in) throws IOException { private Arc<T> readArc(Arc<T> arc, BytesReader in) throws IOException {
if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING) { if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING || arc.nodeFlags() == ARCS_FOR_CONTINUOUS) {
arc.label = arc.firstLabel() + arc.arcIdx(); arc.label = arc.firstLabel() + arc.arcIdx();
} else { } else {
arc.label = readLabel(in); arc.label = readLabel(in);
@ -1067,6 +1114,17 @@ public final class FST<T> implements Accountable {
} }
} }
return null; return null;
} else if (flags == ARCS_FOR_CONTINUOUS) {
arc.numArcs = in.readVInt();
arc.bytesPerArc = in.readVInt();
arc.firstLabel = readLabel(in);
arc.posArcsStart = in.getPosition();
int arcIndex = labelToMatch - arc.firstLabel();
if (arcIndex < 0 || arcIndex >= arc.numArcs()) {
return null; // Before or after label range.
}
arc.arcIdx = arcIndex - 1;
return readNextRealArc(arc, in);
} }
// Linear scan // Linear scan

View File

@ -17,6 +17,7 @@
package org.apache.lucene.util.fst; package org.apache.lucene.util.fst;
import static org.apache.lucene.util.fst.FST.ARCS_FOR_BINARY_SEARCH; import static org.apache.lucene.util.fst.FST.ARCS_FOR_BINARY_SEARCH;
import static org.apache.lucene.util.fst.FST.ARCS_FOR_CONTINUOUS;
import static org.apache.lucene.util.fst.FST.ARCS_FOR_DIRECT_ADDRESSING; import static org.apache.lucene.util.fst.FST.ARCS_FOR_DIRECT_ADDRESSING;
import static org.apache.lucene.util.fst.FST.BIT_ARC_HAS_FINAL_OUTPUT; import static org.apache.lucene.util.fst.FST.BIT_ARC_HAS_FINAL_OUTPUT;
import static org.apache.lucene.util.fst.FST.BIT_ARC_HAS_OUTPUT; import static org.apache.lucene.util.fst.FST.BIT_ARC_HAS_OUTPUT;
@ -113,6 +114,7 @@ public class FSTCompiler<T> {
long nodeCount; long nodeCount;
long binarySearchNodeCount; long binarySearchNodeCount;
long directAddressingNodeCount; long directAddressingNodeCount;
long continuousNodeCount;
final boolean allowFixedLengthArcs; final boolean allowFixedLengthArcs;
final float directAddressingMaxOversizingFactor; final float directAddressingMaxOversizingFactor;
@ -445,9 +447,15 @@ public class FSTCompiler<T> {
int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1; int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
assert labelRange > 0; assert labelRange > 0;
if (shouldExpandNodeWithDirectAddressing( boolean continuousLabel = labelRange == nodeIn.numArcs;
if (continuousLabel) {
writeNodeForDirectAddressingOrContinuous(
nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange, true);
continuousNodeCount++;
} else if (shouldExpandNodeWithDirectAddressing(
nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) { nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) {
writeNodeForDirectAddressing(nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange); writeNodeForDirectAddressingOrContinuous(
nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange, false);
directAddressingNodeCount++; directAddressingNodeCount++;
} else { } else {
writeNodeForBinarySearch(nodeIn, startAddress, maxBytesPerArc); writeNodeForBinarySearch(nodeIn, startAddress, maxBytesPerArc);
@ -578,18 +586,19 @@ public class FSTCompiler<T> {
bytes.writeBytes(startAddress, fixedLengthArcsBuffer.getBytes(), 0, headerLen); bytes.writeBytes(startAddress, fixedLengthArcsBuffer.getBytes(), 0, headerLen);
} }
private void writeNodeForDirectAddressing( private void writeNodeForDirectAddressingOrContinuous(
FSTCompiler.UnCompiledNode<T> nodeIn, FSTCompiler.UnCompiledNode<T> nodeIn,
long startAddress, long startAddress,
int maxBytesPerArcWithoutLabel, int maxBytesPerArcWithoutLabel,
int labelRange) { int labelRange,
boolean continuous) {
// Expand the arcs backwards in a buffer because we remove the labels. // Expand the arcs backwards in a buffer because we remove the labels.
// So the obtained arcs might occupy less space. This is the reason why this // So the obtained arcs might occupy less space. This is the reason why this
// whole method is more complex. // whole method is more complex.
// Drop the label bytes since we can infer the label based on the arc index, // Drop the label bytes since we can infer the label based on the arc index,
// the presence bits, and the first label. Keep the first label. // the presence bits, and the first label. Keep the first label.
int headerMaxLen = 11; int headerMaxLen = 11;
int numPresenceBytes = getNumPresenceBytes(labelRange); int numPresenceBytes = continuous ? 0 : getNumPresenceBytes(labelRange);
long srcPos = bytes.getPosition(); long srcPos = bytes.getPosition();
int totalArcBytes = numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel; int totalArcBytes = numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel;
int bufferOffset = headerMaxLen + numPresenceBytes + totalArcBytes; int bufferOffset = headerMaxLen + numPresenceBytes + totalArcBytes;
@ -620,7 +629,7 @@ public class FSTCompiler<T> {
// metadata. // metadata.
fixedLengthArcsBuffer fixedLengthArcsBuffer
.resetPosition() .resetPosition()
.writeByte(ARCS_FOR_DIRECT_ADDRESSING) .writeByte(continuous ? ARCS_FOR_CONTINUOUS : ARCS_FOR_DIRECT_ADDRESSING)
.writeVInt(labelRange) // labelRange instead of numArcs. .writeVInt(labelRange) // labelRange instead of numArcs.
.writeVInt( .writeVInt(
maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc. maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc.
@ -642,8 +651,10 @@ public class FSTCompiler<T> {
writeOffset += headerLen; writeOffset += headerLen;
// Write the presence bits // Write the presence bits
writePresenceBits(nodeIn, writeOffset, numPresenceBytes); if (continuous == false) {
writeOffset += numPresenceBytes; writePresenceBits(nodeIn, writeOffset, numPresenceBytes);
writeOffset += numPresenceBytes;
}
// Write the first label and the arcs. // Write the first label and the arcs.
bytes.writeBytes(writeOffset, fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes); bytes.writeBytes(writeOffset, fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes);

View File

@ -149,9 +149,11 @@ abstract class FSTEnum<T> {
final FST.BytesReader in = fst.getBytesReader(); final FST.BytesReader in = fst.getBytesReader();
if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) { if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
arc = doSeekCeilArrayDirectAddressing(arc, targetLabel, in); arc = doSeekCeilArrayDirectAddressing(arc, targetLabel, in);
} else { } else if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH;
arc = doSeekCeilArrayPacked(arc, targetLabel, in); arc = doSeekCeilArrayPacked(arc, targetLabel, in);
} else {
assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS;
arc = doSeekCeilArrayContinuous(arc, targetLabel, in);
} }
} else { } else {
arc = doSeekCeilList(arc, targetLabel); arc = doSeekCeilList(arc, targetLabel);
@ -159,6 +161,33 @@ abstract class FSTEnum<T> {
} }
} }
/**
 * Ceiling seek step on a continuous node: the arc for a label is located directly at index
 * {@code targetLabel - firstLabel}, so no search is needed.
 *
 * @param arc the current arc, positioned on a continuous node
 * @param targetLabel the label being sought
 * @param in the reader used to access the FST bytes
 * @return the first arc of the matched arc's target node when the label is found and is not
 *     {@link FST#END_LABEL}; {@code null} in every other case
 */
private FST.Arc<T> doSeekCeilArrayContinuous(
    final FST.Arc<T> arc, final int targetLabel, final FST.BytesReader in) throws IOException {
  final int offsetFromFirst = targetLabel - arc.firstLabel();

  // Target sorts after every arc of this node: dead end.
  if (offsetFromFirst >= arc.numArcs()) {
    rollbackToLastForkThenPush();
    return null;
  }

  // Target sorts before every arc of this node: the first arc is the ceiling.
  if (offsetFromFirst < 0) {
    fst.readArcByContinuous(arc, in, 0);
    assert arc.label() > targetLabel;
    pushFirst();
    return null;
  }

  // Exact match inside the label range.
  fst.readArcByContinuous(arc, in, offsetFromFirst);
  assert arc.label() == targetLabel;
  output[upto] = fst.outputs.add(output[upto - 1], arc.output());
  if (targetLabel == FST.END_LABEL) {
    return null;
  }
  setCurrentLabel(arc.label());
  incr();
  return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
}
private FST.Arc<T> doSeekCeilArrayDirectAddressing( private FST.Arc<T> doSeekCeilArrayDirectAddressing(
final FST.Arc<T> arc, final int targetLabel, final FST.BytesReader in) throws IOException { final FST.Arc<T> arc, final int targetLabel, final FST.BytesReader in) throws IOException {
// The array is addressed directly by label, with presence bits to compute the actual arc // The array is addressed directly by label, with presence bits to compute the actual arc
@ -166,24 +195,8 @@ abstract class FSTEnum<T> {
int targetIndex = targetLabel - arc.firstLabel(); int targetIndex = targetLabel - arc.firstLabel();
if (targetIndex >= arc.numArcs()) { if (targetIndex >= arc.numArcs()) {
// Target is beyond the last arc, out of label range. rollbackToLastForkThenPush();
// Dead end (target is after the last arc); return null;
// rollback to last fork then push
upto--;
while (true) {
if (upto == 0) {
return null;
}
final FST.Arc<T> prevArc = getArc(upto);
// System.out.println(" rollback upto=" + upto + " arc.label=" + prevArc.label + "
// isLast?=" + prevArc.isLast());
if (!prevArc.isLast()) {
fst.readNextArc(prevArc, fstReader);
pushFirst();
return null;
}
upto--;
}
} else { } else {
if (targetIndex < 0) { if (targetIndex < 0) {
targetIndex = -1; targetIndex = -1;
@ -332,9 +345,11 @@ abstract class FSTEnum<T> {
final FST.BytesReader in = fst.getBytesReader(); final FST.BytesReader in = fst.getBytesReader();
if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) { if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
arc = doSeekFloorArrayDirectAddressing(arc, targetLabel, in); arc = doSeekFloorArrayDirectAddressing(arc, targetLabel, in);
} else { } else if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH;
arc = doSeekFloorArrayPacked(arc, targetLabel, in); arc = doSeekFloorArrayPacked(arc, targetLabel, in);
} else {
assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS;
arc = doSeekFloorContinuous(arc, targetLabel, in);
} }
} else { } else {
arc = doSeekFloorList(arc, targetLabel); arc = doSeekFloorList(arc, targetLabel);
@ -342,6 +357,34 @@ abstract class FSTEnum<T> {
} }
} }
/**
 * Floor seek step on a continuous node: the arc for a label is located directly at index
 * {@code targetLabel - firstLabel}, so no search is needed.
 *
 * @param arc the current arc, positioned on a continuous node
 * @param targetLabel the label being sought
 * @param in the reader used to access the FST bytes
 * @return the result of {@code backtrackToFloorArc} when the target precedes the node's first
 *     label; the first arc of the matched arc's target node when the label is found and is not
 *     {@link FST#END_LABEL}; {@code null} otherwise
 */
private FST.Arc<T> doSeekFloorContinuous(FST.Arc<T> arc, int targetLabel, FST.BytesReader in)
    throws IOException {
  final int offsetFromFirst = targetLabel - arc.firstLabel();

  // Target sorts before the first arc of this node.
  if (offsetFromFirst < 0) {
    return backtrackToFloorArc(arc, targetLabel, in);
  }

  // Target sorts after the last arc of this node: the last arc is the floor.
  if (offsetFromFirst >= arc.numArcs()) {
    fst.readLastArcByContinuous(arc, in);
    assert arc.label() < targetLabel;
    assert arc.isLast();
    pushLast();
    return null;
  }

  // Exact match inside the label range.
  fst.readArcByContinuous(arc, in, offsetFromFirst);
  assert arc.label() == targetLabel;
  output[upto] = fst.outputs.add(output[upto - 1], arc.output());
  if (targetLabel == FST.END_LABEL) {
    return null;
  }
  setCurrentLabel(arc.label());
  incr();
  return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
}
private FST.Arc<T> doSeekFloorArrayDirectAddressing( private FST.Arc<T> doSeekFloorArrayDirectAddressing(
FST.Arc<T> arc, int targetLabel, FST.BytesReader in) throws IOException { FST.Arc<T> arc, int targetLabel, FST.BytesReader in) throws IOException {
// The array is addressed directly by label, with presence bits to compute the actual arc // The array is addressed directly by label, with presence bits to compute the actual arc
@ -383,6 +426,28 @@ abstract class FSTEnum<T> {
} }
} }
/**
 * Handles a dead end where the target lies after the last arc of the current node (out of label
 * range). Walks back up the arc stack until it finds an arc that is not the last one of its
 * node, advances that arc to its next sibling and calls {@code pushFirst()}. If the stack is
 * unwound down to {@code upto == 0} first, it returns without pushing anything.
 */
private void rollbackToLastForkThenPush() throws IOException {
  for (upto--; upto != 0; upto--) {
    final FST.Arc<T> candidate = getArc(upto);
    if (!candidate.isLast()) {
      // Found the last fork: step to the next sibling and descend from there.
      fst.readNextArc(candidate, fstReader);
      pushFirst();
      return;
    }
  }
}
/** /**
* Backtracks until it finds a node which first arc is before our target label.` Then on the node, * Backtracks until it finds a node which first arc is before our target label.` Then on the node,
* finds the arc just before the targetLabel. * finds the arc just before the targetLabel.
@ -400,9 +465,11 @@ abstract class FSTEnum<T> {
if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) { if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) {
if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) { if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
findNextFloorArcBinarySearch(arc, targetLabel, in); findNextFloorArcBinarySearch(arc, targetLabel, in);
} else { } else if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
assert arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING;
findNextFloorArcDirectAddressing(arc, targetLabel, in); findNextFloorArcDirectAddressing(arc, targetLabel, in);
} else {
assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS;
findNextFloorArcContinuous(arc, targetLabel, in);
} }
} else { } else {
while (!arc.isLast() && fst.readNextArcLabel(arc, in) < targetLabel) { while (!arc.isLast() && fst.readNextArcLabel(arc, in) < targetLabel) {
@ -452,6 +519,24 @@ abstract class FSTEnum<T> {
} }
} }
/**
 * Same as {@link #findNextFloorArcDirectAddressing} for continuous node.
 *
 * <p>Expects {@code arc} to be the first arc of a continuous node. Moves it to the last arc
 * when the target is beyond the label range, otherwise to the arc just before the target's
 * index. A node with a single arc is left untouched.
 *
 * @param arc the first arc of a continuous node; repositioned in place
 * @param targetLabel the label whose preceding arc is wanted
 * @param in the reader used to access the FST bytes
 */
private void findNextFloorArcContinuous(FST.Arc<T> arc, int targetLabel, final FST.BytesReader in)
    throws IOException {
  assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS;
  assert arc.label() != FST.END_LABEL;
  assert arc.label() == arc.firstLabel();
  if (arc.numArcs() <= 1) {
    // Only one arc on this node: nothing to move to.
    return;
  }
  final int offsetFromFirst = targetLabel - arc.firstLabel();
  assert offsetFromFirst >= 0;
  if (offsetFromFirst < arc.numArcs()) {
    // Within range: take the arc immediately before the target's slot.
    fst.readArcByContinuous(arc, in, offsetFromFirst - 1);
  } else {
    // Beyond last arc. Take last arc.
    fst.readLastArcByContinuous(arc, in);
  }
}
/** Same as {@link #findNextFloorArcDirectAddressing} for binary search node. */ /** Same as {@link #findNextFloorArcDirectAddressing} for binary search node. */
private void findNextFloorArcBinarySearch(FST.Arc<T> arc, int targetLabel, FST.BytesReader in) private void findNextFloorArcBinarySearch(FST.Arc<T> arc, int targetLabel, FST.BytesReader in)
throws IOException { throws IOException {

View File

@ -408,6 +408,12 @@ final class NodeHash<T> {
return -1; return -1;
} }
break; break;
case FST.ARCS_FOR_CONTINUOUS:
if ((node.arcs[node.numArcs - 1].label - node.arcs[0].label + 1)
!= scratchArc.numArcs()) {
return -1;
}
break;
default: default:
throw new AssertionError("unhandled scratchArc.nodeFlag() " + scratchArc.nodeFlags()); throw new AssertionError("unhandled scratchArc.nodeFlag() " + scratchArc.nodeFlags());
} }

View File

@ -854,6 +854,17 @@ public final class Util {
} }
return arc; return arc;
} }
} else if (arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS) {
int targetIndex = label - arc.label();
if (targetIndex >= arc.numArcs()) {
return null;
} else if (targetIndex < 0) {
return arc;
} else {
fst.readArcByContinuous(arc, in, targetIndex);
assert arc.label() == label;
return arc;
}
} }
// Fixed length arcs in a binary search node. // Fixed length arcs in a binary search node.
int idx = binarySearch(fst, arc, label); int idx = binarySearch(fst, arc, label);

View File

@ -137,7 +137,9 @@ public class TestFSTDirectAddressing extends LuceneTestCase {
directAddressingMemoryIncreasePercent)); directAddressingMemoryIncreasePercent));
System.out.println("num nodes = " + fstCompiler.nodeCount); System.out.println("num nodes = " + fstCompiler.nodeCount);
long fixedLengthArcNodeCount = long fixedLengthArcNodeCount =
fstCompiler.directAddressingNodeCount + fstCompiler.binarySearchNodeCount; fstCompiler.directAddressingNodeCount
+ fstCompiler.binarySearchNodeCount
+ fstCompiler.continuousNodeCount;
System.out.println( System.out.println(
"num fixed-length-arc nodes = " "num fixed-length-arc nodes = "
+ fixedLengthArcNodeCount + fixedLengthArcNodeCount
@ -161,6 +163,13 @@ public class TestFSTDirectAddressing extends LuceneTestCase {
((double) (fstCompiler.directAddressingNodeCount) ((double) (fstCompiler.directAddressingNodeCount)
/ fixedLengthArcNodeCount / fixedLengthArcNodeCount
* 100))); * 100)));
System.out.println(
"num continuous-arcs nodes = "
+ (fstCompiler.continuousNodeCount)
+ String.format(
Locale.ENGLISH,
" (%.2f %% of fixed-length-arc nodes)",
((double) (fstCompiler.continuousNodeCount) / fixedLengthArcNodeCount * 100)));
} }
private static FSTCompiler<Object> createFSTCompiler(float directAddressingMaxOversizingFactor) { private static FSTCompiler<Object> createFSTCompiler(float directAddressingMaxOversizingFactor) {
@ -211,18 +220,25 @@ public class TestFSTDirectAddressing extends LuceneTestCase {
DataInput in = new ByteArrayDataInput(buf); DataInput in = new ByteArrayDataInput(buf);
FST<BytesRef> fst = new FST<>(in, in, ByteSequenceOutputs.getSingleton()); FST<BytesRef> fst = new FST<>(in, in, ByteSequenceOutputs.getSingleton());
BytesRefFSTEnum<BytesRef> fstEnum = new BytesRefFSTEnum<>(fst); BytesRefFSTEnum<BytesRef> fstEnum = new BytesRefFSTEnum<>(fst);
int binarySearchArcCount = 0, directAddressingArcCount = 0, listArcCount = 0; int binarySearchArcCount = 0,
directAddressingArcCount = 0,
listArcCount = 0,
continuousArcCount = 0;
while (fstEnum.next() != null) { while (fstEnum.next() != null) {
if (fstEnum.arcs[fstEnum.upto].bytesPerArc() == 0) { if (fstEnum.arcs[fstEnum.upto].bytesPerArc() == 0) {
listArcCount++; listArcCount++;
} else if (fstEnum.arcs[fstEnum.upto].nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) { } else if (fstEnum.arcs[fstEnum.upto].nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
directAddressingArcCount++; directAddressingArcCount++;
} else if (fstEnum.arcs[fstEnum.upto].nodeFlags() == FST.ARCS_FOR_CONTINUOUS) {
continuousArcCount++;
} else { } else {
binarySearchArcCount++; binarySearchArcCount++;
} }
} }
System.out.println( System.out.println(
"direct addressing arcs = " "continuous arcs = "
+ continuousArcCount
+ ", direct addressing arcs = "
+ directAddressingArcCount + directAddressingArcCount
+ ", binary search arcs = " + ", binary search arcs = "
+ binarySearchArcCount + binarySearchArcCount

View File

@ -43,6 +43,26 @@ public class TestUtil extends LuceneTestCase {
assertEquals(-7, Util.binarySearch(fst, arc, 'P')); assertEquals(-7, Util.binarySearch(fst, arc, 'P'));
} }
/**
 * Checks {@code Util.readCeilArc} on a node whose arc labels 'A'..'H' have no gaps: every label
 * must be found exactly, and a lookup following an arc with no outgoing arcs must return null.
 */
public void testContinuous() throws Exception {
  final List<String> letters = Arrays.asList("A", "B", "C", "D", "E", "F", "G", "H");
  final FST<Object> fst = buildFST(letters, true, false);
  final FST.Arc<Object> first = fst.getFirstArc(new FST.Arc<>());
  FST.Arc<Object> arc = new FST.Arc<>();
  final FST.BytesReader in = fst.getBytesReader();

  // every label in the range is an exact hit
  for (int i = 0; i < letters.size(); i++) {
    final char expected = letters.get(i).charAt(0);
    arc = Util.readCeilArc(expected, fst, first, arc, in);
    assertNotNull(arc);
    assertEquals(expected, arc.label());
  }

  // in the middle
  final FST.Arc<Object> middle = Util.readCeilArc('F', fst, first, arc, in);
  assertEquals('F', middle.label());

  // no following arcs
  assertNull(Util.readCeilArc('A', fst, arc, arc, in));
}
public void testReadCeilArcPackedArray() throws Exception { public void testReadCeilArcPackedArray() throws Exception {
List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T", "z"); List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T", "z");
verifyReadCeilArc(letters, true, false); verifyReadCeilArc(letters, true, false);