Specialize arc store for continuous label in FST (#12748)

* init

* review fix and reuse duplicate code

* rebase

* tidy

* CHANGES.txt

* bump version

* rebase

* CHANGES.txt
This commit is contained in:
Zhang Chao 2023-11-09 19:01:32 +08:00 committed by GitHub
parent a71d64a598
commit 570832eb74
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 261 additions and 49 deletions

View File

@ -264,6 +264,8 @@ Optimizations
* GITHUB#12381: Skip docs with DocValues in NumericLeafComparator. (Lu Xugang, Adrien Grand)
* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Chao Zhang)
Changes in runtime behavior
---------------------

View File

@ -86,8 +86,11 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
*/
public static final int VERSION_MSB_VLONG_OUTPUT = 1;
/** The version that specializes arc storage for continuous labels in FST. */
public static final int VERSION_FST_CONTINUOUS_ARCS = 2;
/** Current terms format. */
public static final int VERSION_CURRENT = VERSION_MSB_VLONG_OUTPUT;
public static final int VERSION_CURRENT = VERSION_FST_CONTINUOUS_ARCS;
/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tip";

View File

@ -98,11 +98,19 @@ public final class FST<T> implements Accountable {
*/
static final byte ARCS_FOR_DIRECT_ADDRESSING = 1 << 6;
/**
* Value of the arc flags to declare a node with continuous arcs, designed to locate an arc directly
* at index (label - firstLabel). Like {@link #ARCS_FOR_BINARY_SEARCH}, we use a flag combination
* that will not occur at the same time.
*/
static final byte ARCS_FOR_CONTINUOUS = ARCS_FOR_DIRECT_ADDRESSING + ARCS_FOR_BINARY_SEARCH;
// Increment version to change it
private static final String FILE_FORMAT_NAME = "FST";
private static final int VERSION_START = 6;
private static final int VERSION_LITTLE_ENDIAN = 8;
static final int VERSION_CURRENT = VERSION_LITTLE_ENDIAN;
private static final int VERSION_CONTINUOUS_ARCS = 9;
static final int VERSION_CURRENT = VERSION_CONTINUOUS_ARCS;
// Never serialized; just used to represent the virtual
// final node w/ no arcs:
@ -243,7 +251,10 @@ public final class FST<T> implements Accountable {
.append(numArcs())
.append(")")
.append("(")
.append(nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING ? "da" : "bs")
.append(
nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING
? "da"
: nodeFlags() == ARCS_FOR_CONTINUOUS ? "cs" : "bs")
.append(")");
}
return b.toString();
@ -285,8 +296,8 @@ public final class FST<T> implements Accountable {
/**
* Node header flags. Only meaningful to check if the value is either {@link
* #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} (other value when bytesPerArc
* == 0).
* #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} or {@link
* #ARCS_FOR_CONTINUOUS} (other value when bytesPerArc == 0).
*/
public byte nodeFlags() {
return nodeFlags;
@ -318,7 +329,7 @@ public final class FST<T> implements Accountable {
/**
* First label of a direct addressing node. Only valid if nodeFlags == {@link
* #ARCS_FOR_DIRECT_ADDRESSING}.
* #ARCS_FOR_DIRECT_ADDRESSING} or {@link #ARCS_FOR_CONTINUOUS}.
*/
int firstLabel() {
return firstLabel;
@ -653,7 +664,9 @@ public final class FST<T> implements Accountable {
} else {
in.setPosition(follow.target());
byte flags = arc.nodeFlags = in.readByte();
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) {
if (flags == ARCS_FOR_BINARY_SEARCH
|| flags == ARCS_FOR_DIRECT_ADDRESSING
|| flags == ARCS_FOR_CONTINUOUS) {
// Special arc which is actually a node header for fixed length arcs.
// Jump straight to end to find the last arc.
arc.numArcs = in.readVInt();
@ -664,10 +677,14 @@ public final class FST<T> implements Accountable {
arc.firstLabel = readLabel(in);
arc.posArcsStart = in.getPosition();
readLastArcByDirectAddressing(arc, in);
} else {
} else if (flags == ARCS_FOR_BINARY_SEARCH) {
arc.arcIdx = arc.numArcs() - 2;
arc.posArcsStart = in.getPosition();
readNextRealArc(arc, in);
} else {
arc.firstLabel = readLabel(in);
arc.posArcsStart = in.getPosition();
readLastArcByContinuous(arc, in);
}
} else {
arc.flags = flags;
@ -740,7 +757,9 @@ public final class FST<T> implements Accountable {
in.setPosition(nodeAddress);
byte flags = arc.nodeFlags = in.readByte();
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) {
if (flags == ARCS_FOR_BINARY_SEARCH
|| flags == ARCS_FOR_DIRECT_ADDRESSING
|| flags == ARCS_FOR_CONTINUOUS) {
// Special arc which is actually a node header for fixed length arcs.
arc.numArcs = in.readVInt();
arc.bytesPerArc = in.readVInt();
@ -749,6 +768,8 @@ public final class FST<T> implements Accountable {
readPresenceBytes(arc, in);
arc.firstLabel = readLabel(in);
arc.presenceIndex = -1;
} else if (flags == ARCS_FOR_CONTINUOUS) {
arc.firstLabel = readLabel(in);
}
arc.posArcsStart = in.getPosition();
} else {
@ -773,7 +794,9 @@ public final class FST<T> implements Accountable {
} else {
in.setPosition(follow.target());
byte flags = in.readByte();
return flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING;
return flags == ARCS_FOR_BINARY_SEARCH
|| flags == ARCS_FOR_DIRECT_ADDRESSING
|| flags == ARCS_FOR_CONTINUOUS;
}
}
@ -801,16 +824,18 @@ public final class FST<T> implements Accountable {
in.setPosition(arc.nextArc());
byte flags = in.readByte();
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) {
if (flags == ARCS_FOR_BINARY_SEARCH
|| flags == ARCS_FOR_DIRECT_ADDRESSING
|| flags == ARCS_FOR_CONTINUOUS) {
// System.out.println(" nextArc fixed length arc");
// Special arc which is actually a node header for fixed length arcs.
int numArcs = in.readVInt();
in.readVInt(); // Skip bytesPerArc.
if (flags == ARCS_FOR_BINARY_SEARCH) {
in.readByte(); // Skip arc flags.
} else {
} else if (flags == ARCS_FOR_DIRECT_ADDRESSING) {
in.skipBytes(getNumPresenceBytes(numArcs));
}
} // Nothing to do for ARCS_FOR_CONTINUOUS
}
} else {
switch (arc.nodeFlags()) {
@ -826,6 +851,8 @@ public final class FST<T> implements Accountable {
int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in);
assert nextIndex != -1;
return arc.firstLabel() + nextIndex;
case ARCS_FOR_CONTINUOUS:
return arc.firstLabel() + arc.arcIdx() + 1;
default:
// Variable length arcs - linear search.
assert arc.bytesPerArc() == 0;
@ -849,6 +876,20 @@ public final class FST<T> implements Accountable {
return readArc(arc, in);
}
/**
 * Reads one arc of a continuous node, addressed by its index in the label range.
 *
 * <p>The reader is moved to {@code posArcsStart - rangeIndex * bytesPerArc}, the index is recorded
 * on {@code arc}, the arc flags byte is read, and the remaining fields are filled in by {@code
 * readArc}.
 *
 * @param arc The arc to populate; its node header fields must already be set.
 * @param in The reader used to access the FST bytes.
 * @param rangeIndex The index of the arc in the label range. It must be within the label range.
 * @return The arc returned by {@code readArc}.
 */
public Arc<T> readArcByContinuous(Arc<T> arc, final BytesReader in, int rangeIndex)
    throws IOException {
  assert rangeIndex >= 0 && rangeIndex < arc.numArcs();
  final long arcsStart = arc.posArcsStart();
  // Widen to long before multiplying so the offset cannot overflow an int.
  final long arcOffset = rangeIndex * (long) arc.bytesPerArc();
  in.setPosition(arcsStart - arcOffset);
  arc.arcIdx = rangeIndex;
  arc.flags = in.readByte();
  return readArc(arc, in);
}
/**
* Reads a present direct addressing node arc, with the provided index in the label range.
*
@ -888,6 +929,11 @@ public final class FST<T> implements Accountable {
return readArcByDirectAddressing(arc, in, arc.numArcs() - 1, presenceIndex);
}
/**
 * Reads the last arc of a continuous node, i.e. the arc at index {@code numArcs - 1}.
 *
 * @param arc The arc to populate; its node header fields must already be set.
 * @param in The reader used to access the FST bytes.
 * @return The arc returned by {@link #readArcByContinuous}.
 */
public Arc<T> readLastArcByContinuous(Arc<T> arc, final BytesReader in) throws IOException {
  final int lastIndex = arc.numArcs() - 1;
  return readArcByContinuous(arc, in, lastIndex);
}
/** Never returns null, but you should never call this if arc.isLast() is true. */
public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException {
@ -896,6 +942,7 @@ public final class FST<T> implements Accountable {
switch (arc.nodeFlags()) {
case ARCS_FOR_BINARY_SEARCH:
case ARCS_FOR_CONTINUOUS:
assert arc.bytesPerArc() > 0;
arc.arcIdx++;
assert arc.arcIdx() >= 0 && arc.arcIdx() < arc.numArcs();
@ -924,7 +971,7 @@ public final class FST<T> implements Accountable {
* positioned just after the arc flags byte.
*/
private Arc<T> readArc(Arc<T> arc, BytesReader in) throws IOException {
if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING) {
if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING || arc.nodeFlags() == ARCS_FOR_CONTINUOUS) {
arc.label = arc.firstLabel() + arc.arcIdx();
} else {
arc.label = readLabel(in);
@ -1067,6 +1114,17 @@ public final class FST<T> implements Accountable {
}
}
return null;
} else if (flags == ARCS_FOR_CONTINUOUS) {
arc.numArcs = in.readVInt();
arc.bytesPerArc = in.readVInt();
arc.firstLabel = readLabel(in);
arc.posArcsStart = in.getPosition();
int arcIndex = labelToMatch - arc.firstLabel();
if (arcIndex < 0 || arcIndex >= arc.numArcs()) {
return null; // Before or after label range.
}
arc.arcIdx = arcIndex - 1;
return readNextRealArc(arc, in);
}
// Linear scan

View File

@ -17,6 +17,7 @@
package org.apache.lucene.util.fst;
import static org.apache.lucene.util.fst.FST.ARCS_FOR_BINARY_SEARCH;
import static org.apache.lucene.util.fst.FST.ARCS_FOR_CONTINUOUS;
import static org.apache.lucene.util.fst.FST.ARCS_FOR_DIRECT_ADDRESSING;
import static org.apache.lucene.util.fst.FST.BIT_ARC_HAS_FINAL_OUTPUT;
import static org.apache.lucene.util.fst.FST.BIT_ARC_HAS_OUTPUT;
@ -113,6 +114,7 @@ public class FSTCompiler<T> {
long nodeCount;
long binarySearchNodeCount;
long directAddressingNodeCount;
long continuousNodeCount;
final boolean allowFixedLengthArcs;
final float directAddressingMaxOversizingFactor;
@ -445,9 +447,15 @@ public class FSTCompiler<T> {
int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
assert labelRange > 0;
if (shouldExpandNodeWithDirectAddressing(
boolean continuousLabel = labelRange == nodeIn.numArcs;
if (continuousLabel) {
writeNodeForDirectAddressingOrContinuous(
nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange, true);
continuousNodeCount++;
} else if (shouldExpandNodeWithDirectAddressing(
nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) {
writeNodeForDirectAddressing(nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange);
writeNodeForDirectAddressingOrContinuous(
nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange, false);
directAddressingNodeCount++;
} else {
writeNodeForBinarySearch(nodeIn, startAddress, maxBytesPerArc);
@ -578,18 +586,19 @@ public class FSTCompiler<T> {
bytes.writeBytes(startAddress, fixedLengthArcsBuffer.getBytes(), 0, headerLen);
}
private void writeNodeForDirectAddressing(
private void writeNodeForDirectAddressingOrContinuous(
FSTCompiler.UnCompiledNode<T> nodeIn,
long startAddress,
int maxBytesPerArcWithoutLabel,
int labelRange) {
int labelRange,
boolean continuous) {
// Expand the arcs backwards in a buffer because we remove the labels.
// So the obtained arcs might occupy less space. This is the reason why this
// whole method is more complex.
// Drop the label bytes since we can infer the label based on the arc index,
// the presence bits, and the first label. Keep the first label.
int headerMaxLen = 11;
int numPresenceBytes = getNumPresenceBytes(labelRange);
int numPresenceBytes = continuous ? 0 : getNumPresenceBytes(labelRange);
long srcPos = bytes.getPosition();
int totalArcBytes = numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel;
int bufferOffset = headerMaxLen + numPresenceBytes + totalArcBytes;
@ -620,7 +629,7 @@ public class FSTCompiler<T> {
// metadata.
fixedLengthArcsBuffer
.resetPosition()
.writeByte(ARCS_FOR_DIRECT_ADDRESSING)
.writeByte(continuous ? ARCS_FOR_CONTINUOUS : ARCS_FOR_DIRECT_ADDRESSING)
.writeVInt(labelRange) // labelRange instead of numArcs.
.writeVInt(
maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc.
@ -642,8 +651,10 @@ public class FSTCompiler<T> {
writeOffset += headerLen;
// Write the presence bits
writePresenceBits(nodeIn, writeOffset, numPresenceBytes);
writeOffset += numPresenceBytes;
if (continuous == false) {
writePresenceBits(nodeIn, writeOffset, numPresenceBytes);
writeOffset += numPresenceBytes;
}
// Write the first label and the arcs.
bytes.writeBytes(writeOffset, fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes);

View File

@ -149,9 +149,11 @@ abstract class FSTEnum<T> {
final FST.BytesReader in = fst.getBytesReader();
if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
arc = doSeekCeilArrayDirectAddressing(arc, targetLabel, in);
} else {
assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH;
} else if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
arc = doSeekCeilArrayPacked(arc, targetLabel, in);
} else {
assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS;
arc = doSeekCeilArrayContinuous(arc, targetLabel, in);
}
} else {
arc = doSeekCeilList(arc, targetLabel);
@ -159,6 +161,33 @@ abstract class FSTEnum<T> {
}
}
/**
 * Seek-ceil step for a node whose arcs are stored with continuous labels: the arc index is simply
 * {@code targetLabel - firstLabel}, so no search is needed.
 *
 * @return the first arc of the matched arc's target node when the label matches exactly and is not
 *     {@link FST#END_LABEL}; {@code null} in every other case.
 */
private FST.Arc<T> doSeekCeilArrayContinuous(
    final FST.Arc<T> arc, final int targetLabel, final FST.BytesReader in) throws IOException {
  final int offsetFromFirst = targetLabel - arc.firstLabel();

  if (offsetFromFirst >= arc.numArcs()) {
    // Target is after the last arc of this node.
    rollbackToLastForkThenPush();
    return null;
  }

  if (offsetFromFirst < 0) {
    // Target is before the first arc: the first arc is the ceiling.
    fst.readArcByContinuous(arc, in, 0);
    assert arc.label() > targetLabel;
    pushFirst();
    return null;
  }

  // Target falls inside the label range, so the arc at that index matches exactly.
  fst.readArcByContinuous(arc, in, offsetFromFirst);
  assert arc.label() == targetLabel;
  output[upto] = fst.outputs.add(output[upto - 1], arc.output());
  if (targetLabel == FST.END_LABEL) {
    return null;
  }
  setCurrentLabel(arc.label());
  incr();
  return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
}
private FST.Arc<T> doSeekCeilArrayDirectAddressing(
final FST.Arc<T> arc, final int targetLabel, final FST.BytesReader in) throws IOException {
// The array is addressed directly by label, with presence bits to compute the actual arc
@ -166,24 +195,8 @@ abstract class FSTEnum<T> {
int targetIndex = targetLabel - arc.firstLabel();
if (targetIndex >= arc.numArcs()) {
// Target is beyond the last arc, out of label range.
// Dead end (target is after the last arc);
// rollback to last fork then push
upto--;
while (true) {
if (upto == 0) {
return null;
}
final FST.Arc<T> prevArc = getArc(upto);
// System.out.println(" rollback upto=" + upto + " arc.label=" + prevArc.label + "
// isLast?=" + prevArc.isLast());
if (!prevArc.isLast()) {
fst.readNextArc(prevArc, fstReader);
pushFirst();
return null;
}
upto--;
}
rollbackToLastForkThenPush();
return null;
} else {
if (targetIndex < 0) {
targetIndex = -1;
@ -332,9 +345,11 @@ abstract class FSTEnum<T> {
final FST.BytesReader in = fst.getBytesReader();
if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
arc = doSeekFloorArrayDirectAddressing(arc, targetLabel, in);
} else {
assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH;
} else if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
arc = doSeekFloorArrayPacked(arc, targetLabel, in);
} else {
assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS;
arc = doSeekFloorContinuous(arc, targetLabel, in);
}
} else {
arc = doSeekFloorList(arc, targetLabel);
@ -342,6 +357,34 @@ abstract class FSTEnum<T> {
}
}
/**
 * Seek-floor step for a node whose arcs are stored with continuous labels: the arc index is
 * {@code targetLabel - firstLabel}.
 *
 * @return the result of {@code backtrackToFloorArc} when the target is before the first arc; the
 *     first arc of the matched arc's target node when the label matches exactly and is not {@link
 *     FST#END_LABEL}; {@code null} otherwise.
 */
private FST.Arc<T> doSeekFloorContinuous(FST.Arc<T> arc, int targetLabel, FST.BytesReader in)
    throws IOException {
  final int offsetFromFirst = targetLabel - arc.firstLabel();

  if (offsetFromFirst < 0) {
    // Target is before the first arc of this node.
    return backtrackToFloorArc(arc, targetLabel, in);
  }

  if (offsetFromFirst >= arc.numArcs()) {
    // Target is after the last arc: the last arc is the floor.
    fst.readLastArcByContinuous(arc, in);
    assert arc.label() < targetLabel;
    assert arc.isLast();
    pushLast();
    return null;
  }

  // Target falls inside the label range, so the arc at that index matches exactly.
  fst.readArcByContinuous(arc, in, offsetFromFirst);
  assert arc.label() == targetLabel;
  output[upto] = fst.outputs.add(output[upto - 1], arc.output());
  if (targetLabel == FST.END_LABEL) {
    return null;
  }
  setCurrentLabel(arc.label());
  incr();
  return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
}
private FST.Arc<T> doSeekFloorArrayDirectAddressing(
FST.Arc<T> arc, int targetLabel, FST.BytesReader in) throws IOException {
// The array is addressed directly by label, with presence bits to compute the actual arc
@ -383,6 +426,28 @@ abstract class FSTEnum<T> {
}
}
/**
 * Handles a dead end where the target is after the last arc of the current node. Steps {@code
 * upto} back one level at a time until it reaches an arc that is not the last arc of its node,
 * advances that arc to its next sibling, and pushes first from there. Returns without pushing
 * anything once {@code upto} reaches 0.
 */
private void rollbackToLastForkThenPush() throws IOException {
  for (upto--; upto != 0; upto--) {
    final FST.Arc<T> forkCandidate = getArc(upto);
    // System.out.println(" rollback upto=" + upto + " arc.label=" + prevArc.label + "
    // isLast?=" + prevArc.isLast());
    if (forkCandidate.isLast() == false) {
      // Found a fork: move on to the next sibling arc and descend from it.
      fst.readNextArc(forkCandidate, fstReader);
      pushFirst();
      return;
    }
  }
}
/**
* Backtracks until it finds a node whose first arc is before our target label. Then on the node,
* finds the arc just before the targetLabel.
@ -400,9 +465,11 @@ abstract class FSTEnum<T> {
if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) {
if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
findNextFloorArcBinarySearch(arc, targetLabel, in);
} else {
assert arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING;
} else if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
findNextFloorArcDirectAddressing(arc, targetLabel, in);
} else {
assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS;
findNextFloorArcContinuous(arc, targetLabel, in);
}
} else {
while (!arc.isLast() && fst.readNextArcLabel(arc, in) < targetLabel) {
@ -452,6 +519,24 @@ abstract class FSTEnum<T> {
}
}
/**
 * Counterpart of {@link #findNextFloorArcDirectAddressing} for a continuous node.
 *
 * <p>Expects {@code arc} to be the first arc of the node. When the node has more than one arc,
 * repositions {@code arc} on the last arc if the target is beyond the label range, otherwise on
 * the arc just before the target's index. A single-arc node is left untouched.
 */
private void findNextFloorArcContinuous(FST.Arc<T> arc, int targetLabel, final FST.BytesReader in)
    throws IOException {
  assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS;
  assert arc.label() != FST.END_LABEL;
  assert arc.label() == arc.firstLabel();
  if (arc.numArcs() <= 1) {
    // Only one arc: it is already the floor arc.
    return;
  }
  final int offsetFromFirst = targetLabel - arc.firstLabel();
  assert offsetFromFirst >= 0;
  if (offsetFromFirst < arc.numArcs()) {
    // Within the label range: take the arc immediately before the target.
    fst.readArcByContinuous(arc, in, offsetFromFirst - 1);
  } else {
    // Beyond last arc. Take last arc.
    fst.readLastArcByContinuous(arc, in);
  }
}
/** Same as {@link #findNextFloorArcDirectAddressing} for binary search node. */
private void findNextFloorArcBinarySearch(FST.Arc<T> arc, int targetLabel, FST.BytesReader in)
throws IOException {

View File

@ -408,6 +408,12 @@ final class NodeHash<T> {
return -1;
}
break;
case FST.ARCS_FOR_CONTINUOUS:
if ((node.arcs[node.numArcs - 1].label - node.arcs[0].label + 1)
!= scratchArc.numArcs()) {
return -1;
}
break;
default:
throw new AssertionError("unhandled scratchArc.nodeFlag() " + scratchArc.nodeFlags());
}

View File

@ -854,6 +854,17 @@ public final class Util {
}
return arc;
}
} else if (arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS) {
int targetIndex = label - arc.label();
if (targetIndex >= arc.numArcs()) {
return null;
} else if (targetIndex < 0) {
return arc;
} else {
fst.readArcByContinuous(arc, in, targetIndex);
assert arc.label() == label;
return arc;
}
}
// Fixed length arcs in a binary search node.
int idx = binarySearch(fst, arc, label);

View File

@ -137,7 +137,9 @@ public class TestFSTDirectAddressing extends LuceneTestCase {
directAddressingMemoryIncreasePercent));
System.out.println("num nodes = " + fstCompiler.nodeCount);
long fixedLengthArcNodeCount =
fstCompiler.directAddressingNodeCount + fstCompiler.binarySearchNodeCount;
fstCompiler.directAddressingNodeCount
+ fstCompiler.binarySearchNodeCount
+ fstCompiler.continuousNodeCount;
System.out.println(
"num fixed-length-arc nodes = "
+ fixedLengthArcNodeCount
@ -161,6 +163,13 @@ public class TestFSTDirectAddressing extends LuceneTestCase {
((double) (fstCompiler.directAddressingNodeCount)
/ fixedLengthArcNodeCount
* 100)));
System.out.println(
"num continuous-arcs nodes = "
+ (fstCompiler.continuousNodeCount)
+ String.format(
Locale.ENGLISH,
" (%.2f %% of fixed-length-arc nodes)",
((double) (fstCompiler.continuousNodeCount) / fixedLengthArcNodeCount * 100)));
}
private static FSTCompiler<Object> createFSTCompiler(float directAddressingMaxOversizingFactor) {
@ -211,18 +220,25 @@ public class TestFSTDirectAddressing extends LuceneTestCase {
DataInput in = new ByteArrayDataInput(buf);
FST<BytesRef> fst = new FST<>(in, in, ByteSequenceOutputs.getSingleton());
BytesRefFSTEnum<BytesRef> fstEnum = new BytesRefFSTEnum<>(fst);
int binarySearchArcCount = 0, directAddressingArcCount = 0, listArcCount = 0;
int binarySearchArcCount = 0,
directAddressingArcCount = 0,
listArcCount = 0,
continuousArcCount = 0;
while (fstEnum.next() != null) {
if (fstEnum.arcs[fstEnum.upto].bytesPerArc() == 0) {
listArcCount++;
} else if (fstEnum.arcs[fstEnum.upto].nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
directAddressingArcCount++;
} else if (fstEnum.arcs[fstEnum.upto].nodeFlags() == FST.ARCS_FOR_CONTINUOUS) {
continuousArcCount++;
} else {
binarySearchArcCount++;
}
}
System.out.println(
"direct addressing arcs = "
"continuous arcs = "
+ continuousArcCount
+ ", direct addressing arcs = "
+ directAddressingArcCount
+ ", binary search arcs = "
+ binarySearchArcCount

View File

@ -43,6 +43,26 @@ public class TestUtil extends LuceneTestCase {
assertEquals(-7, Util.binarySearch(fst, arc, 'P'));
}
/**
 * Checks {@code Util.readCeilArc} on an FST built from eight consecutive single-letter terms
 * ("A" to "H"): every letter is found exactly, a mid-range lookup works, and a lookup that
 * follows the last arc read returns null.
 */
public void testContinuous() throws Exception {
  final List<String> terms = Arrays.asList("A", "B", "C", "D", "E", "F", "G", "H");
  final FST<Object> fst = buildFST(terms, true, false);
  final FST.Arc<Object> root = fst.getFirstArc(new FST.Arc<>());
  final FST.BytesReader reader = fst.getBytesReader();
  FST.Arc<Object> scratch = new FST.Arc<>();

  // Every label in the range must be found exactly.
  for (String term : terms) {
    final char label = term.charAt(0);
    scratch = Util.readCeilArc(label, fst, root, scratch, reader);
    assertNotNull(scratch);
    assertEquals(label, scratch.label());
  }

  // in the middle
  assertEquals('F', Util.readCeilArc('F', fst, root, scratch, reader).label());
  // no following arcs
  assertNull(Util.readCeilArc('A', fst, scratch, scratch, reader));
}
public void testReadCeilArcPackedArray() throws Exception {
List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T", "z");
verifyReadCeilArc(letters, true, false);