mirror of https://github.com/apache/lucene.git
Specialize arc store for continuous label in FST (#12748)
* init * review fix and reuse duplicate code * rebase * tidy * CHANGES.txt * bump version * rebase * CHANGES.txt
This commit is contained in:
parent
a71d64a598
commit
570832eb74
|
@ -264,6 +264,8 @@ Optimizations
|
||||||
|
|
||||||
* GITHUB#12381: Skip docs with DocValues in NumericLeafComparator. (Lu Xugang, Adrien Grand)
|
* GITHUB#12381: Skip docs with DocValues in NumericLeafComparator. (Lu Xugang, Adrien Grand)
|
||||||
|
|
||||||
|
* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Chao Zhang)
|
||||||
|
|
||||||
Changes in runtime behavior
|
Changes in runtime behavior
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
|
|
@ -86,8 +86,11 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
|
||||||
*/
|
*/
|
||||||
public static final int VERSION_MSB_VLONG_OUTPUT = 1;
|
public static final int VERSION_MSB_VLONG_OUTPUT = 1;
|
||||||
|
|
||||||
|
/** The version that specializes arc storage for continuous labels in the FST. */
|
||||||
|
public static final int VERSION_FST_CONTINUOUS_ARCS = 2;
|
||||||
|
|
||||||
/** Current terms format. */
|
/** Current terms format. */
|
||||||
public static final int VERSION_CURRENT = VERSION_MSB_VLONG_OUTPUT;
|
public static final int VERSION_CURRENT = VERSION_FST_CONTINUOUS_ARCS;
|
||||||
|
|
||||||
/** Extension of terms index file */
|
/** Extension of terms index file */
|
||||||
static final String TERMS_INDEX_EXTENSION = "tip";
|
static final String TERMS_INDEX_EXTENSION = "tip";
|
||||||
|
|
|
@ -98,11 +98,19 @@ public final class FST<T> implements Accountable {
|
||||||
*/
|
*/
|
||||||
static final byte ARCS_FOR_DIRECT_ADDRESSING = 1 << 6;
|
static final byte ARCS_FOR_DIRECT_ADDRESSING = 1 << 6;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Value of the arc flags to declare a node with continuous arcs, designed to locate an arc
|
||||||
|
* directly at index (label - firstLabel). Like {@link #ARCS_FOR_BINARY_SEARCH}, we use flag
|
||||||
|
* combinations that will not occur at the same time.
|
||||||
|
*/
|
||||||
|
static final byte ARCS_FOR_CONTINUOUS = ARCS_FOR_DIRECT_ADDRESSING + ARCS_FOR_BINARY_SEARCH;
|
||||||
|
|
||||||
// Increment version to change it
|
// Increment version to change it
|
||||||
private static final String FILE_FORMAT_NAME = "FST";
|
private static final String FILE_FORMAT_NAME = "FST";
|
||||||
private static final int VERSION_START = 6;
|
private static final int VERSION_START = 6;
|
||||||
private static final int VERSION_LITTLE_ENDIAN = 8;
|
private static final int VERSION_LITTLE_ENDIAN = 8;
|
||||||
static final int VERSION_CURRENT = VERSION_LITTLE_ENDIAN;
|
private static final int VERSION_CONTINUOUS_ARCS = 9;
|
||||||
|
static final int VERSION_CURRENT = VERSION_CONTINUOUS_ARCS;
|
||||||
|
|
||||||
// Never serialized; just used to represent the virtual
|
// Never serialized; just used to represent the virtual
|
||||||
// final node w/ no arcs:
|
// final node w/ no arcs:
|
||||||
|
@ -243,7 +251,10 @@ public final class FST<T> implements Accountable {
|
||||||
.append(numArcs())
|
.append(numArcs())
|
||||||
.append(")")
|
.append(")")
|
||||||
.append("(")
|
.append("(")
|
||||||
.append(nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING ? "da" : "bs")
|
.append(
|
||||||
|
nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING
|
||||||
|
? "da"
|
||||||
|
: nodeFlags() == ARCS_FOR_CONTINUOUS ? "cs" : "bs")
|
||||||
.append(")");
|
.append(")");
|
||||||
}
|
}
|
||||||
return b.toString();
|
return b.toString();
|
||||||
|
@ -285,8 +296,8 @@ public final class FST<T> implements Accountable {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Node header flags. Only meaningful to check if the value is either {@link
|
* Node header flags. Only meaningful to check if the value is either {@link
|
||||||
* #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} (other value when bytesPerArc
|
* #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} or {@link
|
||||||
* == 0).
|
* #ARCS_FOR_CONTINUOUS} (other value when bytesPerArc == 0).
|
||||||
*/
|
*/
|
||||||
public byte nodeFlags() {
|
public byte nodeFlags() {
|
||||||
return nodeFlags;
|
return nodeFlags;
|
||||||
|
@ -318,7 +329,7 @@ public final class FST<T> implements Accountable {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* First label of a direct addressing node. Only valid if nodeFlags == {@link
|
* First label of a direct addressing node. Only valid if nodeFlags == {@link
|
||||||
* #ARCS_FOR_DIRECT_ADDRESSING}.
|
* #ARCS_FOR_DIRECT_ADDRESSING} or {@link #ARCS_FOR_CONTINUOUS}.
|
||||||
*/
|
*/
|
||||||
int firstLabel() {
|
int firstLabel() {
|
||||||
return firstLabel;
|
return firstLabel;
|
||||||
|
@ -653,7 +664,9 @@ public final class FST<T> implements Accountable {
|
||||||
} else {
|
} else {
|
||||||
in.setPosition(follow.target());
|
in.setPosition(follow.target());
|
||||||
byte flags = arc.nodeFlags = in.readByte();
|
byte flags = arc.nodeFlags = in.readByte();
|
||||||
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) {
|
if (flags == ARCS_FOR_BINARY_SEARCH
|
||||||
|
|| flags == ARCS_FOR_DIRECT_ADDRESSING
|
||||||
|
|| flags == ARCS_FOR_CONTINUOUS) {
|
||||||
// Special arc which is actually a node header for fixed length arcs.
|
// Special arc which is actually a node header for fixed length arcs.
|
||||||
// Jump straight to end to find the last arc.
|
// Jump straight to end to find the last arc.
|
||||||
arc.numArcs = in.readVInt();
|
arc.numArcs = in.readVInt();
|
||||||
|
@ -664,10 +677,14 @@ public final class FST<T> implements Accountable {
|
||||||
arc.firstLabel = readLabel(in);
|
arc.firstLabel = readLabel(in);
|
||||||
arc.posArcsStart = in.getPosition();
|
arc.posArcsStart = in.getPosition();
|
||||||
readLastArcByDirectAddressing(arc, in);
|
readLastArcByDirectAddressing(arc, in);
|
||||||
} else {
|
} else if (flags == ARCS_FOR_BINARY_SEARCH) {
|
||||||
arc.arcIdx = arc.numArcs() - 2;
|
arc.arcIdx = arc.numArcs() - 2;
|
||||||
arc.posArcsStart = in.getPosition();
|
arc.posArcsStart = in.getPosition();
|
||||||
readNextRealArc(arc, in);
|
readNextRealArc(arc, in);
|
||||||
|
} else {
|
||||||
|
arc.firstLabel = readLabel(in);
|
||||||
|
arc.posArcsStart = in.getPosition();
|
||||||
|
readLastArcByContinuous(arc, in);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
arc.flags = flags;
|
arc.flags = flags;
|
||||||
|
@ -740,7 +757,9 @@ public final class FST<T> implements Accountable {
|
||||||
in.setPosition(nodeAddress);
|
in.setPosition(nodeAddress);
|
||||||
|
|
||||||
byte flags = arc.nodeFlags = in.readByte();
|
byte flags = arc.nodeFlags = in.readByte();
|
||||||
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) {
|
if (flags == ARCS_FOR_BINARY_SEARCH
|
||||||
|
|| flags == ARCS_FOR_DIRECT_ADDRESSING
|
||||||
|
|| flags == ARCS_FOR_CONTINUOUS) {
|
||||||
// Special arc which is actually a node header for fixed length arcs.
|
// Special arc which is actually a node header for fixed length arcs.
|
||||||
arc.numArcs = in.readVInt();
|
arc.numArcs = in.readVInt();
|
||||||
arc.bytesPerArc = in.readVInt();
|
arc.bytesPerArc = in.readVInt();
|
||||||
|
@ -749,6 +768,8 @@ public final class FST<T> implements Accountable {
|
||||||
readPresenceBytes(arc, in);
|
readPresenceBytes(arc, in);
|
||||||
arc.firstLabel = readLabel(in);
|
arc.firstLabel = readLabel(in);
|
||||||
arc.presenceIndex = -1;
|
arc.presenceIndex = -1;
|
||||||
|
} else if (flags == ARCS_FOR_CONTINUOUS) {
|
||||||
|
arc.firstLabel = readLabel(in);
|
||||||
}
|
}
|
||||||
arc.posArcsStart = in.getPosition();
|
arc.posArcsStart = in.getPosition();
|
||||||
} else {
|
} else {
|
||||||
|
@ -773,7 +794,9 @@ public final class FST<T> implements Accountable {
|
||||||
} else {
|
} else {
|
||||||
in.setPosition(follow.target());
|
in.setPosition(follow.target());
|
||||||
byte flags = in.readByte();
|
byte flags = in.readByte();
|
||||||
return flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING;
|
return flags == ARCS_FOR_BINARY_SEARCH
|
||||||
|
|| flags == ARCS_FOR_DIRECT_ADDRESSING
|
||||||
|
|| flags == ARCS_FOR_CONTINUOUS;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -801,16 +824,18 @@ public final class FST<T> implements Accountable {
|
||||||
|
|
||||||
in.setPosition(arc.nextArc());
|
in.setPosition(arc.nextArc());
|
||||||
byte flags = in.readByte();
|
byte flags = in.readByte();
|
||||||
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) {
|
if (flags == ARCS_FOR_BINARY_SEARCH
|
||||||
|
|| flags == ARCS_FOR_DIRECT_ADDRESSING
|
||||||
|
|| flags == ARCS_FOR_CONTINUOUS) {
|
||||||
// System.out.println(" nextArc fixed length arc");
|
// System.out.println(" nextArc fixed length arc");
|
||||||
// Special arc which is actually a node header for fixed length arcs.
|
// Special arc which is actually a node header for fixed length arcs.
|
||||||
int numArcs = in.readVInt();
|
int numArcs = in.readVInt();
|
||||||
in.readVInt(); // Skip bytesPerArc.
|
in.readVInt(); // Skip bytesPerArc.
|
||||||
if (flags == ARCS_FOR_BINARY_SEARCH) {
|
if (flags == ARCS_FOR_BINARY_SEARCH) {
|
||||||
in.readByte(); // Skip arc flags.
|
in.readByte(); // Skip arc flags.
|
||||||
} else {
|
} else if (flags == ARCS_FOR_DIRECT_ADDRESSING) {
|
||||||
in.skipBytes(getNumPresenceBytes(numArcs));
|
in.skipBytes(getNumPresenceBytes(numArcs));
|
||||||
}
|
} // Nothing to do for ARCS_FOR_CONTINUOUS
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
switch (arc.nodeFlags()) {
|
switch (arc.nodeFlags()) {
|
||||||
|
@ -826,6 +851,8 @@ public final class FST<T> implements Accountable {
|
||||||
int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in);
|
int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in);
|
||||||
assert nextIndex != -1;
|
assert nextIndex != -1;
|
||||||
return arc.firstLabel() + nextIndex;
|
return arc.firstLabel() + nextIndex;
|
||||||
|
case ARCS_FOR_CONTINUOUS:
|
||||||
|
return arc.firstLabel() + arc.arcIdx() + 1;
|
||||||
default:
|
default:
|
||||||
// Variable length arcs - linear search.
|
// Variable length arcs - linear search.
|
||||||
assert arc.bytesPerArc() == 0;
|
assert arc.bytesPerArc() == 0;
|
||||||
|
@ -849,6 +876,20 @@ public final class FST<T> implements Accountable {
|
||||||
return readArc(arc, in);
|
return readArc(arc, in);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads a Continuous node arc, with the provided index in the label range.
|
||||||
|
*
|
||||||
|
* @param rangeIndex The index of the arc in the label range. It must be within the label range.
|
||||||
|
*/
|
||||||
|
public Arc<T> readArcByContinuous(Arc<T> arc, final BytesReader in, int rangeIndex)
|
||||||
|
throws IOException {
|
||||||
|
assert rangeIndex >= 0 && rangeIndex < arc.numArcs();
|
||||||
|
in.setPosition(arc.posArcsStart() - rangeIndex * (long) arc.bytesPerArc());
|
||||||
|
arc.arcIdx = rangeIndex;
|
||||||
|
arc.flags = in.readByte();
|
||||||
|
return readArc(arc, in);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reads a present direct addressing node arc, with the provided index in the label range.
|
* Reads a present direct addressing node arc, with the provided index in the label range.
|
||||||
*
|
*
|
||||||
|
@ -888,6 +929,11 @@ public final class FST<T> implements Accountable {
|
||||||
return readArcByDirectAddressing(arc, in, arc.numArcs() - 1, presenceIndex);
|
return readArcByDirectAddressing(arc, in, arc.numArcs() - 1, presenceIndex);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Reads the last arc of a continuous node. */
|
||||||
|
public Arc<T> readLastArcByContinuous(Arc<T> arc, final BytesReader in) throws IOException {
|
||||||
|
return readArcByContinuous(arc, in, arc.numArcs() - 1);
|
||||||
|
}
|
||||||
|
|
||||||
/** Never returns null, but you should never call this if arc.isLast() is true. */
|
/** Never returns null, but you should never call this if arc.isLast() is true. */
|
||||||
public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException {
|
public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException {
|
||||||
|
|
||||||
|
@ -896,6 +942,7 @@ public final class FST<T> implements Accountable {
|
||||||
|
|
||||||
switch (arc.nodeFlags()) {
|
switch (arc.nodeFlags()) {
|
||||||
case ARCS_FOR_BINARY_SEARCH:
|
case ARCS_FOR_BINARY_SEARCH:
|
||||||
|
case ARCS_FOR_CONTINUOUS:
|
||||||
assert arc.bytesPerArc() > 0;
|
assert arc.bytesPerArc() > 0;
|
||||||
arc.arcIdx++;
|
arc.arcIdx++;
|
||||||
assert arc.arcIdx() >= 0 && arc.arcIdx() < arc.numArcs();
|
assert arc.arcIdx() >= 0 && arc.arcIdx() < arc.numArcs();
|
||||||
|
@ -924,7 +971,7 @@ public final class FST<T> implements Accountable {
|
||||||
* positioned just after the arc flags byte.
|
* positioned just after the arc flags byte.
|
||||||
*/
|
*/
|
||||||
private Arc<T> readArc(Arc<T> arc, BytesReader in) throws IOException {
|
private Arc<T> readArc(Arc<T> arc, BytesReader in) throws IOException {
|
||||||
if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING) {
|
if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING || arc.nodeFlags() == ARCS_FOR_CONTINUOUS) {
|
||||||
arc.label = arc.firstLabel() + arc.arcIdx();
|
arc.label = arc.firstLabel() + arc.arcIdx();
|
||||||
} else {
|
} else {
|
||||||
arc.label = readLabel(in);
|
arc.label = readLabel(in);
|
||||||
|
@ -1067,6 +1114,17 @@ public final class FST<T> implements Accountable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
|
} else if (flags == ARCS_FOR_CONTINUOUS) {
|
||||||
|
arc.numArcs = in.readVInt();
|
||||||
|
arc.bytesPerArc = in.readVInt();
|
||||||
|
arc.firstLabel = readLabel(in);
|
||||||
|
arc.posArcsStart = in.getPosition();
|
||||||
|
int arcIndex = labelToMatch - arc.firstLabel();
|
||||||
|
if (arcIndex < 0 || arcIndex >= arc.numArcs()) {
|
||||||
|
return null; // Before or after label range.
|
||||||
|
}
|
||||||
|
arc.arcIdx = arcIndex - 1;
|
||||||
|
return readNextRealArc(arc, in);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Linear scan
|
// Linear scan
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
package org.apache.lucene.util.fst;
|
package org.apache.lucene.util.fst;
|
||||||
|
|
||||||
import static org.apache.lucene.util.fst.FST.ARCS_FOR_BINARY_SEARCH;
|
import static org.apache.lucene.util.fst.FST.ARCS_FOR_BINARY_SEARCH;
|
||||||
|
import static org.apache.lucene.util.fst.FST.ARCS_FOR_CONTINUOUS;
|
||||||
import static org.apache.lucene.util.fst.FST.ARCS_FOR_DIRECT_ADDRESSING;
|
import static org.apache.lucene.util.fst.FST.ARCS_FOR_DIRECT_ADDRESSING;
|
||||||
import static org.apache.lucene.util.fst.FST.BIT_ARC_HAS_FINAL_OUTPUT;
|
import static org.apache.lucene.util.fst.FST.BIT_ARC_HAS_FINAL_OUTPUT;
|
||||||
import static org.apache.lucene.util.fst.FST.BIT_ARC_HAS_OUTPUT;
|
import static org.apache.lucene.util.fst.FST.BIT_ARC_HAS_OUTPUT;
|
||||||
|
@ -113,6 +114,7 @@ public class FSTCompiler<T> {
|
||||||
long nodeCount;
|
long nodeCount;
|
||||||
long binarySearchNodeCount;
|
long binarySearchNodeCount;
|
||||||
long directAddressingNodeCount;
|
long directAddressingNodeCount;
|
||||||
|
long continuousNodeCount;
|
||||||
|
|
||||||
final boolean allowFixedLengthArcs;
|
final boolean allowFixedLengthArcs;
|
||||||
final float directAddressingMaxOversizingFactor;
|
final float directAddressingMaxOversizingFactor;
|
||||||
|
@ -445,9 +447,15 @@ public class FSTCompiler<T> {
|
||||||
|
|
||||||
int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
|
int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
|
||||||
assert labelRange > 0;
|
assert labelRange > 0;
|
||||||
if (shouldExpandNodeWithDirectAddressing(
|
boolean continuousLabel = labelRange == nodeIn.numArcs;
|
||||||
|
if (continuousLabel) {
|
||||||
|
writeNodeForDirectAddressingOrContinuous(
|
||||||
|
nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange, true);
|
||||||
|
continuousNodeCount++;
|
||||||
|
} else if (shouldExpandNodeWithDirectAddressing(
|
||||||
nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) {
|
nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) {
|
||||||
writeNodeForDirectAddressing(nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange);
|
writeNodeForDirectAddressingOrContinuous(
|
||||||
|
nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange, false);
|
||||||
directAddressingNodeCount++;
|
directAddressingNodeCount++;
|
||||||
} else {
|
} else {
|
||||||
writeNodeForBinarySearch(nodeIn, startAddress, maxBytesPerArc);
|
writeNodeForBinarySearch(nodeIn, startAddress, maxBytesPerArc);
|
||||||
|
@ -578,18 +586,19 @@ public class FSTCompiler<T> {
|
||||||
bytes.writeBytes(startAddress, fixedLengthArcsBuffer.getBytes(), 0, headerLen);
|
bytes.writeBytes(startAddress, fixedLengthArcsBuffer.getBytes(), 0, headerLen);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void writeNodeForDirectAddressing(
|
private void writeNodeForDirectAddressingOrContinuous(
|
||||||
FSTCompiler.UnCompiledNode<T> nodeIn,
|
FSTCompiler.UnCompiledNode<T> nodeIn,
|
||||||
long startAddress,
|
long startAddress,
|
||||||
int maxBytesPerArcWithoutLabel,
|
int maxBytesPerArcWithoutLabel,
|
||||||
int labelRange) {
|
int labelRange,
|
||||||
|
boolean continuous) {
|
||||||
// Expand the arcs backwards in a buffer because we remove the labels.
|
// Expand the arcs backwards in a buffer because we remove the labels.
|
||||||
// So the obtained arcs might occupy less space. This is the reason why this
|
// So the obtained arcs might occupy less space. This is the reason why this
|
||||||
// whole method is more complex.
|
// whole method is more complex.
|
||||||
// Drop the label bytes since we can infer the label based on the arc index,
|
// Drop the label bytes since we can infer the label based on the arc index,
|
||||||
// the presence bits, and the first label. Keep the first label.
|
// the presence bits, and the first label. Keep the first label.
|
||||||
int headerMaxLen = 11;
|
int headerMaxLen = 11;
|
||||||
int numPresenceBytes = getNumPresenceBytes(labelRange);
|
int numPresenceBytes = continuous ? 0 : getNumPresenceBytes(labelRange);
|
||||||
long srcPos = bytes.getPosition();
|
long srcPos = bytes.getPosition();
|
||||||
int totalArcBytes = numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel;
|
int totalArcBytes = numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel;
|
||||||
int bufferOffset = headerMaxLen + numPresenceBytes + totalArcBytes;
|
int bufferOffset = headerMaxLen + numPresenceBytes + totalArcBytes;
|
||||||
|
@ -620,7 +629,7 @@ public class FSTCompiler<T> {
|
||||||
// metadata.
|
// metadata.
|
||||||
fixedLengthArcsBuffer
|
fixedLengthArcsBuffer
|
||||||
.resetPosition()
|
.resetPosition()
|
||||||
.writeByte(ARCS_FOR_DIRECT_ADDRESSING)
|
.writeByte(continuous ? ARCS_FOR_CONTINUOUS : ARCS_FOR_DIRECT_ADDRESSING)
|
||||||
.writeVInt(labelRange) // labelRange instead of numArcs.
|
.writeVInt(labelRange) // labelRange instead of numArcs.
|
||||||
.writeVInt(
|
.writeVInt(
|
||||||
maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc.
|
maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc.
|
||||||
|
@ -642,8 +651,10 @@ public class FSTCompiler<T> {
|
||||||
writeOffset += headerLen;
|
writeOffset += headerLen;
|
||||||
|
|
||||||
// Write the presence bits
|
// Write the presence bits
|
||||||
writePresenceBits(nodeIn, writeOffset, numPresenceBytes);
|
if (continuous == false) {
|
||||||
writeOffset += numPresenceBytes;
|
writePresenceBits(nodeIn, writeOffset, numPresenceBytes);
|
||||||
|
writeOffset += numPresenceBytes;
|
||||||
|
}
|
||||||
|
|
||||||
// Write the first label and the arcs.
|
// Write the first label and the arcs.
|
||||||
bytes.writeBytes(writeOffset, fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes);
|
bytes.writeBytes(writeOffset, fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes);
|
||||||
|
|
|
@ -149,9 +149,11 @@ abstract class FSTEnum<T> {
|
||||||
final FST.BytesReader in = fst.getBytesReader();
|
final FST.BytesReader in = fst.getBytesReader();
|
||||||
if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
|
if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
|
||||||
arc = doSeekCeilArrayDirectAddressing(arc, targetLabel, in);
|
arc = doSeekCeilArrayDirectAddressing(arc, targetLabel, in);
|
||||||
} else {
|
} else if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
|
||||||
assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH;
|
|
||||||
arc = doSeekCeilArrayPacked(arc, targetLabel, in);
|
arc = doSeekCeilArrayPacked(arc, targetLabel, in);
|
||||||
|
} else {
|
||||||
|
assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS;
|
||||||
|
arc = doSeekCeilArrayContinuous(arc, targetLabel, in);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
arc = doSeekCeilList(arc, targetLabel);
|
arc = doSeekCeilList(arc, targetLabel);
|
||||||
|
@ -159,6 +161,33 @@ abstract class FSTEnum<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private FST.Arc<T> doSeekCeilArrayContinuous(
|
||||||
|
final FST.Arc<T> arc, final int targetLabel, final FST.BytesReader in) throws IOException {
|
||||||
|
int targetIndex = targetLabel - arc.firstLabel();
|
||||||
|
if (targetIndex >= arc.numArcs()) {
|
||||||
|
rollbackToLastForkThenPush();
|
||||||
|
return null;
|
||||||
|
} else {
|
||||||
|
if (targetIndex < 0) {
|
||||||
|
fst.readArcByContinuous(arc, in, 0);
|
||||||
|
assert arc.label() > targetLabel;
|
||||||
|
pushFirst();
|
||||||
|
return null;
|
||||||
|
} else {
|
||||||
|
fst.readArcByContinuous(arc, in, targetIndex);
|
||||||
|
assert arc.label() == targetLabel;
|
||||||
|
// found -- copy pasta from below
|
||||||
|
output[upto] = fst.outputs.add(output[upto - 1], arc.output());
|
||||||
|
if (targetLabel == FST.END_LABEL) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
setCurrentLabel(arc.label());
|
||||||
|
incr();
|
||||||
|
return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private FST.Arc<T> doSeekCeilArrayDirectAddressing(
|
private FST.Arc<T> doSeekCeilArrayDirectAddressing(
|
||||||
final FST.Arc<T> arc, final int targetLabel, final FST.BytesReader in) throws IOException {
|
final FST.Arc<T> arc, final int targetLabel, final FST.BytesReader in) throws IOException {
|
||||||
// The array is addressed directly by label, with presence bits to compute the actual arc
|
// The array is addressed directly by label, with presence bits to compute the actual arc
|
||||||
|
@ -166,24 +195,8 @@ abstract class FSTEnum<T> {
|
||||||
|
|
||||||
int targetIndex = targetLabel - arc.firstLabel();
|
int targetIndex = targetLabel - arc.firstLabel();
|
||||||
if (targetIndex >= arc.numArcs()) {
|
if (targetIndex >= arc.numArcs()) {
|
||||||
// Target is beyond the last arc, out of label range.
|
rollbackToLastForkThenPush();
|
||||||
// Dead end (target is after the last arc);
|
return null;
|
||||||
// rollback to last fork then push
|
|
||||||
upto--;
|
|
||||||
while (true) {
|
|
||||||
if (upto == 0) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
final FST.Arc<T> prevArc = getArc(upto);
|
|
||||||
// System.out.println(" rollback upto=" + upto + " arc.label=" + prevArc.label + "
|
|
||||||
// isLast?=" + prevArc.isLast());
|
|
||||||
if (!prevArc.isLast()) {
|
|
||||||
fst.readNextArc(prevArc, fstReader);
|
|
||||||
pushFirst();
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
upto--;
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
if (targetIndex < 0) {
|
if (targetIndex < 0) {
|
||||||
targetIndex = -1;
|
targetIndex = -1;
|
||||||
|
@ -332,9 +345,11 @@ abstract class FSTEnum<T> {
|
||||||
final FST.BytesReader in = fst.getBytesReader();
|
final FST.BytesReader in = fst.getBytesReader();
|
||||||
if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
|
if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
|
||||||
arc = doSeekFloorArrayDirectAddressing(arc, targetLabel, in);
|
arc = doSeekFloorArrayDirectAddressing(arc, targetLabel, in);
|
||||||
} else {
|
} else if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
|
||||||
assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH;
|
|
||||||
arc = doSeekFloorArrayPacked(arc, targetLabel, in);
|
arc = doSeekFloorArrayPacked(arc, targetLabel, in);
|
||||||
|
} else {
|
||||||
|
assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS;
|
||||||
|
arc = doSeekFloorContinuous(arc, targetLabel, in);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
arc = doSeekFloorList(arc, targetLabel);
|
arc = doSeekFloorList(arc, targetLabel);
|
||||||
|
@ -342,6 +357,34 @@ abstract class FSTEnum<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private FST.Arc<T> doSeekFloorContinuous(FST.Arc<T> arc, int targetLabel, FST.BytesReader in)
|
||||||
|
throws IOException {
|
||||||
|
int targetIndex = targetLabel - arc.firstLabel();
|
||||||
|
if (targetIndex < 0) {
|
||||||
|
// Before first arc.
|
||||||
|
return backtrackToFloorArc(arc, targetLabel, in);
|
||||||
|
} else if (targetIndex >= arc.numArcs()) {
|
||||||
|
// After last arc.
|
||||||
|
fst.readLastArcByContinuous(arc, in);
|
||||||
|
assert arc.label() < targetLabel;
|
||||||
|
assert arc.isLast();
|
||||||
|
pushLast();
|
||||||
|
return null;
|
||||||
|
} else {
|
||||||
|
// Within label range.
|
||||||
|
fst.readArcByContinuous(arc, in, targetIndex);
|
||||||
|
assert arc.label() == targetLabel;
|
||||||
|
// found -- copy pasta from below
|
||||||
|
output[upto] = fst.outputs.add(output[upto - 1], arc.output());
|
||||||
|
if (targetLabel == FST.END_LABEL) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
setCurrentLabel(arc.label());
|
||||||
|
incr();
|
||||||
|
return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private FST.Arc<T> doSeekFloorArrayDirectAddressing(
|
private FST.Arc<T> doSeekFloorArrayDirectAddressing(
|
||||||
FST.Arc<T> arc, int targetLabel, FST.BytesReader in) throws IOException {
|
FST.Arc<T> arc, int targetLabel, FST.BytesReader in) throws IOException {
|
||||||
// The array is addressed directly by label, with presence bits to compute the actual arc
|
// The array is addressed directly by label, with presence bits to compute the actual arc
|
||||||
|
@ -383,6 +426,28 @@ abstract class FSTEnum<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Handles a dead end where the target is beyond the last arc (out of the label range): rolls
|
||||||
|
* back to the last fork (the nearest previous arc that is not the last of its node), reads its next arc, then calls pushFirst().
|
||||||
|
*/
|
||||||
|
private void rollbackToLastForkThenPush() throws IOException {
|
||||||
|
upto--;
|
||||||
|
while (true) {
|
||||||
|
if (upto == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
final FST.Arc<T> prevArc = getArc(upto);
|
||||||
|
// System.out.println(" rollback upto=" + upto + " arc.label=" + prevArc.label + "
|
||||||
|
// isLast?=" + prevArc.isLast());
|
||||||
|
if (!prevArc.isLast()) {
|
||||||
|
fst.readNextArc(prevArc, fstReader);
|
||||||
|
pushFirst();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
upto--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Backtracks until it finds a node whose first arc is before our target label. Then, on that node,
|
* Backtracks until it finds a node whose first arc is before our target label. Then, on that node,
|
||||||
* finds the arc just before the targetLabel.
|
* finds the arc just before the targetLabel.
|
||||||
|
@ -400,9 +465,11 @@ abstract class FSTEnum<T> {
|
||||||
if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) {
|
if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) {
|
||||||
if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
|
if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
|
||||||
findNextFloorArcBinarySearch(arc, targetLabel, in);
|
findNextFloorArcBinarySearch(arc, targetLabel, in);
|
||||||
} else {
|
} else if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
|
||||||
assert arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING;
|
|
||||||
findNextFloorArcDirectAddressing(arc, targetLabel, in);
|
findNextFloorArcDirectAddressing(arc, targetLabel, in);
|
||||||
|
} else {
|
||||||
|
assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS;
|
||||||
|
findNextFloorArcContinuous(arc, targetLabel, in);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
while (!arc.isLast() && fst.readNextArcLabel(arc, in) < targetLabel) {
|
while (!arc.isLast() && fst.readNextArcLabel(arc, in) < targetLabel) {
|
||||||
|
@ -452,6 +519,24 @@ abstract class FSTEnum<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Same as {@link #findNextFloorArcDirectAddressing} for a continuous node. */
|
||||||
|
private void findNextFloorArcContinuous(FST.Arc<T> arc, int targetLabel, final FST.BytesReader in)
|
||||||
|
throws IOException {
|
||||||
|
assert arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS;
|
||||||
|
assert arc.label() != FST.END_LABEL;
|
||||||
|
assert arc.label() == arc.firstLabel();
|
||||||
|
if (arc.numArcs() > 1) {
|
||||||
|
int targetIndex = targetLabel - arc.firstLabel();
|
||||||
|
assert targetIndex >= 0;
|
||||||
|
if (targetIndex >= arc.numArcs()) {
|
||||||
|
// Beyond last arc. Take last arc.
|
||||||
|
fst.readLastArcByContinuous(arc, in);
|
||||||
|
} else {
|
||||||
|
fst.readArcByContinuous(arc, in, targetIndex - 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/** Same as {@link #findNextFloorArcDirectAddressing} for binary search node. */
|
/** Same as {@link #findNextFloorArcDirectAddressing} for binary search node. */
|
||||||
private void findNextFloorArcBinarySearch(FST.Arc<T> arc, int targetLabel, FST.BytesReader in)
|
private void findNextFloorArcBinarySearch(FST.Arc<T> arc, int targetLabel, FST.BytesReader in)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
|
@ -408,6 +408,12 @@ final class NodeHash<T> {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case FST.ARCS_FOR_CONTINUOUS:
|
||||||
|
if ((node.arcs[node.numArcs - 1].label - node.arcs[0].label + 1)
|
||||||
|
!= scratchArc.numArcs()) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
throw new AssertionError("unhandled scratchArc.nodeFlag() " + scratchArc.nodeFlags());
|
throw new AssertionError("unhandled scratchArc.nodeFlag() " + scratchArc.nodeFlags());
|
||||||
}
|
}
|
||||||
|
|
|
@ -854,6 +854,17 @@ public final class Util {
|
||||||
}
|
}
|
||||||
return arc;
|
return arc;
|
||||||
}
|
}
|
||||||
|
} else if (arc.nodeFlags() == FST.ARCS_FOR_CONTINUOUS) {
|
||||||
|
int targetIndex = label - arc.label();
|
||||||
|
if (targetIndex >= arc.numArcs()) {
|
||||||
|
return null;
|
||||||
|
} else if (targetIndex < 0) {
|
||||||
|
return arc;
|
||||||
|
} else {
|
||||||
|
fst.readArcByContinuous(arc, in, targetIndex);
|
||||||
|
assert arc.label() == label;
|
||||||
|
return arc;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// Fixed length arcs in a binary search node.
|
// Fixed length arcs in a binary search node.
|
||||||
int idx = binarySearch(fst, arc, label);
|
int idx = binarySearch(fst, arc, label);
|
||||||
|
|
|
@ -137,7 +137,9 @@ public class TestFSTDirectAddressing extends LuceneTestCase {
|
||||||
directAddressingMemoryIncreasePercent));
|
directAddressingMemoryIncreasePercent));
|
||||||
System.out.println("num nodes = " + fstCompiler.nodeCount);
|
System.out.println("num nodes = " + fstCompiler.nodeCount);
|
||||||
long fixedLengthArcNodeCount =
|
long fixedLengthArcNodeCount =
|
||||||
fstCompiler.directAddressingNodeCount + fstCompiler.binarySearchNodeCount;
|
fstCompiler.directAddressingNodeCount
|
||||||
|
+ fstCompiler.binarySearchNodeCount
|
||||||
|
+ fstCompiler.continuousNodeCount;
|
||||||
System.out.println(
|
System.out.println(
|
||||||
"num fixed-length-arc nodes = "
|
"num fixed-length-arc nodes = "
|
||||||
+ fixedLengthArcNodeCount
|
+ fixedLengthArcNodeCount
|
||||||
|
@ -161,6 +163,13 @@ public class TestFSTDirectAddressing extends LuceneTestCase {
|
||||||
((double) (fstCompiler.directAddressingNodeCount)
|
((double) (fstCompiler.directAddressingNodeCount)
|
||||||
/ fixedLengthArcNodeCount
|
/ fixedLengthArcNodeCount
|
||||||
* 100)));
|
* 100)));
|
||||||
|
System.out.println(
|
||||||
|
"num continuous-arcs nodes = "
|
||||||
|
+ (fstCompiler.continuousNodeCount)
|
||||||
|
+ String.format(
|
||||||
|
Locale.ENGLISH,
|
||||||
|
" (%.2f %% of fixed-length-arc nodes)",
|
||||||
|
((double) (fstCompiler.continuousNodeCount) / fixedLengthArcNodeCount * 100)));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static FSTCompiler<Object> createFSTCompiler(float directAddressingMaxOversizingFactor) {
|
private static FSTCompiler<Object> createFSTCompiler(float directAddressingMaxOversizingFactor) {
|
||||||
|
@ -211,18 +220,25 @@ public class TestFSTDirectAddressing extends LuceneTestCase {
|
||||||
DataInput in = new ByteArrayDataInput(buf);
|
DataInput in = new ByteArrayDataInput(buf);
|
||||||
FST<BytesRef> fst = new FST<>(in, in, ByteSequenceOutputs.getSingleton());
|
FST<BytesRef> fst = new FST<>(in, in, ByteSequenceOutputs.getSingleton());
|
||||||
BytesRefFSTEnum<BytesRef> fstEnum = new BytesRefFSTEnum<>(fst);
|
BytesRefFSTEnum<BytesRef> fstEnum = new BytesRefFSTEnum<>(fst);
|
||||||
int binarySearchArcCount = 0, directAddressingArcCount = 0, listArcCount = 0;
|
int binarySearchArcCount = 0,
|
||||||
|
directAddressingArcCount = 0,
|
||||||
|
listArcCount = 0,
|
||||||
|
continuousArcCount = 0;
|
||||||
while (fstEnum.next() != null) {
|
while (fstEnum.next() != null) {
|
||||||
if (fstEnum.arcs[fstEnum.upto].bytesPerArc() == 0) {
|
if (fstEnum.arcs[fstEnum.upto].bytesPerArc() == 0) {
|
||||||
listArcCount++;
|
listArcCount++;
|
||||||
} else if (fstEnum.arcs[fstEnum.upto].nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
|
} else if (fstEnum.arcs[fstEnum.upto].nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
|
||||||
directAddressingArcCount++;
|
directAddressingArcCount++;
|
||||||
|
} else if (fstEnum.arcs[fstEnum.upto].nodeFlags() == FST.ARCS_FOR_CONTINUOUS) {
|
||||||
|
continuousArcCount++;
|
||||||
} else {
|
} else {
|
||||||
binarySearchArcCount++;
|
binarySearchArcCount++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
System.out.println(
|
System.out.println(
|
||||||
"direct addressing arcs = "
|
"continuous arcs = "
|
||||||
|
+ continuousArcCount
|
||||||
|
+ ", direct addressing arcs = "
|
||||||
+ directAddressingArcCount
|
+ directAddressingArcCount
|
||||||
+ ", binary search arcs = "
|
+ ", binary search arcs = "
|
||||||
+ binarySearchArcCount
|
+ binarySearchArcCount
|
||||||
|
|
|
@ -43,6 +43,26 @@ public class TestUtil extends LuceneTestCase {
|
||||||
assertEquals(-7, Util.binarySearch(fst, arc, 'P'));
|
assertEquals(-7, Util.binarySearch(fst, arc, 'P'));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testContinuous() throws Exception {
|
||||||
|
List<String> letters = Arrays.asList("A", "B", "C", "D", "E", "F", "G", "H");
|
||||||
|
FST<Object> fst = buildFST(letters, true, false);
|
||||||
|
FST.Arc<Object> first = fst.getFirstArc(new FST.Arc<>());
|
||||||
|
FST.Arc<Object> arc = new FST.Arc<>();
|
||||||
|
FST.BytesReader in = fst.getBytesReader();
|
||||||
|
|
||||||
|
for (String letter : letters) {
|
||||||
|
char c = letter.charAt(0);
|
||||||
|
arc = Util.readCeilArc(c, fst, first, arc, in);
|
||||||
|
assertNotNull(arc);
|
||||||
|
assertEquals(c, arc.label());
|
||||||
|
}
|
||||||
|
|
||||||
|
// in the middle
|
||||||
|
assertEquals('F', Util.readCeilArc('F', fst, first, arc, in).label());
|
||||||
|
// no following arcs
|
||||||
|
assertNull(Util.readCeilArc('A', fst, arc, arc, in));
|
||||||
|
}
|
||||||
|
|
||||||
public void testReadCeilArcPackedArray() throws Exception {
|
public void testReadCeilArcPackedArray() throws Exception {
|
||||||
List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T", "z");
|
List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T", "z");
|
||||||
verifyReadCeilArc(letters, true, false);
|
verifyReadCeilArc(letters, true, false);
|
||||||
|
|
Loading…
Reference in New Issue