LUCENE-8920: Reduce the memory used by direct addressing of arcs (#980)

This commit is contained in:
Bruno Roustant 2019-11-13 22:48:13 +01:00 committed by Adrien Grand
parent c1ac146454
commit 068b6babac
15 changed files with 1271 additions and 454 deletions

View File

@ -1084,7 +1084,7 @@ public final class OrdsSegmentTermsEnum extends BaseTermsEnum {
result.grow(1+upto);
fr.index.readFirstRealTargetArc(arc.target(), arc, fstReader);
if (arc.bytesPerArc() != 0 && arc.arcIdx() > Integer.MIN_VALUE) {
if (arc.bytesPerArc() != 0 && arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
// System.out.println(" array arcs");
int low = 0;
int high = arc.numArcs() -1;

View File

@ -178,4 +178,129 @@ public final class BitUtil {
return ((l >>> 1) ^ -(l & 1));
}
/**
 * Tests a single bit of a bit set that is stored as an array of longs.
 * <br>Example: bitIndex 66 designates the third lowest bit of the second long.
 *
 * @param bits The longs holding the bits.
 * @param numLongs How many leading longs of {@code bits} belong to the bit set.
 * @param bitIndex Zero-based index of the bit to test. Must satisfy
 *                 {@code 0 <= bitIndex < numLongs * Long.SIZE}.
 * @return {@code true} if the designated bit is 1.
 */
public static boolean isBitSet(long[] bits, int numLongs, int bitIndex) {
  assert numLongs >= 0 && numLongs <= bits.length && bitIndex >= 0 && bitIndex < numLongs * Long.SIZE
      : "bitIndex=" + bitIndex + " numLongs=" + numLongs + " bits.length=" + bits.length;
  final long word = bits[bitIndex / Long.SIZE];
  // A long shift only uses the low 6 bits of the shift distance, so this selects bit (bitIndex mod 64).
  final long probe = 1L << bitIndex;
  return (word & probe) != 0L;
}
/**
 * Returns the total number of 1 bits found in the first {@code numLongs} longs of {@code bits}.
 *
 * @param bits The longs holding the bits.
 * @param numLongs How many leading longs of {@code bits} to take into account.
 * @return The number of bits set in the considered longs.
 */
public static int countBits(long[] bits, int numLongs) {
  assert numLongs >= 0 && numLongs <= bits.length
      : "numLongs=" + numLongs + " bits.length=" + bits.length;
  int total = 0;
  int word = 0;
  while (word < numLongs) {
    total += Long.bitCount(bits[word]);
    word++;
  }
  return total;
}
/**
 * Returns how many bits are set strictly below the given zero-based bit index.
 * <br>Example: bitIndex 66 designates the third lowest bit of the second long, so the
 * count covers the whole first long plus the two lowest bits of the second long.
 *
 * @param bits The longs holding the bits.
 * @param numLongs How many leading longs of {@code bits} belong to the bit set.
 * @param bitIndex Exclusive upper bound of the counted range. Must satisfy
 *                 {@code 0 <= bitIndex <= numLongs * Long.SIZE}.
 * @return The number of 1 bits at indexes {@code [0, bitIndex)}.
 */
public static int countBitsUpTo(long[] bits, int numLongs, int bitIndex) {
  assert numLongs >= 0 && numLongs <= bits.length && bitIndex >= 0 && bitIndex <= numLongs * Long.SIZE
      : "bitIndex=" + bitIndex + " numLongs=" + numLongs + " bits.length=" + bits.length;
  // Longs located entirely below bitIndex are counted in full.
  final int fullLongs = bitIndex / Long.SIZE;
  int total = 0;
  int word = 0;
  while (word < fullLongs) {
    total += Long.bitCount(bits[word]);
    word++;
  }
  if (fullLongs >= numLongs) {
    // bitIndex sits exactly at the end of the considered longs: there is no partial long.
    return total;
  }
  // Keep only the bits below (bitIndex mod 64) of the partial long; long shifts are mod 64.
  final long lowMask = (1L << bitIndex) - 1L;
  return total + Long.bitCount(bits[fullLongs] & lowMask);
}
/**
 * Returns the index of the next bit set following the given bit zero-based index.
 * <br>For example with bits 100011:
 * the next bit set after index=-1 is at index=0;
 * the next bit set after index=0 is at index=1;
 * the next bit set after index=1 is at index=5;
 * there is no next bit set after index=5.
 *
 * @param bits The bits stored in an array of long for efficiency.
 * @param numLongs The number of longs in {@code bits} to consider. When it is 0 there is
 *                 nothing to scan and -1 is returned.
 * @param bitIndex The bit zero-based index. It must be greater than or equal to -1,
 *                 and strictly less than {@code numLongs * Long.SIZE}.
 * @return The zero-based index of the next bit set after the provided {@code bitIndex};
 *         or -1 if none.
 */
public static int nextBitSet(long[] bits, int numLongs, int bitIndex) {
  assert numLongs >= 0 && numLongs <= bits.length && bitIndex >= -1 && bitIndex < numLongs * Long.SIZE
      : "bitIndex=" + bitIndex + " numLongs=" + numLongs + " bits.length=" + bits.length;
  if (numLongs == 0) {
    // Empty range (only reachable with bitIndex == -1): without this guard the code below
    // would read bits[0] and following longs, which lie outside the considered range.
    return -1;
  }
  // Note that -1 / 64 == 0, so the scan starts in the first long for bitIndex == -1.
  int longIndex = bitIndex / Long.SIZE;
  // Prepare a mask with 1s on the left down to bitIndex exclusive.
  long mask = -(1L << (bitIndex + 1)); // Shifts are mod 64.
  // The mask is all 1s when (bitIndex + 1) is a multiple of 64. Unless bitIndex is -1, that
  // means bitIndex is the highest bit of its long and nothing remains to inspect in that long.
  long l = mask == -1 && bitIndex != -1 ? 0 : bits[longIndex] & mask;
  while (l == 0) {
    if (++longIndex == numLongs) {
      return -1;
    }
    l = bits[longIndex];
  }
  return Long.numberOfTrailingZeros(l) + longIndex * Long.SIZE;
}
/**
 * Finds the closest bit set located strictly before the given zero-based bit index.
 * <br>For example with bits 100011:
 * there is no previous bit set before index=0;
 * the previous bit set before index=1 is at index=0;
 * the previous bit set before index=5 is at index=1;
 * the previous bit set before index=64 is at index=5.
 *
 * @param bits The longs holding the bits.
 * @param numLongs How many leading longs of {@code bits} belong to the bit set.
 * @param bitIndex Exclusive upper bound of the search. Must satisfy
 *                 {@code 0 <= bitIndex <= numLongs * Long.SIZE}.
 * @return The zero-based index of the highest bit set below {@code bitIndex}, or -1 if none.
 */
public static int previousBitSet(long[] bits, int numLongs, int bitIndex) {
  assert numLongs >= 0 && numLongs <= bits.length && bitIndex >= 0 && bitIndex <= numLongs * Long.SIZE
      : "bitIndex=" + bitIndex + " numLongs=" + numLongs + " bits.length=" + bits.length;
  int word = bitIndex / Long.SIZE;
  // When bitIndex is the end of the range there is no partial long to look at. Otherwise keep
  // only the bits below (bitIndex mod 64) of the long containing bitIndex; long shifts are mod 64.
  long candidates = word == numLongs ? 0L : bits[word] & ((1L << bitIndex) - 1L);
  while (candidates == 0L) {
    if (word == 0) {
      return -1;
    }
    word--;
    candidates = bits[word];
  }
  return 63 - Long.numberOfLeadingZeros(candidates) + word * 64;
}
}

View File

@ -19,6 +19,7 @@ package org.apache.lucene.util.fst;
import java.io.IOException;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
@ -50,6 +51,30 @@ import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
public class Builder<T> {
/**
* Default oversizing factor used to decide whether to encode a node with direct addressing or binary search.
* Default is 1: ensure no oversizing on average.
* <p>
* This factor does not determine whether to encode a node with a list of variable length arcs or with
* fixed length arcs. It only determines the effective encoding of a node that is already known to be
* encoded with fixed length arcs.
* See {@code FST.shouldExpandNodeWithFixedLengthArcs()}
* and {@code FST.shouldExpandNodeWithDirectAddressing()}.
* <p>
* For English words we measured 217K nodes, only 3.27% nodes are encoded with fixed length arcs,
* and 99.99% of them with direct addressing. Overall FST memory reduced by 1.67%.
* <p>
* For worst case we measured 168K nodes, 50% of them are encoded with fixed length arcs,
* and 14% of them with direct addressing. Overall FST memory reduced by 0.8%.
* <p>
* Use {@code TestFstDirectAddressing.main()}
* and {@code TestFstDirectAddressing.testWorstCaseForDirectAddressing()}
* to evaluate a change.
*
* @see #setDirectAddressingMaxOversizingFactor
*/
static final float DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR = 1.0f;
private final NodeHash<T> dedupHash;
final FST<T> fst;
private final T NO_OUTPUT;
@ -83,12 +108,18 @@ public class Builder<T> {
long lastFrozenNode;
// Reused temporarily while building the FST:
int[] reusedBytesPerArc = new int[4];
int[] numBytesPerArc = new int[4];
int[] numLabelBytesPerArc = new int[numBytesPerArc.length];
final FixedLengthArcsBuffer fixedLengthArcsBuffer = new FixedLengthArcsBuffer();
long arcCount;
long nodeCount;
long binarySearchNodeCount;
long directAddressingNodeCount;
boolean allowArrayArcs;
boolean allowFixedLengthArcs;
float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR;
long directAddressingExpansionCredit;
BytesStore bytes;
@ -138,9 +169,9 @@ public class Builder<T> {
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
* singleton output object.
*
* @param allowArrayArcs Pass false to disable the array arc optimization
* while building the FST; this will make the resulting
* FST smaller but slower to traverse.
* @param allowFixedLengthArcs Pass false to disable the fixed length arc optimization (binary search or
* direct addressing) while building the FST; this will make the resulting FST smaller but slower to
* traverse.
*
* @param bytesPageBits How many bits wide to make each
* byte[] block in the BytesStore; if you know the FST
@ -149,12 +180,12 @@ public class Builder<T> {
*/
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
boolean allowArrayArcs, int bytesPageBits) {
boolean allowFixedLengthArcs, int bytesPageBits) {
this.minSuffixCount1 = minSuffixCount1;
this.minSuffixCount2 = minSuffixCount2;
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
this.shareMaxTailLength = shareMaxTailLength;
this.allowArrayArcs = allowArrayArcs;
this.allowFixedLengthArcs = allowFixedLengthArcs;
fst = new FST<>(inputType, outputs, bytesPageBits);
bytes = fst.bytes;
assert bytes != null;
@ -173,6 +204,27 @@ public class Builder<T> {
}
}
/**
 * Overrides the default maximum oversizing factor of the fixed-length arc array that is
 * tolerated to encode a node with direct addressing rather than binary search.
 * <p>
 * Setting this factor to a negative value (e.g. -1) effectively disables direct addressing,
 * only binary search nodes will be created.
 *
 * @param factor the new maximum oversizing factor.
 * @return this builder, to allow call chaining.
 * @see #DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR
 */
public Builder<T> setDirectAddressingMaxOversizingFactor(float factor) {
  this.directAddressingMaxOversizingFactor = factor;
  return this;
}
/**
 * Returns the maximum oversizing factor currently configured on this builder.
 *
 * @see #setDirectAddressingMaxOversizingFactor(float)
 */
public float getDirectAddressingMaxOversizingFactor() {
  return this.directAddressingMaxOversizingFactor;
}
/**
* Returns the {@code inputCount} recorded on the first frontier node ({@code frontier[0]}).
*/
public long getTermCount() {
return frontier[0].inputCount;
}
@ -639,4 +691,52 @@ public class Builder<T> {
}
}
}
/**
 * Scratch buffer, meant to be reused, in which a node with fixed length arcs
 * (binary search or direct addressing) is assembled.
 * <p>
 * All mutators return {@code this} so that calls can be chained.
 */
static class FixedLengthArcsBuffer {

  // 11 bytes is the largest header a node with fixed length arcs can need:
  // header(byte) + numArcs(vint) + numBytes(vint)
  private byte[] bytes = new byte[11];
  private final ByteArrayDataOutput bado = new ByteArrayDataOutput(bytes);

  /**
   * Makes sure the internal byte array holds at least {@code capacity} bytes.
   * When it is too small, a new larger array replaces it (the previous content is not
   * copied) and the output is reset onto the new array.
   */
  FixedLengthArcsBuffer ensureCapacity(int capacity) {
    if (capacity <= bytes.length) {
      return this;
    }
    bytes = new byte[ArrayUtil.oversize(capacity, Byte.BYTES)];
    bado.reset(bytes);
    return this;
  }

  /** Resets the output onto the current internal byte array. */
  FixedLengthArcsBuffer resetPosition() {
    bado.reset(bytes);
    return this;
  }

  /** Appends one byte through the output. */
  FixedLengthArcsBuffer writeByte(byte b) {
    bado.writeByte(b);
    return this;
  }

  /** Appends an int in variable-length (vint) form through the output. */
  FixedLengthArcsBuffer writeVInt(int i) {
    try {
      bado.writeVInt(i);
      return this;
    } catch (IOException e) { // Never thrown.
      throw new RuntimeException(e);
    }
  }

  /** Returns the position reported by the output. */
  int getPosition() {
    return bado.getPosition();
  }

  /** Returns the internal byte array itself, not a copy. */
  byte[] getBytes() {
    return bytes;
  }
}
}

View File

@ -239,6 +239,27 @@ class BytesStore extends DataOutput implements Accountable {
}
}
/**
 * Copies {@code len} bytes of this store, starting at absolute position {@code src},
 * into {@code dest} starting at {@code offset}. The copy proceeds block by block
 * when the requested range spans several blocks.
 */
public void copyBytes(long src, byte[] dest, int offset, int len) {
  int blockIdx = (int) (src >> blockBits);
  int posInBlock = (int) (src & blockMask);
  byte[] current = blocks.get(blockIdx);
  int remaining = len;
  int destPos = offset;
  while (remaining > 0) {
    // Number of bytes available from posInBlock to the end of the current block.
    final int available = blockSize - posInBlock;
    if (remaining <= available) {
      // The rest of the request fits in the current block.
      System.arraycopy(current, posInBlock, dest, destPos, remaining);
      return;
    }
    // Drain the current block, then continue at the start of the next one.
    System.arraycopy(current, posInBlock, dest, destPos, available);
    current = blocks.get(++blockIdx);
    posInBlock = 0;
    remaining -= available;
    destPos += available;
  }
}
/** Writes an int at the absolute position without
* changing the current pointer. */
public void writeInt(long pos, int value) {

File diff suppressed because it is too large Load Diff

View File

@ -138,12 +138,13 @@ abstract class FSTEnum<T> {
while(arc != null) {
int targetLabel = getTargetLabel();
//System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") vs targetLabel=" + targetLabel);
if (arc.bytesPerArc() != 0 && arc.label() != -1) {
if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) {
// Arcs are in an array
final FST.BytesReader in = fst.getBytesReader();
if (arc.arcIdx() == Integer.MIN_VALUE) {
arc = doSeekCeilArrayWithGaps(arc, targetLabel, in);
if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
arc = doSeekCeilArrayDirectAddressing(arc, targetLabel, in);
} else {
assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH;
arc = doSeekCeilArrayPacked(arc, targetLabel, in);
}
} else {
@ -152,17 +153,12 @@ abstract class FSTEnum<T> {
}
}
private FST.Arc<T> doSeekCeilArrayWithGaps(final FST.Arc<T> arc, final int targetLabel, final FST.BytesReader in) throws IOException {
// The array is addressed directly by label and may contain holes.
private FST.Arc<T> doSeekCeilArrayDirectAddressing(final FST.Arc<T> arc, final int targetLabel, final FST.BytesReader in) throws IOException {
// The array is addressed directly by label, with presence bits to compute the actual arc offset.
in.setPosition(arc.posArcsStart());
in.skipBytes(1);
int firstLabel = fst.readLabel(in);
int arcOffset = targetLabel - firstLabel;
if (arcOffset >= arc.numArcs()) {
// target is beyond the last arc
fst.readArcAtPosition(arc, in, arc.posArcsStart() - (arc.numArcs() - 1) * arc.bytesPerArc());
assert arc.isLast();
int targetIndex = targetLabel - arc.firstLabel();
if (targetIndex >= arc.numArcs()) {
// Target is beyond the last arc, out of label range.
// Dead end (target is after the last arc);
// rollback to last fork then push
upto--;
@ -180,17 +176,13 @@ abstract class FSTEnum<T> {
upto--;
}
} else {
// TODO: if firstLabel == targetLabel
long pos;
if (arcOffset >= 0) {
pos = arc.posArcsStart() - (arc.bytesPerArc() * arcOffset);
} else {
pos = arc.posArcsStart();
}
fst.readArcAtPosition(arc, in, pos);
if (arc.label() == targetLabel) {
if (targetIndex < 0) {
targetIndex = -1;
} else if (arc.bitTable().isBitSet(targetIndex)) {
fst.readArcByDirectAddressing(arc, in, targetIndex);
assert arc.label() == targetLabel;
// found -- copy pasta from below
output[upto] = fst.outputs.add(output[upto-1], arc.output());
output[upto] = fst.outputs.add(output[upto - 1], arc.output());
if (targetLabel == FST.END_LABEL) {
return null;
}
@ -198,7 +190,10 @@ abstract class FSTEnum<T> {
incr();
return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
}
// not found, return the next highest
// Not found, return the next arc (ceil).
int ceilIndex = arc.bitTable().nextBitSet(targetIndex);
assert ceilIndex != -1;
fst.readArcByDirectAddressing(arc, in, ceilIndex);
assert arc.label() > targetLabel;
pushFirst();
return null;
@ -319,9 +314,10 @@ abstract class FSTEnum<T> {
if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) {
// Arcs are in an array
final FST.BytesReader in = fst.getBytesReader();
if (arc.arcIdx() == Integer.MIN_VALUE) {
arc = doSeekFloorArrayWithGaps(arc, targetLabel, in);
if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
arc = doSeekFloorArrayDirectAddressing(arc, targetLabel, in);
} else {
assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH;
arc = doSeekFloorArrayPacked(arc, targetLabel, in);
}
} else {
@ -330,46 +326,25 @@ abstract class FSTEnum<T> {
}
}
private FST.Arc<T> doSeekFloorArrayWithGaps(FST.Arc<T> arc, int targetLabel, final FST.BytesReader in) throws IOException {
// The array is addressed directly by label and may contain holes.
in.setPosition(arc.posArcsStart());
in.skipBytes(1);
int firstLabel = fst.readLabel(in);
int targetOffset = targetLabel - firstLabel;
if (targetOffset < 0) {
//System.out.println(" before first"); Very first arc is after our target TODO: if each
// arc could somehow read the arc just before, we can save this re-scan. The ceil case
// doesn't need this because it reads the next arc instead:
while(true) {
// First, walk backwards until we find a first arc
// that's before our target label:
fst.readFirstTargetArc(getArc(upto-1), arc, fstReader);
if (arc.label() < targetLabel) {
// Then, scan forwards to the arc just before
// the targetLabel:
while(!arc.isLast() && fst.readNextArcLabel(arc, in) < targetLabel) {
fst.readNextArc(arc, fstReader);
}
pushLast();
return null;
}
upto--;
if (upto == 0) {
return null;
}
targetLabel = getTargetLabel();
arc = getArc(upto);
}
private FST.Arc<T> doSeekFloorArrayDirectAddressing(FST.Arc<T> arc, int targetLabel, FST.BytesReader in) throws IOException {
// The array is addressed directly by label, with presence bits to compute the actual arc offset.
int targetIndex = targetLabel - arc.firstLabel();
if (targetIndex < 0) {
// Before first arc.
return backtrackToFloorArc(arc, targetLabel, in);
} else if (targetIndex >= arc.numArcs()) {
// After last arc.
fst.readArcByDirectAddressing(arc, in, arc.numArcs() - 1);
assert arc.label() < targetLabel;
assert arc.isLast();
pushLast();
return null;
} else {
if (targetOffset >= arc.numArcs()) {
fst.readArcAtPosition(arc, in, arc.posArcsStart() - arc.bytesPerArc() * (arc.numArcs() - 1));
assert arc.isLast();
assert arc.label() < targetLabel: "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel;
pushLast();
return null;
}
fst.readArcAtPosition(arc, in, arc.posArcsStart() - arc.bytesPerArc() * targetOffset);
if (arc.label() == targetLabel) {
// Within label range.
if (arc.bitTable().isBitSet(targetIndex)) {
fst.readArcByDirectAddressing(arc, in, targetIndex);
assert arc.label() == targetLabel;
// found -- copy pasta from below
output[upto] = fst.outputs.add(output[upto-1], arc.output());
if (targetLabel == FST.END_LABEL) {
@ -379,18 +354,99 @@ abstract class FSTEnum<T> {
incr();
return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
}
// Scan backwards to find a floor arc that is not missing
for (long arcOffset = arc.posArcsStart() - targetOffset * arc.bytesPerArc(); arcOffset <= arc.posArcsStart(); arcOffset += arc.bytesPerArc()) {
// TODO: we can do better here by skipping missing arcs
fst.readArcAtPosition(arc, in, arcOffset);
if (arc.label() < targetLabel) {
assert arc.isLast() || fst.readNextArcLabel(arc, in) > targetLabel;
pushLast();
return null;
// Scan backwards to find a floor arc.
int floorIndex = arc.bitTable().previousBitSet(targetIndex);
assert floorIndex != -1;
fst.readArcByDirectAddressing(arc, in, floorIndex);
assert arc.label() < targetLabel;
assert arc.isLast() || fst.readNextArcLabel(arc, in) > targetLabel;
pushLast();
return null;
}
}
/**
* Backtracks until it finds a node whose first arc is before our target label.
* Then, on that node, finds the arc just before the targetLabel and pushes it with
* {@code pushLast()}.
* <p>
* Each backtracking step decrements {@code upto} and reloads both the target label and the
* arc for the new {@code upto}; the walk stops when {@code upto} reaches 0.
*
* @param arc the arc instance that receives the arcs read while backtracking.
* @param targetLabel the label for which a floor arc is searched at the current depth.
* @param in the reader used to read arcs of fixed-length-arc nodes and to peek at next arc labels.
* @return null to continue the seek floor recursion loop (every exit path returns null).
*/
private FST.Arc<T> backtrackToFloorArc(FST.Arc<T> arc, int targetLabel, final FST.BytesReader in) throws IOException {
while (true) {
// First, walk backwards until we find a node whose first arc is before our target label.
fst.readFirstTargetArc(getArc(upto-1), arc, fstReader);
if (arc.label() < targetLabel) {
// Then on this node, find the arc just before the targetLabel.
// If the first arc is also the last one, it is the floor arc and nothing more is read.
if (!arc.isLast()) {
if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) {
// Fixed length arcs: dispatch on the node encoding.
if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
findNextFloorArcBinarySearch(arc, targetLabel, in);
} else {
assert arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING;
findNextFloorArcDirectAddressing(arc, targetLabel, in);
}
} else {
// Otherwise scan forward while the next arc label is still below the target.
while (!arc.isLast() && fst.readNextArcLabel(arc, in) < targetLabel) {
fst.readNextArc(arc, fstReader);
}
}
}
assert arc.label() < targetLabel;
assert arc.isLast() || fst.readNextArcLabel(arc, in) >= targetLabel;
pushLast();
return null;
}
// The first arc of this node is not before the target: go back one more level.
upto--;
if (upto == 0) {
return null;
}
targetLabel = getTargetLabel();
arc = getArc(upto);
}
}
/**
 * Finds and reads an arc on the current node whose label is strictly less than the given label.
 * Skips the first arc and finds the next floor arc; reads nothing if the floor arc is the first
 * arc itself (in this case it has already been read).
 * <p>
 * Precondition: the given arc is the first arc of the node.
 *
 * @param arc the first arc of a direct addressing node; overwritten with the floor arc when
 *            that arc is not the first one.
 * @param targetLabel the label whose floor arc is searched; must not be below the first label
 *                    of the node.
 * @param in the reader used to read the arc.
 */
private void findNextFloorArcDirectAddressing(FST.Arc<T> arc, int targetLabel, final FST.BytesReader in) throws IOException {
  assert arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING;
  assert arc.label() != FST.END_LABEL;
  assert arc.label() == arc.firstLabel();
  if (arc.numArcs() > 1) {
    int targetIndex = targetLabel - arc.firstLabel();
    assert targetIndex >= 0;
    if (targetIndex >= arc.numArcs()) {
      // Beyond last arc. Take last arc.
      fst.readArcByDirectAddressing(arc, in, arc.numArcs() - 1);
    } else {
      // Take the preceding arc, even if the target is present.
      int floorIndex = arc.bitTable().previousBitSet(targetIndex);
      // floorIndex 0 is the first arc, which is already loaded; -1 means no preceding arc.
      if (floorIndex > 0) {
        fst.readArcByDirectAddressing(arc, in, floorIndex);
      }
    }
  }
}
/**
 * Counterpart of {@link #findNextFloorArcDirectAddressing} for a node encoded for binary search:
 * reads the arc whose label is strictly less than {@code targetLabel}, unless that arc is
 * the first arc of the node, which is already loaded in {@code arc}.
 * <p>
 * Precondition: the given arc is the first arc of the node (index 0).
 */
private void findNextFloorArcBinarySearch(FST.Arc<T> arc, int targetLabel, FST.BytesReader in) throws IOException {
  assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH;
  assert arc.label() != FST.END_LABEL;
  assert arc.arcIdx() == 0;
  if (arc.numArcs() <= 1) {
    // Single arc: the first arc is the only candidate.
    return;
  }
  final int found = Util.binarySearch(fst, arc, targetLabel);
  assert found != -1;
  if (found > 1) {
    // Exact match at index found: the floor is the arc right before it.
    fst.readArcByIndex(arc, in, found - 1);
  } else if (found < -2) {
    // No exact match: the search result encodes the floor index as (-2 - found).
    fst.readArcByIndex(arc, in, -2 - found);
  }
  // For any other result nothing is read and the already loaded first arc is kept.
}
@ -412,34 +468,10 @@ abstract class FSTEnum<T> {
incr();
return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
} else if (idx == -1) {
//System.out.println(" before first");
// Very first arc is after our target
// TODO: if each arc could somehow read the arc just
// before, we can save this re-scan. The ceil case
// doesn't need this because it reads the next arc
// instead:
while(true) {
// First, walk backwards until we find a first arc
// that's before our target label:
fst.readFirstTargetArc(getArc(upto-1), arc, fstReader);
if (arc.label() < targetLabel) {
// Then, scan forwards to the arc just before
// the targetLabel:
while(!arc.isLast() && fst.readNextArcLabel(arc, in) < targetLabel) {
fst.readNextArc(arc, fstReader);
}
pushLast();
return null;
}
upto--;
if (upto == 0) {
return null;
}
targetLabel = getTargetLabel();
arc = getArc(upto);
}
// Before first arc.
return backtrackToFloorArc(arc, targetLabel, in);
} else {
// There is a floor arc; idx will be {@code -1 - (floor + 1)}.
// There is a floor arc; idx will be (-1 - (floor + 1)).
fst.readArcByIndex(arc, in, -2 - idx);
assert arc.isLast() || fst.readNextArcLabel(arc, in) > targetLabel;
assert arc.label() < targetLabel: "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel;

View File

@ -41,9 +41,22 @@ final class NodeHash<T> {
private boolean nodesEqual(Builder.UnCompiledNode<T> node, long address) throws IOException {
fst.readFirstRealTargetArc(address, scratchArc, in);
if (scratchArc.isPackedArray() && node.numArcs != scratchArc.numArcs()) {
return false;
// Fail fast for a node with fixed length arcs.
if (scratchArc.bytesPerArc() != 0) {
if (scratchArc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
if (node.numArcs != scratchArc.numArcs()) {
return false;
}
} else {
assert scratchArc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING;
if ((node.arcs[node.numArcs - 1].label - node.arcs[0].label + 1) != scratchArc.numArcs()
|| node.numArcs != scratchArc.bitTable().countBits()) {
return false;
}
}
}
for(int arcUpto=0; arcUpto < node.numArcs; arcUpto++) {
final Builder.Arc<T> arc = node.arcs[arcUpto];
if (arc.label != scratchArc.label() ||

View File

@ -151,7 +151,7 @@ public final class Util {
fst.readFirstRealTargetArc(arc.target(), arc, in);
if (arc.bytesPerArc() != 0 && arc.arcIdx() > Integer.MIN_VALUE) {
if (arc.bytesPerArc() != 0 && arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
int low = 0;
int high = arc.numArcs() -1;
@ -940,18 +940,27 @@ public final class Util {
}
fst.readFirstTargetArc(follow, arc, in);
if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) {
if (arc.arcIdx() == Integer.MIN_VALUE) {
// Arcs are in an array-with-gaps
int offset = label - arc.label();
if (offset >= arc.numArcs()) {
if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
// Fixed length arcs in a direct addressing node.
int targetIndex = label - arc.label();
if (targetIndex >= arc.numArcs()) {
return null;
} else if (offset < 0) {
} else if (targetIndex < 0) {
return arc;
} else {
return fst.readArcAtPosition(arc, in, arc.posArcsStart() - offset * arc.bytesPerArc());
if (arc.bitTable().isBitSet(targetIndex)) {
fst.readArcByDirectAddressing(arc, in, targetIndex);
assert arc.label() == label;
} else {
int ceilIndex = arc.bitTable().nextBitSet(targetIndex);
assert ceilIndex != -1;
fst.readArcByDirectAddressing(arc, in, ceilIndex);
assert arc.label() > label;
}
return arc;
}
}
// Arcs are packed array -- use binary search to find the target.
// Fixed length arcs in a binary search node.
int idx = binarySearch(fst, arc, label);
if (idx >= 0) {
return fst.readArcByIndex(arc, in, idx);
@ -964,7 +973,8 @@ public final class Util {
return fst.readArcByIndex(arc, in , idx);
}
// Linear scan
// Variable length arcs in a linear scan list,
// or special arc with label == FST.END_LABEL.
fst.readFirstRealTargetArc(follow.target(), arc, in);
while (true) {
@ -995,6 +1005,7 @@ public final class Util {
* @throws IOException when the FST reader does
*/
static <T> int binarySearch(FST<T> fst, FST.Arc<T> arc, int targetLabel) throws IOException {
assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH : "Arc is not encoded as packed array for binary search (nodeFlags=" + arc.nodeFlags() + ")";
BytesReader in = fst.getBytesReader();
int low = arc.arcIdx();
int mid = 0;

View File

@ -0,0 +1,85 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util;
/**
 * Randomized checks of {@link BitUtil#nextBitSet} and {@link BitUtil#previousBitSet},
 * cross-validated against {@link BitUtil#countBitsUpTo}, {@link BitUtil#countBits}
 * and {@link BitUtil#isBitSet}.
 */
public class TestBitUtil extends LuceneTestCase {

  /** Number of random bit sets verified by each test method. */
  private static final int NUM_ROUNDS = 10000;

  public void testNextBitSet() {
    for (int round = 0; round < NUM_ROUNDS; round++) {
      final long[] bits = buildRandomBits();
      // The last long of the array is deliberately left outside of the considered range.
      final int numLong = bits.length - 1;
      final int endExclusive = 64 * numLong;
      // Verify nextBitSet with countBitsUpTo for all bit indexes.
      for (int bitIndex = -1; bitIndex < endExclusive; bitIndex++) {
        final String where = where(round, bitIndex);
        final int nextIndex = BitUtil.nextBitSet(bits, numLong, bitIndex);
        if (nextIndex == -1) {
          // Nothing set after bitIndex: counting through bitIndex must already give the total.
          assertEquals("No next bit set, so expected no bit count diff" + where,
              BitUtil.countBitsUpTo(bits, numLong, bitIndex + 1), BitUtil.countBits(bits, numLong));
          continue;
        }
        assertTrue("Expected next bit set at nextIndex=" + nextIndex + where,
            BitUtil.isBitSet(bits, numLong, nextIndex));
        // Exactly one more bit must be counted when going from bitIndex through nextIndex.
        assertEquals("Next bit set at nextIndex=" + nextIndex
                + " so expected bit count diff of 1" + where,
            BitUtil.countBitsUpTo(bits, numLong, bitIndex + 1) + 1,
            BitUtil.countBitsUpTo(bits, numLong, nextIndex + 1));
      }
    }
  }

  public void testPreviousBitSet() {
    for (int round = 0; round < NUM_ROUNDS; round++) {
      final long[] bits = buildRandomBits();
      // The last long of the array is deliberately left outside of the considered range.
      final int numLong = bits.length - 1;
      final int endInclusive = 64 * numLong;
      // Verify previousBitSet with countBitsUpTo for all bit indexes.
      for (int bitIndex = 0; bitIndex <= endInclusive; bitIndex++) {
        final String where = where(round, bitIndex);
        final int previousIndex = BitUtil.previousBitSet(bits, numLong, bitIndex);
        if (previousIndex == -1) {
          assertEquals("No previous bit set, so expected bit count 0" + where,
              0, BitUtil.countBitsUpTo(bits, numLong, bitIndex));
          continue;
        }
        assertTrue("Expected previous bit set at previousIndex=" + previousIndex + where,
            BitUtil.isBitSet(bits, numLong, previousIndex));
        final int bitCount = BitUtil.countBitsUpTo(bits, numLong, Math.min(bitIndex + 1, numLong * Long.SIZE));
        // The bit at bitIndex itself, when it exists and is set, is not "previous".
        final boolean currentBitSet = bitIndex < numLong * Long.SIZE && BitUtil.isBitSet(bits, numLong, bitIndex);
        final int expectedPreviousBitCount = currentBitSet ? bitCount - 1 : bitCount;
        assertEquals("Previous bit set at previousIndex=" + previousIndex
                + " with current bitCount=" + bitCount
                + " so expected previousBitCount=" + expectedPreviousBitCount
                + where,
            expectedPreviousBitCount, BitUtil.countBitsUpTo(bits, numLong, previousIndex + 1));
      }
    }
  }

  /** Builds the common suffix of the assertion messages. */
  private static String where(int i, int bitIndex) {
    return " (i=" + i + " bitIndex=" + bitIndex + ")";
  }

  /** Creates an array of 2 to 4 random longs, each one having a 1-in-4 chance of being zero. */
  private long[] buildRandomBits() {
    final long[] bits = new long[random().nextInt(3) + 2];
    for (int j = 0; j < bits.length; j++) {
      // Bias towards zeros which require special logic.
      if (random().nextInt(4) == 0) {
        bits[j] = 0L;
      } else {
        bits[j] = random().nextLong();
      }
    }
    return bits;
  }
}

View File

@ -303,7 +303,7 @@ public class TestFSTs extends LuceneTestCase {
// Runs testRandomWords with a first argument drawn from TestUtil.nextInt(random(), 50000, 60000)
// and a second argument of 1.
public void testBigSet() throws IOException {
testRandomWords(TestUtil.nextInt(random(), 50000, 60000), 1);
}
// Build FST for all unique terms in the test line docs
// file, up until a doc limit
public void testRealTerms() throws Exception {
@ -1078,10 +1078,10 @@ public class TestFSTs extends LuceneTestCase {
int children = verifyStateAndBelow(fst, new FST.Arc<>().copyFrom(arc), depth + 1);
assertEquals(
expanded,
(depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE &&
children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) ||
children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP);
(depth <= FST.FIXED_LENGTH_ARC_SHALLOW_DEPTH &&
children >= FST.FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS) ||
children >= FST.FIXED_LENGTH_ARC_DEEP_NUM_ARCS,
expanded);
if (arc.isLast()) break;
}
@ -1092,8 +1092,8 @@ public class TestFSTs extends LuceneTestCase {
}
// Sanity check.
assertTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP);
assertTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0);
assertTrue(FST.FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS < FST.FIXED_LENGTH_ARC_DEEP_NUM_ARCS);
assertTrue(FST.FIXED_LENGTH_ARC_SHALLOW_DEPTH >= 0);
SyntheticData s = new SyntheticData();
@ -1635,4 +1635,23 @@ public class TestFSTs extends LuceneTestCase {
// expected
}
}
/**
 * Builds a small FST from the terms "ab", "ac" and "bd" with outputs 3, 5 and 7,
 * then checks that each term maps back to its output.
 */
public void testSimpleDepth() throws Exception {
  final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);

  // Terms are listed in the order in which they are added to the builder.
  final BytesRef[] terms = {new BytesRef("ab"), new BytesRef("ac"), new BytesRef("bd")};
  final long[] expected = {3L, 5L, 7L};

  for (int i = 0; i < terms.length; i++) {
    builder.add(Util.toIntsRef(terms[i], new IntsRefBuilder()), expected[i]);
  }

  final FST<Long> fst = builder.finish();

  for (int i = 0; i < terms.length; i++) {
    assertEquals(expected[i], (long) Util.get(fst, terms[i]));
  }
}
}

View File

@ -1,105 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.fst;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.LuceneTestCase;
/**
 * Tests FSTs built with array arcs enabled, plus a small command-line tool ({@link #main}) that
 * reports how the arcs of a serialized FST are encoded.
 */
public class TestFstDirect extends LuceneTestCase {

  /**
   * Builds an FST from six two-letter words whose first letters run from 'a' to 'g' with 'e'
   * missing, and checks that each word is found by an exact seek.
   */
  public void testDenseWithGap() throws Exception {
    List<String> words = Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm");
    List<BytesRef> entries = new ArrayList<>();
    for (String word : words) {
      // The words are plain ASCII, so the UTF-8 bytes produced by BytesRef(CharSequence) are the
      // same bytes the ASCII charset would give, without looking a charset up by name.
      entries.add(new BytesRef(word));
    }
    final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(buildFST(entries));
    for (BytesRef entry : entries) {
      assertNotNull(entry.utf8ToString() + " not found", fstEnum.seekExact(entry));
    }
  }

  /**
   * Builds an FST from the 3-byte big-endian encodings of every multiple of 4 below 1,000,000 and
   * checks that the resulting FST stays small.
   */
  public void testDeDupTails() throws Exception {
    List<BytesRef> entries = new ArrayList<>();
    for (int i = 0; i < 1000000; i += 4) {
      byte[] b = new byte[3];
      int val = i;
      // Fill from the last byte backwards so the most significant byte comes first.
      for (int j = b.length - 1; j >= 0; --j) {
        b[j] = (byte) (val & 0xff);
        val >>= 8;
      }
      entries.add(new BytesRef(b));
    }
    long size = buildFST(entries).ramBytesUsed();
    // Size is 1664 when we use only list-encoding. We were previously failing to ever de-dup
    // arrays-with-gaps, which led this case to blow up.
    assertTrue("FST size = " + size + " bytes, expected < 3000", size < 3000);
  }

  /**
   * Builds an FST without outputs from the given entries, in iteration order.
   * An entry equal to the one immediately before it is skipped.
   */
  private FST<Object> buildFST(List<BytesRef> entries) throws Exception {
    final Outputs<Object> outputs = NoOutputs.getSingleton();
    final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
    BytesRef last = null;
    for (BytesRef entry : entries) {
      if (entry.equals(last) == false) {
        b.add(Util.toIntsRef(entry, new IntsRefBuilder()), outputs.getNoOutput());
      }
      last = entry;
    }
    return b.finish();
  }

  /** Prints a line formatted with {@link Locale#ROOT} to standard output. */
  private static void printf(String format, Object ... values) {
    System.out.println(String.format(Locale.ROOT, format, values));
  }

  /** Converts nanoseconds to whole milliseconds (integer division). */
  private static long nsToMs(long ns) {
    return ns / 1_000_000;
  }

  /**
   * Reads a serialized FST with byte-sequence outputs from the file given as first argument,
   * enumerates it and prints how many of the visited arcs were list, direct-array or
   * sparse-array arcs.
   *
   * @param args {@code args[0]} is the path of the FST file
   * @throws IllegalArgumentException if no file path is given
   */
  public static void main(String... args) throws Exception {
    if (args.length < 1) {
      throw new IllegalArgumentException("Missing argument: path of the FST file");
    }
    byte[] buf = Files.readAllBytes(Paths.get(args[0]));
    DataInput in = new ByteArrayDataInput(buf);
    FST<BytesRef> fst = new FST<>(in, ByteSequenceOutputs.getSingleton());
    BytesRefFSTEnum<BytesRef> fstEnum = new BytesRefFSTEnum<>(fst);
    int sparseArrayArcCount = 0, directArrayArcCount = 0, listArcCount = 0;
    while (fstEnum.next() != null) {
      FST.Arc<BytesRef> arc = fstEnum.arcs[fstEnum.upto];
      if (arc.bytesPerArc() == 0) {
        // No fixed arc size: counted as a list arc.
        listArcCount++;
      } else if (arc.arcIdx() == Integer.MIN_VALUE) {
        directArrayArcCount++;
      } else {
        sparseArrayArcCount++;
      }
    }
    System.out.println("direct arcs = " + directArrayArcCount + ", sparse arcs = " + sparseArrayArcCount +
        ", list arcs = " + listArcCount);
  }
}

View File

@ -0,0 +1,212 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.fst;
import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.LuceneTestCase;
/**
 * Tests FSTs built with direct addressing of arcs enabled. Also offers two command-line tools
 * through {@link #main}: one counts the arcs of a serialized FST per encoding, the other measures
 * the size change that direct addressing causes on a list of words.
 */
public class TestFstDirectAddressing extends LuceneTestCase {

  /**
   * Builds an FST from six two-letter words whose first letters run from 'a' to 'g' with 'e'
   * missing, and checks that each word is found by an exact seek.
   */
  public void testDenseWithGap() throws Exception {
    List<String> words = Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm");
    List<BytesRef> entries = new ArrayList<>();
    for (String word : words) {
      entries.add(new BytesRef(word.getBytes(StandardCharsets.US_ASCII)));
    }
    final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(buildFST(entries));
    for (BytesRef entry : entries) {
      assertNotNull(entry.utf8ToString() + " not found", fstEnum.seekExact(entry));
    }
  }

  /**
   * Builds an FST from the 3-byte big-endian encodings of every multiple of 4 below 1,000,000 and
   * checks that the resulting FST stays small.
   */
  public void testDeDupTails() throws Exception {
    List<BytesRef> entries = new ArrayList<>();
    for (int i = 0; i < 1000000; i += 4) {
      byte[] b = new byte[3];
      int val = i;
      // Fill from the last byte backwards so the most significant byte comes first.
      for (int j = b.length - 1; j >= 0; --j) {
        b[j] = (byte) (val & 0xff);
        val >>= 8;
      }
      entries.add(new BytesRef(b));
    }
    long size = buildFST(entries).ramBytesUsed();
    // Size is 1648 when we use only list-encoding. We were previously failing to ever de-dup
    // direct addressing, which led this case to blow up.
    assertTrue("FST size = " + size + " bytes, expected <= 1080", size <= 1080);
  }

  /**
   * Builds the same random word set twice, once with direct addressing disabled and once with the
   * default oversizing factor, and fails if direct addressing grows the FST by 1% or more.
   */
  public void testWorstCaseForDirectAddressing() throws Exception {
    // This test will fail if there is more than 1% memory increase with direct addressing in this worst case.
    final double MEMORY_INCREASE_LIMIT_PERCENT = 1d;
    final int NUM_WORDS = 1000000;

    // Generate words with specially crafted bytes.
    Set<BytesRef> wordSet = new HashSet<>();
    for (int i = 0; i < NUM_WORDS; ++i) {
      byte[] b = new byte[5];
      random().nextBytes(b);
      for (int j = 0; j < b.length; ++j) {
        b[j] &= 0xfc; // Make this byte a multiple of 4.
      }
      wordSet.add(new BytesRef(b));
    }
    List<BytesRef> wordList = new ArrayList<>(wordSet);
    Collections.sort(wordList);

    // Disable direct addressing and measure the FST size.
    Builder<Object> builder = createBuilder(-1f);
    FST<Object> fst = buildFST(wordList, builder);
    long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed();

    // Enable direct addressing and measure the FST size.
    builder = createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
    fst = buildFST(wordList, builder);
    long ramBytesUsed = fst.ramBytesUsed();

    // Compute the size increase in percents.
    double directAddressingMemoryIncreasePercent = ((double) ramBytesUsed / ramBytesUsedNoDirectAddressing - 1) * 100;

    // printStats(builder, ramBytesUsed, directAddressingMemoryIncreasePercent);

    // Verify the FST size does not exceed the limit.
    assertTrue("FST size exceeds limit, size = " + ramBytesUsed
            + ", increase = " + directAddressingMemoryIncreasePercent + " %"
            + ", limit = " + MEMORY_INCREASE_LIMIT_PERCENT + " %",
        directAddressingMemoryIncreasePercent < MEMORY_INCREASE_LIMIT_PERCENT);
  }

  /**
   * Prints the builder's oversizing factor, the FST size in MB with its relative increase, and the
   * node counts per encoding kind as reported by the builder.
   */
  private static void printStats(Builder<Object> builder, long ramBytesUsed, double directAddressingMemoryIncreasePercent) {
    System.out.println("directAddressingMaxOversizingFactor = " + builder.getDirectAddressingMaxOversizingFactor());
    System.out.println("ramBytesUsed = "
        + String.format(Locale.ENGLISH, "%.2f MB", ramBytesUsed / 1024d / 1024d)
        + String.format(Locale.ENGLISH, " (%.2f %% increase with direct addressing)", directAddressingMemoryIncreasePercent));
    System.out.println("num nodes = " + builder.nodeCount);
    long fixedLengthArcNodeCount = builder.directAddressingNodeCount + builder.binarySearchNodeCount;
    System.out.println("num fixed-length-arc nodes = " + fixedLengthArcNodeCount
        + String.format(Locale.ENGLISH, " (%.2f %% of all nodes)",
        ((double) fixedLengthArcNodeCount / builder.nodeCount * 100)));
    System.out.println("num binary-search nodes = " + (builder.binarySearchNodeCount)
        + String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)",
        ((double) (builder.binarySearchNodeCount) / fixedLengthArcNodeCount * 100)));
    System.out.println("num direct-addressing nodes = " + (builder.directAddressingNodeCount)
        + String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)",
        ((double) (builder.directAddressingNodeCount) / fixedLengthArcNodeCount * 100)));
  }

  /**
   * Creates a builder without outputs, configured with the given direct-addressing oversizing
   * factor. The tests in this class pass {@code -1f} to disable direct addressing.
   */
  private static Builder<Object> createBuilder(float directAddressingMaxOversizingFactor) {
    return new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, NoOutputs.getSingleton(), true, 15)
        .setDirectAddressingMaxOversizingFactor(directAddressingMaxOversizingFactor);
  }

  /** Builds an FST from the entries using a builder with the default oversizing factor. */
  private FST<Object> buildFST(List<BytesRef> entries) throws Exception {
    return buildFST(entries, createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR));
  }

  /**
   * Adds the entries to the builder in iteration order and finishes the FST.
   * An entry equal to the one immediately before it is skipped.
   */
  private static FST<Object> buildFST(List<BytesRef> entries, Builder<Object> builder) throws Exception {
    BytesRef last = null;
    for (BytesRef entry : entries) {
      if (entry.equals(last) == false) {
        builder.add(Util.toIntsRef(entry, new IntsRefBuilder()), NoOutputs.getSingleton().getNoOutput());
      }
      last = entry;
    }
    return builder.finish();
  }

  /**
   * Command-line entry point. Expects an option followed by a file path:
   * {@code -countFSTArcs <fstFile>} or {@code -measureFSTOversizing <wordsFile>}.
   *
   * @throws IllegalArgumentException if fewer than two arguments are given or the option is unknown
   */
  public static void main(String... args) throws Exception {
    if (args.length < 2) {
      throw new IllegalArgumentException("Missing argument");
    }
    if (args[0].equals("-countFSTArcs")) {
      countFSTArcs(args[1]);
    } else if (args[0].equals("-measureFSTOversizing")) {
      measureFSTOversizing(args[1]);
    } else {
      throw new IllegalArgumentException("Invalid argument " + args[0]);
    }
  }

  /**
   * Reads a serialized FST with byte-sequence outputs from the given file, enumerates it and
   * prints how many of the visited arcs were direct-addressing, binary-search or list arcs.
   */
  private static void countFSTArcs(String FSTFilePath) throws IOException {
    byte[] buf = Files.readAllBytes(Paths.get(FSTFilePath));
    DataInput in = new ByteArrayDataInput(buf);
    FST<BytesRef> fst = new FST<>(in, ByteSequenceOutputs.getSingleton());
    BytesRefFSTEnum<BytesRef> fstEnum = new BytesRefFSTEnum<>(fst);
    int binarySearchArcCount = 0, directAddressingArcCount = 0, listArcCount = 0;
    while (fstEnum.next() != null) {
      FST.Arc<BytesRef> arc = fstEnum.arcs[fstEnum.upto];
      if (arc.bytesPerArc() == 0) {
        // No fixed arc size: counted as a list arc.
        listArcCount++;
      } else if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
        directAddressingArcCount++;
      } else {
        binarySearchArcCount++;
      }
    }
    System.out.println("direct addressing arcs = " + directAddressingArcCount
        + ", binary search arcs = " + binarySearchArcCount
        + ", list arcs = " + listArcCount);
  }

  /**
   * Reads up to one million words (one per line) from the given file, builds the FST with and
   * without direct addressing, and prints the resulting statistics.
   */
  private static void measureFSTOversizing(String wordsFilePath) throws Exception {
    final int MAX_NUM_WORDS = 1000000;

    // Read real english words.
    List<BytesRef> wordList = new ArrayList<>();
    // UTF-8 is what Files.newBufferedReader(Path) uses implicitly; spell it out.
    try (BufferedReader reader = Files.newBufferedReader(Paths.get(wordsFilePath), StandardCharsets.UTF_8)) {
      while (wordList.size() < MAX_NUM_WORDS) {
        String word = reader.readLine();
        if (word == null) {
          break;
        }
        wordList.add(new BytesRef(word));
      }
    }
    Collections.sort(wordList);

    // Disable direct addressing and measure the FST size.
    Builder<Object> builder = createBuilder(-1f);
    FST<Object> fst = buildFST(wordList, builder);
    long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed();

    // Enable direct addressing and measure the FST size.
    builder = createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
    fst = buildFST(wordList, builder);
    long ramBytesUsed = fst.ramBytesUsed();

    // Compute the size increase in percents.
    double directAddressingMemoryIncreasePercent = ((double) ramBytesUsed / ramBytesUsedNoDirectAddressing - 1) * 100;

    printStats(builder, ramBytesUsed, directAddressingMemoryIncreasePercent);
  }
}

View File

@ -27,10 +27,10 @@ import org.apache.lucene.util.LuceneTestCase;
public class TestUtil extends LuceneTestCase {
public void testBinarySearch() throws Exception {
// Creates a node with 8 arcs spanning (z-A) = 57 chars that will be encoded as a sparse array (no gaps)
// requiring binary search
// Create a node with 8 arcs spanning (z-A) and ensure it is encoded as a packed array
// requiring binary search.
List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T", "z");
FST<Object> fst = buildFST(letters, true);
FST<Object> fst = buildFST(letters, true, false);
FST.Arc<Object> arc = fst.getFirstArc(new FST.Arc<>());
arc = fst.readFirstTargetArc(arc, arc, fst.getBytesReader());
for (int i = 0; i < letters.size(); i++) {
@ -47,21 +47,21 @@ public class TestUtil extends LuceneTestCase {
public void testReadCeilArcPackedArray() throws Exception {
List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T", "z");
verifyReadCeilArc(letters, true);
verifyReadCeilArc(letters, true, false);
}
public void testReadCeilArcArrayWithGaps() throws Exception {
List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T");
verifyReadCeilArc(letters, true);
verifyReadCeilArc(letters, true, true);
}
public void testReadCeilArcList() throws Exception {
List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T", "z");
verifyReadCeilArc(letters, false);
verifyReadCeilArc(letters, false, false);
}
private void verifyReadCeilArc(List<String> letters, boolean allowArrayArcs) throws Exception {
FST<Object> fst = buildFST(letters, allowArrayArcs);
private void verifyReadCeilArc(List<String> letters, boolean allowArrayArcs, boolean allowDirectAddressing) throws Exception {
FST<Object> fst = buildFST(letters, allowArrayArcs, allowDirectAddressing);
FST.Arc<Object> first = fst.getFirstArc(new FST.Arc<>());
FST.Arc<Object> arc = new FST.Arc<>();
FST.BytesReader in = fst.getBytesReader();
@ -81,9 +81,12 @@ public class TestUtil extends LuceneTestCase {
assertNull(Util.readCeilArc('Z', fst, arc, arc, in));
}
private FST<Object> buildFST(List<String> words, boolean allowArrayArcs) throws Exception {
private FST<Object> buildFST(List<String> words, boolean allowArrayArcs, boolean allowDirectAddressing) throws Exception {
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, allowArrayArcs, 15);
if (!allowDirectAddressing) {
b.setDirectAddressingMaxOversizingFactor(-1f);
}
for (String word : words) {
b.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput());