mirror of https://github.com/apache/lucene.git
LUCENE-8920: Reduce the memory used by direct addressing of arcs (#980)
parent c1ac146454
commit 068b6babac
@@ -1084,7 +1084,7 @@ public final class OrdsSegmentTermsEnum extends BaseTermsEnum {
        result.grow(1+upto);
        fr.index.readFirstRealTargetArc(arc.target(), arc, fstReader);

        if (arc.bytesPerArc() != 0 && arc.arcIdx() > Integer.MIN_VALUE) {
        if (arc.bytesPerArc() != 0 && arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
          // System.out.println(" array arcs");
          int low = 0;
          int high = arc.numArcs() -1;
@@ -178,4 +178,129 @@ public final class BitUtil {
    return ((l >>> 1) ^ -(l & 1));
  }

  /**
   * Returns whether the bit at given zero-based index is set.
   * <br>Example: bitIndex 66 means the third bit on the right of the second long.
   *
   * @param bits The bits stored in an array of long for efficiency.
   * @param numLongs The number of longs in {@code bits} to consider.
   * @param bitIndex The bit zero-based index. It must be greater than or equal to 0,
   *                 and strictly less than {@code numLongs * Long.SIZE}.
   */
  public static boolean isBitSet(long[] bits, int numLongs, int bitIndex) {
    assert numLongs >= 0 && numLongs <= bits.length && bitIndex >= 0 && bitIndex < numLongs * Long.SIZE
        : "bitIndex=" + bitIndex + " numLongs=" + numLongs + " bits.length=" + bits.length;
    return (bits[bitIndex / Long.SIZE] & (1L << bitIndex)) != 0; // Shifts are mod 64.
  }

  /**
   * Counts all bits set in the provided longs.
   *
   * @param bits The bits stored in an array of long for efficiency.
   * @param numLongs The number of longs in {@code bits} to consider.
   */
  public static int countBits(long[] bits, int numLongs) {
    assert numLongs >= 0 && numLongs <= bits.length
        : "numLongs=" + numLongs + " bits.length=" + bits.length;
    int bitCount = 0;
    for (int i = 0; i < numLongs; i++) {
      bitCount += Long.bitCount(bits[i]);
    }
    return bitCount;
  }

  /**
   * Counts the bits set up to the given bit zero-based index, exclusive.
   * <br>In other words, how many 1s there are up to the bit at the given index excluded.
   * <br>Example: bitIndex 66 means the third bit on the right of the second long.
   *
   * @param bits The bits stored in an array of long for efficiency.
   * @param numLongs The number of longs in {@code bits} to consider.
   * @param bitIndex The bit zero-based index, exclusive. It must be greater than or equal to 0,
   *                 and less than or equal to {@code numLongs * Long.SIZE}.
   */
  public static int countBitsUpTo(long[] bits, int numLongs, int bitIndex) {
    assert numLongs >= 0 && numLongs <= bits.length && bitIndex >= 0 && bitIndex <= numLongs * Long.SIZE
        : "bitIndex=" + bitIndex + " numLongs=" + numLongs + " bits.length=" + bits.length;
    int bitCount = 0;
    int lastLong = bitIndex / Long.SIZE;
    for (int i = 0; i < lastLong; i++) {
      // Count the bits set for all plain longs.
      bitCount += Long.bitCount(bits[i]);
    }
    if (lastLong < numLongs) {
      // Prepare a mask with 1s on the right up to bitIndex exclusive.
      long mask = (1L << bitIndex) - 1L; // Shifts are mod 64.
      // Count the bits set only within the mask part, so up to bitIndex exclusive.
      bitCount += Long.bitCount(bits[lastLong] & mask);
    }
    return bitCount;
  }

  /**
   * Returns the index of the next bit set following the given bit zero-based index.
   * <br>For example with bits 100011:
   * the next bit set after index=-1 is at index=0;
   * the next bit set after index=0 is at index=1;
   * the next bit set after index=1 is at index=5;
   * there is no next bit set after index=5.
   *
   * @param bits The bits stored in an array of long for efficiency.
   * @param numLongs The number of longs in {@code bits} to consider.
   * @param bitIndex The bit zero-based index. It must be greater than or equal to -1,
   *                 and strictly less than {@code numLongs * Long.SIZE}.
   * @return The zero-based index of the next bit set after the provided {@code bitIndex};
   *         or -1 if none.
   */
  public static int nextBitSet(long[] bits, int numLongs, int bitIndex) {
    assert numLongs >= 0 && numLongs <= bits.length && bitIndex >= -1 && bitIndex < numLongs * Long.SIZE
        : "bitIndex=" + bitIndex + " numLongs=" + numLongs + " bits.length=" + bits.length;
    int longIndex = bitIndex / Long.SIZE;
    // Prepare a mask with 1s on the left down to bitIndex exclusive.
    long mask = -(1L << (bitIndex + 1)); // Shifts are mod 64.
    long l = mask == -1 && bitIndex != -1 ? 0 : bits[longIndex] & mask;
    while (l == 0) {
      if (++longIndex == numLongs) {
        return -1;
      }
      l = bits[longIndex];
    }
    return Long.numberOfTrailingZeros(l) + longIndex * 64;
  }

  /**
   * Returns the index of the previous bit set preceding the given bit zero-based index.
   * <br>For example with bits 100011:
   * there is no previous bit set before index=0.
   * the previous bit set before index=1 is at index=0;
   * the previous bit set before index=5 is at index=1;
   * the previous bit set before index=64 is at index=5;
   *
   * @param bits The bits stored in an array of long for efficiency.
   * @param numLongs The number of longs in {@code bits} to consider.
   * @param bitIndex The bit zero-based index. It must be greater than or equal to 0,
   *                 and less than or equal to {@code numLongs * Long.SIZE}.
   * @return The zero-based index of the previous bit set before the provided {@code bitIndex};
   *         or -1 if none.
   */
  public static int previousBitSet(long[] bits, int numLongs, int bitIndex) {
    assert numLongs >= 0 && numLongs <= bits.length && bitIndex >= 0 && bitIndex <= numLongs * Long.SIZE
        : "bitIndex=" + bitIndex + " numLongs=" + numLongs + " bits.length=" + bits.length;
    int longIndex = bitIndex / Long.SIZE;
    long l;
    if (longIndex == numLongs) {
      l = 0;
    } else {
      // Prepare a mask with 1s on the right up to bitIndex exclusive.
      long mask = (1L << bitIndex) - 1L; // Shifts are mod 64.
      l = bits[longIndex] & mask;
    }
    while (l == 0) {
      if (longIndex-- == 0) {
        return -1;
      }
      l = bits[longIndex];
    }
    return 63 - Long.numberOfLeadingZeros(l) + longIndex * 64;
  }
}
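Not part of the patch: a small standalone illustration of the helpers above, using the same 100011 bit pattern as the javadoc examples (the demo class name is made up).

    import org.apache.lucene.util.BitUtil;

    public class BitUtilDemo {
      public static void main(String[] args) {
        long[] bits = { 0b100011L, 0L }; // bits 0, 1 and 5 set, stored in two longs
        int numLongs = 2;
        System.out.println(BitUtil.isBitSet(bits, numLongs, 1));        // true
        System.out.println(BitUtil.isBitSet(bits, numLongs, 2));        // false
        System.out.println(BitUtil.countBits(bits, numLongs));          // 3
        System.out.println(BitUtil.countBitsUpTo(bits, numLongs, 5));   // 2 (bit 5 itself is excluded)
        System.out.println(BitUtil.nextBitSet(bits, numLongs, 1));      // 5
        System.out.println(BitUtil.nextBitSet(bits, numLongs, 5));      // -1 (no later bit set)
        System.out.println(BitUtil.previousBitSet(bits, numLongs, 64)); // 5 (scans back from the second long)
      }
    }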
@@ -19,6 +19,7 @@ package org.apache.lucene.util.fst;

import java.io.IOException;

import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;

@@ -50,6 +51,30 @@ import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc

public class Builder<T> {

  /**
   * Default oversizing factor used to decide whether to encode a node with direct addressing or binary search.
   * The default is 1: ensure no oversizing on average.
   * <p>
   * This factor does not determine whether to encode a node with a list of variable length arcs or with
   * fixed length arcs. It only determines the effective encoding of a node that is already known to be
   * encoded with fixed length arcs.
   * See {@code FST.shouldExpandNodeWithFixedLengthArcs()}
   * and {@code FST.shouldExpandNodeWithDirectAddressing()}.
   * <p>
   * For English words we measured 217K nodes; only 3.27% of nodes are encoded with fixed length arcs,
   * and 99.99% of them with direct addressing. Overall FST memory reduced by 1.67%.
   * <p>
   * For a worst case we measured 168K nodes; 50% of them are encoded with fixed length arcs,
   * and 14% of them with direct addressing. Overall FST memory reduced by 0.8%.
   * <p>
   * Use {@code TestFstDirectAddressing.main()}
   * and {@code TestFstDirectAddressing.testWorstCaseForDirectAddressing()}
   * to evaluate a change.
   *
   * @see #setDirectAddressingMaxOversizingFactor
   */
  static final float DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR = 1.0f;

  private final NodeHash<T> dedupHash;
  final FST<T> fst;
  private final T NO_OUTPUT;
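The decision itself is implemented in FST.shouldExpandNodeWithFixedLengthArcs() and FST.shouldExpandNodeWithDirectAddressing(); that FST.java diff is suppressed further down this page. The following is therefore only a rough, hypothetical sketch of the trade-off the factor controls — the method name and parameters are invented, and the real heuristic presumably also uses the directAddressingExpansionCredit field declared below so that oversizing is bounded on average rather than per node.

    // Hypothetical sketch (not the real FST code): how an oversizing factor can gate
    // the choice between the two fixed-length-arc encodings of a node.
    static boolean preferDirectAddressing(int numArcs, int labelRange,
                                          int bytesPerArcWithLabel, int bytesPerArcWithoutLabel,
                                          float maxOversizingFactor) {
      // Binary search writes one fixed-size slot per present arc, label included.
      long binarySearchBytes = (long) numArcs * bytesPerArcWithLabel;
      // Direct addressing can drop the label from each slot (the slot index encodes it)
      // but needs a presence bit table covering the whole label range.
      long directAddressingBytes = (long) numArcs * bytesPerArcWithoutLabel + (labelRange + 7) / 8;
      // A negative factor makes the right-hand side negative and so disables direct addressing.
      return directAddressingBytes <= binarySearchBytes * maxOversizingFactor;
    }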
@ -83,12 +108,18 @@ public class Builder<T> {
|
|||
long lastFrozenNode;
|
||||
|
||||
// Reused temporarily while building the FST:
|
||||
int[] reusedBytesPerArc = new int[4];
|
||||
int[] numBytesPerArc = new int[4];
|
||||
int[] numLabelBytesPerArc = new int[numBytesPerArc.length];
|
||||
final FixedLengthArcsBuffer fixedLengthArcsBuffer = new FixedLengthArcsBuffer();
|
||||
|
||||
long arcCount;
|
||||
long nodeCount;
|
||||
long binarySearchNodeCount;
|
||||
long directAddressingNodeCount;
|
||||
|
||||
boolean allowArrayArcs;
|
||||
boolean allowFixedLengthArcs;
|
||||
float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR;
|
||||
long directAddressingExpansionCredit;
|
||||
|
||||
BytesStore bytes;
|
||||
|
||||
|
@ -138,9 +169,9 @@ public class Builder<T> {
|
|||
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
|
||||
* singleton output object.
|
||||
*
|
||||
* @param allowArrayArcs Pass false to disable the array arc optimization
|
||||
* while building the FST; this will make the resulting
|
||||
* FST smaller but slower to traverse.
|
||||
* @param allowFixedLengthArcs Pass false to disable the fixed length arc optimization (binary search or
|
||||
* direct addressing) while building the FST; this will make the resulting FST smaller but slower to
|
||||
* traverse.
|
||||
*
|
||||
* @param bytesPageBits How many bits wide to make each
|
||||
* byte[] block in the BytesStore; if you know the FST
|
||||
|
@ -149,12 +180,12 @@ public class Builder<T> {
|
|||
*/
|
||||
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
|
||||
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
|
||||
boolean allowArrayArcs, int bytesPageBits) {
|
||||
boolean allowFixedLengthArcs, int bytesPageBits) {
|
||||
this.minSuffixCount1 = minSuffixCount1;
|
||||
this.minSuffixCount2 = minSuffixCount2;
|
||||
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
|
||||
this.shareMaxTailLength = shareMaxTailLength;
|
||||
this.allowArrayArcs = allowArrayArcs;
|
||||
this.allowFixedLengthArcs = allowFixedLengthArcs;
|
||||
fst = new FST<>(inputType, outputs, bytesPageBits);
|
||||
bytes = fst.bytes;
|
||||
assert bytes != null;
|
||||
|
@@ -173,6 +204,27 @@ public class Builder<T> {
    }
  }

  /**
   * Overrides the default maximum oversizing of fixed array allowed to enable direct addressing
   * of arcs instead of binary search.
   * <p>
   * Setting this factor to a negative value (e.g. -1) effectively disables direct addressing;
   * only binary search nodes will be created.
   *
   * @see #DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR
   */
  public Builder<T> setDirectAddressingMaxOversizingFactor(float factor) {
    directAddressingMaxOversizingFactor = factor;
    return this;
  }

  /**
   * @see #setDirectAddressingMaxOversizingFactor(float)
   */
  public float getDirectAddressingMaxOversizingFactor() {
    return directAddressingMaxOversizingFactor;
  }

  public long getTermCount() {
    return frontier[0].inputCount;
  }
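For illustration only (not in the patch): tuning or disabling the optimization from caller code. The constructor arguments mirror the ones used by the tests later in this diff; the 1.5f value is just an example.

    // Keep direct addressing but allow such nodes to be up to 1.5x larger than the
    // equivalent binary-search encoding.
    Builder<Object> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
        Integer.MAX_VALUE, NoOutputs.getSingleton(), true, 15)
        .setDirectAddressingMaxOversizingFactor(1.5f);

    // Or disable direct addressing entirely; only binary-search nodes are created.
    Builder<Object> binarySearchOnly = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
        Integer.MAX_VALUE, NoOutputs.getSingleton(), true, 15)
        .setDirectAddressingMaxOversizingFactor(-1f);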
@@ -639,4 +691,52 @@ public class Builder<T> {
      }
    }
  }

  /**
   * Reusable buffer for building nodes with fixed length arcs (binary search or direct addressing).
   */
  static class FixedLengthArcsBuffer {

    // Initial capacity is the max length required for the header of a node with fixed length arcs:
    // header(byte) + numArcs(vint) + numBytes(vint)
    private byte[] bytes = new byte[11];
    private final ByteArrayDataOutput bado = new ByteArrayDataOutput(bytes);

    /** Ensures the capacity of the internal byte array. Enlarges it if needed. */
    FixedLengthArcsBuffer ensureCapacity(int capacity) {
      if (bytes.length < capacity) {
        bytes = new byte[ArrayUtil.oversize(capacity, Byte.BYTES)];
        bado.reset(bytes);
      }
      return this;
    }

    FixedLengthArcsBuffer resetPosition() {
      bado.reset(bytes);
      return this;
    }

    FixedLengthArcsBuffer writeByte(byte b) {
      bado.writeByte(b);
      return this;
    }

    FixedLengthArcsBuffer writeVInt(int i) {
      try {
        bado.writeVInt(i);
      } catch (IOException e) { // Never thrown.
        throw new RuntimeException(e);
      }
      return this;
    }

    int getPosition() {
      return bado.getPosition();
    }

    /** Gets the internal byte array. */
    byte[] getBytes() {
      return bytes;
    }
  }
}
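The code that actually drives this buffer lives in the suppressed FST.java diff, so the snippet below is only a hypothetical illustration of the fluent style it enables from within org.apache.lucene.util.fst; the flag byte and the sizes are invented, not the real node header layout.

    // Assemble a small fixed-length-arc node header in one chained expression,
    // then copy out the written prefix wherever it needs to go.
    Builder.FixedLengthArcsBuffer buffer = new Builder.FixedLengthArcsBuffer();
    int numArcs = 12;
    int numBytes = 480;
    buffer.ensureCapacity(11)
        .resetPosition()
        .writeByte((byte) 0x20) // made-up flag value
        .writeVInt(numArcs)
        .writeVInt(numBytes);
    int headerLen = buffer.getPosition();
    byte[] header = buffer.getBytes(); // only the first headerLen bytes are valid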
@ -239,6 +239,27 @@ class BytesStore extends DataOutput implements Accountable {
|
|||
}
|
||||
}
|
||||
|
||||
/** Copies bytes from this store to a target byte array. */
|
||||
public void copyBytes(long src, byte[] dest, int offset, int len) {
|
||||
int blockIndex = (int) (src >> blockBits);
|
||||
int upto = (int) (src & blockMask);
|
||||
byte[] block = blocks.get(blockIndex);
|
||||
while (len > 0) {
|
||||
int chunk = blockSize - upto;
|
||||
if (len <= chunk) {
|
||||
System.arraycopy(block, upto, dest, offset, len);
|
||||
break;
|
||||
} else {
|
||||
System.arraycopy(block, upto, dest, offset, chunk);
|
||||
blockIndex++;
|
||||
block = blocks.get(blockIndex);
|
||||
upto = 0;
|
||||
len -= chunk;
|
||||
offset += chunk;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Writes an int at the absolute position without
|
||||
* changing the current pointer. */
|
||||
public void writeInt(long pos, int value) {
|
||||
|
|
File diff suppressed because it is too large
|
@ -138,12 +138,13 @@ abstract class FSTEnum<T> {
|
|||
while(arc != null) {
|
||||
int targetLabel = getTargetLabel();
|
||||
//System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") vs targetLabel=" + targetLabel);
|
||||
if (arc.bytesPerArc() != 0 && arc.label() != -1) {
|
||||
if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) {
|
||||
// Arcs are in an array
|
||||
final FST.BytesReader in = fst.getBytesReader();
|
||||
if (arc.arcIdx() == Integer.MIN_VALUE) {
|
||||
arc = doSeekCeilArrayWithGaps(arc, targetLabel, in);
|
||||
if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
|
||||
arc = doSeekCeilArrayDirectAddressing(arc, targetLabel, in);
|
||||
} else {
|
||||
assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH;
|
||||
arc = doSeekCeilArrayPacked(arc, targetLabel, in);
|
||||
}
|
||||
} else {
|
||||
|
@@ -152,17 +153,12 @@ abstract class FSTEnum<T> {
    }
  }

  private FST.Arc<T> doSeekCeilArrayWithGaps(final FST.Arc<T> arc, final int targetLabel, final FST.BytesReader in) throws IOException {
    // The array is addressed directly by label and may contain holes.
  private FST.Arc<T> doSeekCeilArrayDirectAddressing(final FST.Arc<T> arc, final int targetLabel, final FST.BytesReader in) throws IOException {
    // The array is addressed directly by label, with presence bits to compute the actual arc offset.

    in.setPosition(arc.posArcsStart());
    in.skipBytes(1);
    int firstLabel = fst.readLabel(in);
    int arcOffset = targetLabel - firstLabel;
    if (arcOffset >= arc.numArcs()) {
      // target is beyond the last arc
      fst.readArcAtPosition(arc, in, arc.posArcsStart() - (arc.numArcs() - 1) * arc.bytesPerArc());
      assert arc.isLast();
    int targetIndex = targetLabel - arc.firstLabel();
    if (targetIndex >= arc.numArcs()) {
      // Target is beyond the last arc, out of label range.
      // Dead end (target is after the last arc);
      // rollback to last fork then push
      upto--;
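An illustrative aside (the real lookup is FST.readArcByDirectAddressing in the suppressed FST.java diff): with presence bits, only the arcs that exist occupy a fixed-length slot, and the slot of a present label is found by counting the presence bits before it — which is what the countBitsUpTo helper added to BitUtil is for. A sketch, assuming the BitUtil methods shown earlier:

    // How a presence-bit table turns a label into the offset of the stored arc.
    static int storedArcOffset(long[] presenceBits, int numLongs, int firstLabel, int targetLabel) {
      int targetIndex = targetLabel - firstLabel;            // index within the label range
      if (!BitUtil.isBitSet(presenceBits, numLongs, targetIndex)) {
        return -1;                                           // label absent: no arc stored for it
      }
      return BitUtil.countBitsUpTo(presenceBits, numLongs, targetIndex);
    }

    // Example: labels {a, c, d} set presence bits at indexes 0, 2 and 3.
    //   storedArcOffset for 'a' -> 0, for 'c' -> 1, for 'd' -> 2, for 'b' -> -1.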
@ -180,17 +176,13 @@ abstract class FSTEnum<T> {
|
|||
upto--;
|
||||
}
|
||||
} else {
|
||||
// TODO: if firstLabel == targetLabel
|
||||
long pos;
|
||||
if (arcOffset >= 0) {
|
||||
pos = arc.posArcsStart() - (arc.bytesPerArc() * arcOffset);
|
||||
} else {
|
||||
pos = arc.posArcsStart();
|
||||
}
|
||||
fst.readArcAtPosition(arc, in, pos);
|
||||
if (arc.label() == targetLabel) {
|
||||
if (targetIndex < 0) {
|
||||
targetIndex = -1;
|
||||
} else if (arc.bitTable().isBitSet(targetIndex)) {
|
||||
fst.readArcByDirectAddressing(arc, in, targetIndex);
|
||||
assert arc.label() == targetLabel;
|
||||
// found -- copy pasta from below
|
||||
output[upto] = fst.outputs.add(output[upto-1], arc.output());
|
||||
output[upto] = fst.outputs.add(output[upto - 1], arc.output());
|
||||
if (targetLabel == FST.END_LABEL) {
|
||||
return null;
|
||||
}
|
||||
|
@ -198,7 +190,10 @@ abstract class FSTEnum<T> {
|
|||
incr();
|
||||
return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
|
||||
}
|
||||
// not found, return the next highest
|
||||
// Not found, return the next arc (ceil).
|
||||
int ceilIndex = arc.bitTable().nextBitSet(targetIndex);
|
||||
assert ceilIndex != -1;
|
||||
fst.readArcByDirectAddressing(arc, in, ceilIndex);
|
||||
assert arc.label() > targetLabel;
|
||||
pushFirst();
|
||||
return null;
|
||||
|
@ -319,9 +314,10 @@ abstract class FSTEnum<T> {
|
|||
if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) {
|
||||
// Arcs are in an array
|
||||
final FST.BytesReader in = fst.getBytesReader();
|
||||
if (arc.arcIdx() == Integer.MIN_VALUE) {
|
||||
arc = doSeekFloorArrayWithGaps(arc, targetLabel, in);
|
||||
if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
|
||||
arc = doSeekFloorArrayDirectAddressing(arc, targetLabel, in);
|
||||
} else {
|
||||
assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH;
|
||||
arc = doSeekFloorArrayPacked(arc, targetLabel, in);
|
||||
}
|
||||
} else {
|
||||
|
@ -330,46 +326,25 @@ abstract class FSTEnum<T> {
|
|||
}
|
||||
}
|
||||
|
||||
private FST.Arc<T> doSeekFloorArrayWithGaps(FST.Arc<T> arc, int targetLabel, final FST.BytesReader in) throws IOException {
|
||||
// The array is addressed directly by label and may contain holes.
|
||||
in.setPosition(arc.posArcsStart());
|
||||
in.skipBytes(1);
|
||||
int firstLabel = fst.readLabel(in);
|
||||
int targetOffset = targetLabel - firstLabel;
|
||||
if (targetOffset < 0) {
|
||||
//System.out.println(" before first"); Very first arc is after our target TODO: if each
|
||||
// arc could somehow read the arc just before, we can save this re-scan. The ceil case
|
||||
// doesn't need this because it reads the next arc instead:
|
||||
while(true) {
|
||||
// First, walk backwards until we find a first arc
|
||||
// that's before our target label:
|
||||
fst.readFirstTargetArc(getArc(upto-1), arc, fstReader);
|
||||
if (arc.label() < targetLabel) {
|
||||
// Then, scan forwards to the arc just before
|
||||
// the targetLabel:
|
||||
while(!arc.isLast() && fst.readNextArcLabel(arc, in) < targetLabel) {
|
||||
fst.readNextArc(arc, fstReader);
|
||||
}
|
||||
pushLast();
|
||||
return null;
|
||||
}
|
||||
upto--;
|
||||
if (upto == 0) {
|
||||
return null;
|
||||
}
|
||||
targetLabel = getTargetLabel();
|
||||
arc = getArc(upto);
|
||||
}
|
||||
private FST.Arc<T> doSeekFloorArrayDirectAddressing(FST.Arc<T> arc, int targetLabel, FST.BytesReader in) throws IOException {
|
||||
// The array is addressed directly by label, with presence bits to compute the actual arc offset.
|
||||
|
||||
int targetIndex = targetLabel - arc.firstLabel();
|
||||
if (targetIndex < 0) {
|
||||
// Before first arc.
|
||||
return backtrackToFloorArc(arc, targetLabel, in);
|
||||
} else if (targetIndex >= arc.numArcs()) {
|
||||
// After last arc.
|
||||
fst.readArcByDirectAddressing(arc, in, arc.numArcs() - 1);
|
||||
assert arc.label() < targetLabel;
|
||||
assert arc.isLast();
|
||||
pushLast();
|
||||
return null;
|
||||
} else {
|
||||
if (targetOffset >= arc.numArcs()) {
|
||||
fst.readArcAtPosition(arc, in, arc.posArcsStart() - arc.bytesPerArc() * (arc.numArcs() - 1));
|
||||
assert arc.isLast();
|
||||
assert arc.label() < targetLabel: "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel;
|
||||
pushLast();
|
||||
return null;
|
||||
}
|
||||
fst.readArcAtPosition(arc, in, arc.posArcsStart() - arc.bytesPerArc() * targetOffset);
|
||||
if (arc.label() == targetLabel) {
|
||||
// Within label range.
|
||||
if (arc.bitTable().isBitSet(targetIndex)) {
|
||||
fst.readArcByDirectAddressing(arc, in, targetIndex);
|
||||
assert arc.label() == targetLabel;
|
||||
// found -- copy pasta from below
|
||||
output[upto] = fst.outputs.add(output[upto-1], arc.output());
|
||||
if (targetLabel == FST.END_LABEL) {
|
||||
|
@ -379,18 +354,99 @@ abstract class FSTEnum<T> {
|
|||
incr();
|
||||
return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
|
||||
}
|
||||
// Scan backwards to find a floor arc that is not missing
|
||||
for (long arcOffset = arc.posArcsStart() - targetOffset * arc.bytesPerArc(); arcOffset <= arc.posArcsStart(); arcOffset += arc.bytesPerArc()) {
|
||||
// TODO: we can do better here by skipping missing arcs
|
||||
fst.readArcAtPosition(arc, in, arcOffset);
|
||||
if (arc.label() < targetLabel) {
|
||||
assert arc.isLast() || fst.readNextArcLabel(arc, in) > targetLabel;
|
||||
pushLast();
|
||||
return null;
|
||||
// Scan backwards to find a floor arc.
|
||||
int floorIndex = arc.bitTable().previousBitSet(targetIndex);
|
||||
assert floorIndex != -1;
|
||||
fst.readArcByDirectAddressing(arc, in, floorIndex);
|
||||
assert arc.label() < targetLabel;
|
||||
assert arc.isLast() || fst.readNextArcLabel(arc, in) > targetLabel;
|
||||
pushLast();
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Backtracks until it finds a node whose first arc is before our target label.
|
||||
* Then on the node, finds the arc just before the targetLabel.
|
||||
*
|
||||
* @return null to continue the seek floor recursion loop.
|
||||
*/
|
||||
private FST.Arc<T> backtrackToFloorArc(FST.Arc<T> arc, int targetLabel, final FST.BytesReader in) throws IOException {
|
||||
while (true) {
|
||||
// First, walk backwards until we find a node whose first arc is before our target label.
|
||||
fst.readFirstTargetArc(getArc(upto-1), arc, fstReader);
|
||||
if (arc.label() < targetLabel) {
|
||||
// Then on this node, find the arc just before the targetLabel.
|
||||
if (!arc.isLast()) {
|
||||
if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) {
|
||||
if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
|
||||
findNextFloorArcBinarySearch(arc, targetLabel, in);
|
||||
} else {
|
||||
assert arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING;
|
||||
findNextFloorArcDirectAddressing(arc, targetLabel, in);
|
||||
}
|
||||
} else {
|
||||
while (!arc.isLast() && fst.readNextArcLabel(arc, in) < targetLabel) {
|
||||
fst.readNextArc(arc, fstReader);
|
||||
}
|
||||
}
|
||||
}
|
||||
assert arc.label() < targetLabel;
|
||||
assert arc.isLast() || fst.readNextArcLabel(arc, in) >= targetLabel;
|
||||
pushLast();
|
||||
return null;
|
||||
}
|
||||
upto--;
|
||||
if (upto == 0) {
|
||||
return null;
|
||||
}
|
||||
targetLabel = getTargetLabel();
|
||||
arc = getArc(upto);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds and reads an arc on the current node whose label is strictly less than the given label.
|
||||
* Skips the first arc, finds next floor arc; or none if the floor arc is the first
|
||||
* arc itself (in this case it has already been read).
|
||||
* <p>
|
||||
* Precondition: the given arc is the first arc of the node.
|
||||
*/
|
||||
private void findNextFloorArcDirectAddressing(FST.Arc<T> arc, int targetLabel, final FST.BytesReader in) throws IOException {
|
||||
assert arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING;
|
||||
assert arc.label() != FST.END_LABEL;
|
||||
assert arc.label() == arc.firstLabel();
|
||||
if (arc.numArcs() > 1) {
|
||||
int targetIndex = targetLabel - arc.firstLabel();
|
||||
assert targetIndex >= 0;
|
||||
if (targetIndex >= arc.numArcs()) {
|
||||
// Beyond last arc. Take last arc.
|
||||
fst.readArcByDirectAddressing(arc, in, arc.numArcs() - 1);
|
||||
} else {
|
||||
// Take the preceding arc, even if the target is present.
|
||||
int floorIndex = arc.bitTable().previousBitSet(targetIndex);
|
||||
if (floorIndex > 0) {
|
||||
fst.readArcByDirectAddressing(arc, in, floorIndex);
|
||||
}
|
||||
}
|
||||
assert false: "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel;
|
||||
return arc; // unreachable
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Same as {@link #findNextFloorArcDirectAddressing} for binary search node.
|
||||
*/
|
||||
private void findNextFloorArcBinarySearch(FST.Arc<T> arc, int targetLabel, FST.BytesReader in) throws IOException {
|
||||
assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH;
|
||||
assert arc.label() != FST.END_LABEL;
|
||||
assert arc.arcIdx() == 0;
|
||||
if (arc.numArcs() > 1) {
|
||||
int idx = Util.binarySearch(fst, arc, targetLabel);
|
||||
assert idx != -1;
|
||||
if (idx > 1) {
|
||||
fst.readArcByIndex(arc, in, idx - 1);
|
||||
} else if (idx < -2) {
|
||||
fst.readArcByIndex(arc, in, -2 - idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -412,34 +468,10 @@ abstract class FSTEnum<T> {
|
|||
incr();
|
||||
return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
|
||||
} else if (idx == -1) {
|
||||
//System.out.println(" before first");
|
||||
// Very first arc is after our target
|
||||
// TODO: if each arc could somehow read the arc just
|
||||
// before, we can save this re-scan. The ceil case
|
||||
// doesn't need this because it reads the next arc
|
||||
// instead:
|
||||
while(true) {
|
||||
// First, walk backwards until we find a first arc
|
||||
// that's before our target label:
|
||||
fst.readFirstTargetArc(getArc(upto-1), arc, fstReader);
|
||||
if (arc.label() < targetLabel) {
|
||||
// Then, scan forwards to the arc just before
|
||||
// the targetLabel:
|
||||
while(!arc.isLast() && fst.readNextArcLabel(arc, in) < targetLabel) {
|
||||
fst.readNextArc(arc, fstReader);
|
||||
}
|
||||
pushLast();
|
||||
return null;
|
||||
}
|
||||
upto--;
|
||||
if (upto == 0) {
|
||||
return null;
|
||||
}
|
||||
targetLabel = getTargetLabel();
|
||||
arc = getArc(upto);
|
||||
}
|
||||
// Before first arc.
|
||||
return backtrackToFloorArc(arc, targetLabel, in);
|
||||
} else {
|
||||
// There is a floor arc; idx will be {@code -1 - (floor + 1)}.
|
||||
// There is a floor arc; idx will be (-1 - (floor + 1)).
|
||||
fst.readArcByIndex(arc, in, -2 - idx);
|
||||
assert arc.isLast() || fst.readNextArcLabel(arc, in) > targetLabel;
|
||||
assert arc.label() < targetLabel: "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel;
|
||||
|
|
|
@@ -41,9 +41,22 @@ final class NodeHash<T> {

  private boolean nodesEqual(Builder.UnCompiledNode<T> node, long address) throws IOException {
    fst.readFirstRealTargetArc(address, scratchArc, in);
    if (scratchArc.isPackedArray() && node.numArcs != scratchArc.numArcs()) {
      return false;

    // Fail fast for a node with fixed length arcs.
    if (scratchArc.bytesPerArc() != 0) {
      if (scratchArc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
        if (node.numArcs != scratchArc.numArcs()) {
          return false;
        }
      } else {
        assert scratchArc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING;
        if ((node.arcs[node.numArcs - 1].label - node.arcs[0].label + 1) != scratchArc.numArcs()
            || node.numArcs != scratchArc.bitTable().countBits()) {
          return false;
        }
      }
    }

    for(int arcUpto=0; arcUpto < node.numArcs; arcUpto++) {
      final Builder.Arc<T> arc = node.arcs[arcUpto];
      if (arc.label != scratchArc.label() ||
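Not part of the patch: a short worked example of why the direct-addressing branch above compares two different quantities — for such a node scratchArc.numArcs() is the label range, while bitTable().countBits() is the number of arcs actually present.

    // Frozen node with outgoing labels 'a', 'c', 'd':
    //   first/last label        -> 'a' / 'd', so label range = 'd' - 'a' + 1 = 4 = scratchArc.numArcs()
    //   presence bits (indexes) -> set at 0, 2, 3, so bitTable().countBits() = 3 = node.numArcs
    // An uncompiled node with labels 'a', 'b', 'c' also has 3 arcs but a label range of 3,
    // so the fail-fast check rejects it before the per-arc loop below runs.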
@ -151,7 +151,7 @@ public final class Util {
|
|||
|
||||
fst.readFirstRealTargetArc(arc.target(), arc, in);
|
||||
|
||||
if (arc.bytesPerArc() != 0 && arc.arcIdx() > Integer.MIN_VALUE) {
|
||||
if (arc.bytesPerArc() != 0 && arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
|
||||
|
||||
int low = 0;
|
||||
int high = arc.numArcs() -1;
|
||||
|
@ -940,18 +940,27 @@ public final class Util {
|
|||
}
|
||||
fst.readFirstTargetArc(follow, arc, in);
|
||||
if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) {
|
||||
if (arc.arcIdx() == Integer.MIN_VALUE) {
|
||||
// Arcs are in an array-with-gaps
|
||||
int offset = label - arc.label();
|
||||
if (offset >= arc.numArcs()) {
|
||||
if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
|
||||
// Fixed length arcs in a direct addressing node.
|
||||
int targetIndex = label - arc.label();
|
||||
if (targetIndex >= arc.numArcs()) {
|
||||
return null;
|
||||
} else if (offset < 0) {
|
||||
} else if (targetIndex < 0) {
|
||||
return arc;
|
||||
} else {
|
||||
return fst.readArcAtPosition(arc, in, arc.posArcsStart() - offset * arc.bytesPerArc());
|
||||
if (arc.bitTable().isBitSet(targetIndex)) {
|
||||
fst.readArcByDirectAddressing(arc, in, targetIndex);
|
||||
assert arc.label() == label;
|
||||
} else {
|
||||
int ceilIndex = arc.bitTable().nextBitSet(targetIndex);
|
||||
assert ceilIndex != -1;
|
||||
fst.readArcByDirectAddressing(arc, in, ceilIndex);
|
||||
assert arc.label() > label;
|
||||
}
|
||||
return arc;
|
||||
}
|
||||
}
|
||||
// Arcs are packed array -- use binary search to find the target.
|
||||
// Fixed length arcs in a binary search node.
|
||||
int idx = binarySearch(fst, arc, label);
|
||||
if (idx >= 0) {
|
||||
return fst.readArcByIndex(arc, in, idx);
|
||||
|
@ -964,7 +973,8 @@ public final class Util {
|
|||
return fst.readArcByIndex(arc, in , idx);
|
||||
}
|
||||
|
||||
// Linear scan
|
||||
// Variable length arcs in a linear scan list,
|
||||
// or special arc with label == FST.END_LABEL.
|
||||
fst.readFirstRealTargetArc(follow.target(), arc, in);
|
||||
|
||||
while (true) {
|
||||
|
@ -995,6 +1005,7 @@ public final class Util {
|
|||
* @throws IOException when the FST reader does
|
||||
*/
|
||||
static <T> int binarySearch(FST<T> fst, FST.Arc<T> arc, int targetLabel) throws IOException {
|
||||
assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH : "Arc is not encoded as packed array for binary search (nodeFlags=" + arc.nodeFlags() + ")";
|
||||
BytesReader in = fst.getBytesReader();
|
||||
int low = arc.arcIdx();
|
||||
int mid = 0;
|
||||
|
|
|
@ -0,0 +1,85 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.util;
|
||||
|
||||
public class TestBitUtil extends LuceneTestCase {
|
||||
|
||||
public void testNextBitSet() {
|
||||
for (int i = 0; i < 10000; i++) {
|
||||
long[] bits = buildRandomBits();
|
||||
int numLong = bits.length - 1;
|
||||
|
||||
// Verify nextBitSet with countBitsUpTo for all bit indexes.
|
||||
for (int bitIndex = -1; bitIndex < 64 * numLong; bitIndex++) {
|
||||
int nextIndex = BitUtil.nextBitSet(bits, numLong, bitIndex);
|
||||
if (nextIndex == -1) {
|
||||
assertEquals("No next bit set, so expected no bit count diff"
|
||||
+ " (i=" + i + " bitIndex=" + bitIndex + ")",
|
||||
BitUtil.countBitsUpTo(bits, numLong, bitIndex + 1), BitUtil.countBits(bits, numLong));
|
||||
} else {
|
||||
assertTrue("Expected next bit set at nextIndex=" + nextIndex
|
||||
+ " (i=" + i + " bitIndex=" + bitIndex + ")",
|
||||
BitUtil.isBitSet(bits, numLong, nextIndex));
|
||||
assertEquals("Next bit set at nextIndex=" + nextIndex
|
||||
+ " so expected bit count diff of 1"
|
||||
+ " (i=" + i + " bitIndex=" + bitIndex + ")",
|
||||
BitUtil.countBitsUpTo(bits, numLong, bitIndex + 1) + 1,
|
||||
BitUtil.countBitsUpTo(bits, numLong, nextIndex + 1));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void testPreviousBitSet() {
|
||||
for (int i = 0; i < 10000; i++) {
|
||||
long[] bits = buildRandomBits();
|
||||
int numLong = bits.length - 1;
|
||||
|
||||
// Verify previousBitSet with countBitsUpTo for all bit indexes.
|
||||
for (int bitIndex = 0; bitIndex <= 64 * numLong; bitIndex++) {
|
||||
int previousIndex = BitUtil.previousBitSet(bits, numLong, bitIndex);
|
||||
if (previousIndex == -1) {
|
||||
assertEquals("No previous bit set, so expected bit count 0"
|
||||
+ " (i=" + i + " bitIndex=" + bitIndex + ")",
|
||||
0, BitUtil.countBitsUpTo(bits, numLong, bitIndex));
|
||||
} else {
|
||||
assertTrue("Expected previous bit set at previousIndex=" + previousIndex
|
||||
+ " (i=" + i + " bitIndex=" + bitIndex + ")",
|
||||
BitUtil.isBitSet(bits, numLong, previousIndex));
|
||||
int bitCount = BitUtil.countBitsUpTo(bits, numLong, Math.min(bitIndex + 1, numLong * Long.SIZE));
|
||||
int expectedPreviousBitCount = bitIndex < numLong * Long.SIZE && BitUtil.isBitSet(bits, numLong, bitIndex) ?
|
||||
bitCount - 1 : bitCount;
|
||||
assertEquals("Previous bit set at previousIndex=" + previousIndex
|
||||
+ " with current bitCount=" + bitCount
|
||||
+ " so expected previousBitCount=" + expectedPreviousBitCount
|
||||
+ " (i=" + i + " bitIndex=" + bitIndex + ")",
|
||||
expectedPreviousBitCount, BitUtil.countBitsUpTo(bits, numLong, previousIndex + 1));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private long[] buildRandomBits() {
|
||||
long[] bits = new long[random().nextInt(3) + 2];
|
||||
for (int j = 0; j < bits.length; j++) {
|
||||
// Bias towards zeros which require special logic.
|
||||
bits[j] = random().nextInt(4) == 0 ? 0L : random().nextLong();
|
||||
}
|
||||
return bits;
|
||||
}
|
||||
}
|
|
@ -303,7 +303,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
public void testBigSet() throws IOException {
|
||||
testRandomWords(TestUtil.nextInt(random(), 50000, 60000), 1);
|
||||
}
|
||||
|
||||
|
||||
// Build FST for all unique terms in the test line docs
|
||||
// file, up until a doc limit
|
||||
public void testRealTerms() throws Exception {
|
||||
|
@ -1078,10 +1078,10 @@ public class TestFSTs extends LuceneTestCase {
|
|||
int children = verifyStateAndBelow(fst, new FST.Arc<>().copyFrom(arc), depth + 1);
|
||||
|
||||
assertEquals(
|
||||
expanded,
|
||||
(depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE &&
|
||||
children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) ||
|
||||
children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP);
|
||||
(depth <= FST.FIXED_LENGTH_ARC_SHALLOW_DEPTH &&
|
||||
children >= FST.FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS) ||
|
||||
children >= FST.FIXED_LENGTH_ARC_DEEP_NUM_ARCS,
|
||||
expanded);
|
||||
if (arc.isLast()) break;
|
||||
}
|
||||
|
||||
|
@ -1092,8 +1092,8 @@ public class TestFSTs extends LuceneTestCase {
|
|||
}
|
||||
|
||||
// Sanity check.
|
||||
assertTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP);
|
||||
assertTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0);
|
||||
assertTrue(FST.FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS < FST.FIXED_LENGTH_ARC_DEEP_NUM_ARCS);
|
||||
assertTrue(FST.FIXED_LENGTH_ARC_SHALLOW_DEPTH >= 0);
|
||||
|
||||
SyntheticData s = new SyntheticData();
|
||||
|
||||
|
@ -1635,4 +1635,23 @@ public class TestFSTs extends LuceneTestCase {
|
|||
// expected
|
||||
}
|
||||
}
|
||||
|
||||
public void testSimpleDepth() throws Exception {
|
||||
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||
|
||||
BytesRef ab = new BytesRef("ab");
|
||||
BytesRef ac = new BytesRef("ac");
|
||||
BytesRef bd = new BytesRef("bd");
|
||||
|
||||
builder.add(Util.toIntsRef(ab, new IntsRefBuilder()), 3L);
|
||||
builder.add(Util.toIntsRef(ac, new IntsRefBuilder()), 5L);
|
||||
builder.add(Util.toIntsRef(bd, new IntsRefBuilder()), 7L);
|
||||
|
||||
FST<Long> fst = builder.finish();
|
||||
|
||||
assertEquals(3, (long) Util.get(fst, ab));
|
||||
assertEquals(5, (long) Util.get(fst, ac));
|
||||
assertEquals(7, (long) Util.get(fst, bd));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,105 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.util.fst;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IntsRefBuilder;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class TestFstDirect extends LuceneTestCase {
|
||||
|
||||
public void testDenseWithGap() throws Exception {
|
||||
List<String> words = Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm");
|
||||
List<BytesRef> entries = new ArrayList<>();
|
||||
for (String word : words) {
|
||||
entries.add(new BytesRef(word.getBytes("ascii")));
|
||||
}
|
||||
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(buildFST(entries));
|
||||
for (BytesRef entry : entries) {
|
||||
assertNotNull(entry.utf8ToString() + " not found", fstEnum.seekExact(entry));
|
||||
}
|
||||
}
|
||||
|
||||
public void testDeDupTails() throws Exception {
|
||||
List<BytesRef> entries = new ArrayList<>();
|
||||
for (int i = 0; i < 1000000; i += 4) {
|
||||
byte[] b = new byte[3];
|
||||
int val = i;
|
||||
for (int j = b.length - 1; j >= 0; --j) {
|
||||
b[j] = (byte) (val & 0xff);
|
||||
val >>= 8;
|
||||
}
|
||||
entries.add(new BytesRef(b));
|
||||
}
|
||||
long size = buildFST(entries).ramBytesUsed();
|
||||
// Size is 1664 when we use only list-encoding. We were previously failing to ever de-dup
|
||||
// arrays-with-gaps, which led this case to blow up.
|
||||
assertTrue(size < 3000);
|
||||
//printf("fst size = %d bytes", size);
|
||||
}
|
||||
|
||||
private FST<Object> buildFST(List<BytesRef> entries) throws Exception {
|
||||
final Outputs<Object> outputs = NoOutputs.getSingleton();
|
||||
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
|
||||
BytesRef last = null;
|
||||
for (BytesRef entry : entries) {
|
||||
if (entry.equals(last) == false) {
|
||||
b.add(Util.toIntsRef(entry, new IntsRefBuilder()), outputs.getNoOutput());
|
||||
}
|
||||
last = entry;
|
||||
}
|
||||
FST<Object> fst = b.finish();
|
||||
return fst;
|
||||
}
|
||||
|
||||
private static void printf(String format, Object ... values) {
|
||||
System.out.println(String.format(Locale.ROOT, format, values));
|
||||
}
|
||||
|
||||
private static long nsToMs(long ns) {
|
||||
return ns / 1_000_000;
|
||||
}
|
||||
|
||||
public static void main(String... args) throws Exception {
|
||||
byte[] buf = Files.readAllBytes(Paths.get(args[0]));
|
||||
DataInput in = new ByteArrayDataInput(buf);
|
||||
FST<BytesRef> fst = new FST<>(in, ByteSequenceOutputs.getSingleton());
|
||||
BytesRefFSTEnum<BytesRef> fstEnum = new BytesRefFSTEnum<>(fst);
|
||||
int sparseArrayArcCount = 0, directArrayArcCount = 0, listArcCount = 0;
|
||||
while(fstEnum.next() != null) {
|
||||
if (fstEnum.arcs[fstEnum.upto].bytesPerArc() == 0) {
|
||||
listArcCount ++;
|
||||
} else if (fstEnum.arcs[fstEnum.upto].arcIdx() == Integer.MIN_VALUE) {
|
||||
directArrayArcCount ++;
|
||||
} else {
|
||||
sparseArrayArcCount ++;
|
||||
}
|
||||
}
|
||||
System.out.println("direct arcs = " + directArrayArcCount + ", sparse arcs = " + sparseArrayArcCount +
|
||||
" list arcs = " + listArcCount);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,212 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.util.fst;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IntsRefBuilder;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class TestFstDirectAddressing extends LuceneTestCase {
|
||||
|
||||
public void testDenseWithGap() throws Exception {
|
||||
List<String> words = Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm");
|
||||
List<BytesRef> entries = new ArrayList<>();
|
||||
for (String word : words) {
|
||||
entries.add(new BytesRef(word.getBytes(StandardCharsets.US_ASCII)));
|
||||
}
|
||||
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(buildFST(entries));
|
||||
for (BytesRef entry : entries) {
|
||||
assertNotNull(entry.utf8ToString() + " not found", fstEnum.seekExact(entry));
|
||||
}
|
||||
}
|
||||
|
||||
public void testDeDupTails() throws Exception {
|
||||
List<BytesRef> entries = new ArrayList<>();
|
||||
for (int i = 0; i < 1000000; i += 4) {
|
||||
byte[] b = new byte[3];
|
||||
int val = i;
|
||||
for (int j = b.length - 1; j >= 0; --j) {
|
||||
b[j] = (byte) (val & 0xff);
|
||||
val >>= 8;
|
||||
}
|
||||
entries.add(new BytesRef(b));
|
||||
}
|
||||
long size = buildFST(entries).ramBytesUsed();
|
||||
// Size is 1648 when we use only list-encoding. We were previously failing to ever de-dup
|
||||
// direct addressing, which led this case to blow up.
|
||||
assertTrue(size <= 1080);
|
||||
//printf("fst size = %d bytes", size);
|
||||
}
|
||||
|
||||
public void testWorstCaseForDirectAddressing() throws Exception {
|
||||
// This test will fail if there is more than 1% memory increase with direct addressing in this worst case.
|
||||
final double MEMORY_INCREASE_LIMIT_PERCENT = 1d;
|
||||
final int NUM_WORDS = 1000000;
|
||||
|
||||
// Generate words with specially crafted bytes.
|
||||
Set<BytesRef> wordSet = new HashSet<>();
|
||||
for (int i = 0; i < NUM_WORDS; ++i) {
|
||||
byte[] b = new byte[5];
|
||||
random().nextBytes(b);
|
||||
for (int j = 0; j < b.length; ++j) {
|
||||
b[j] &= 0xfc; // Make this byte a multiple of 4, so sibling labels are at least 4 apart and direct-addressing nodes are heavily oversized.
|
||||
}
|
||||
wordSet.add(new BytesRef(b));
|
||||
}
|
||||
List<BytesRef> wordList = new ArrayList<>(wordSet);
|
||||
Collections.sort(wordList);
|
||||
|
||||
// Disable direct addressing and measure the FST size.
|
||||
Builder<Object> builder = createBuilder(-1f);
|
||||
FST<Object> fst = buildFST(wordList, builder);
|
||||
long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed();
|
||||
|
||||
// Enable direct addressing and measure the FST size.
|
||||
builder = createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
|
||||
fst = buildFST(wordList, builder);
|
||||
long ramBytesUsed = fst.ramBytesUsed();
|
||||
|
||||
// Compute the size increase in percents.
|
||||
double directAddressingMemoryIncreasePercent = ((double) ramBytesUsed / ramBytesUsedNoDirectAddressing - 1) * 100;
|
||||
|
||||
// printStats(builder, ramBytesUsed, directAddressingMemoryIncreasePercent);
|
||||
|
||||
// Verify the FST size does not exceed the limit.
|
||||
assertTrue("FST size exceeds limit, size = " + ramBytesUsed
|
||||
+ ", increase = " + directAddressingMemoryIncreasePercent + " %"
|
||||
+ ", limit = " + MEMORY_INCREASE_LIMIT_PERCENT + " %",
|
||||
directAddressingMemoryIncreasePercent < MEMORY_INCREASE_LIMIT_PERCENT);
|
||||
}
|
||||
|
||||
private static void printStats(Builder<Object> builder, long ramBytesUsed, double directAddressingMemoryIncreasePercent) {
|
||||
System.out.println("directAddressingMaxOversizingFactor = " + builder.getDirectAddressingMaxOversizingFactor());
|
||||
System.out.println("ramBytesUsed = "
|
||||
+ String.format(Locale.ENGLISH, "%.2f MB", ramBytesUsed / 1024d / 1024d)
|
||||
+ String.format(Locale.ENGLISH, " (%.2f %% increase with direct addressing)", directAddressingMemoryIncreasePercent));
|
||||
System.out.println("num nodes = " + builder.nodeCount);
|
||||
long fixedLengthArcNodeCount = builder.directAddressingNodeCount + builder.binarySearchNodeCount;
|
||||
System.out.println("num fixed-length-arc nodes = " + fixedLengthArcNodeCount
|
||||
+ String.format(Locale.ENGLISH, " (%.2f %% of all nodes)",
|
||||
((double) fixedLengthArcNodeCount / builder.nodeCount * 100)));
|
||||
System.out.println("num binary-search nodes = " + (builder.binarySearchNodeCount)
|
||||
+ String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)",
|
||||
((double) (builder.binarySearchNodeCount) / fixedLengthArcNodeCount * 100)));
|
||||
System.out.println("num direct-addressing nodes = " + (builder.directAddressingNodeCount)
|
||||
+ String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)",
|
||||
((double) (builder.directAddressingNodeCount) / fixedLengthArcNodeCount * 100)));
|
||||
}
|
||||
|
||||
private static Builder<Object> createBuilder(float directAddressingMaxOversizingFactor) {
|
||||
return new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, NoOutputs.getSingleton(), true, 15)
|
||||
.setDirectAddressingMaxOversizingFactor(directAddressingMaxOversizingFactor);
|
||||
}
|
||||
|
||||
private FST<Object> buildFST(List<BytesRef> entries) throws Exception {
|
||||
return buildFST(entries, createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR));
|
||||
}
|
||||
|
||||
private static FST<Object> buildFST(List<BytesRef> entries, Builder<Object> builder) throws Exception {
|
||||
BytesRef last = null;
|
||||
for (BytesRef entry : entries) {
|
||||
if (entry.equals(last) == false) {
|
||||
builder.add(Util.toIntsRef(entry, new IntsRefBuilder()), NoOutputs.getSingleton().getNoOutput());
|
||||
}
|
||||
last = entry;
|
||||
}
|
||||
return builder.finish();
|
||||
}
|
||||
|
||||
public static void main(String... args) throws Exception {
|
||||
if (args.length < 2) {
|
||||
throw new IllegalArgumentException("Missing argument");
|
||||
}
|
||||
if (args[0].equals("-countFSTArcs")) {
|
||||
countFSTArcs(args[1]);
|
||||
} else if (args[0].equals("-measureFSTOversizing")) {
|
||||
measureFSTOversizing(args[1]);
|
||||
} else {
|
||||
throw new IllegalArgumentException("Invalid argument " + args[0]);
|
||||
}
|
||||
}
|
||||
|
||||
private static void countFSTArcs(String FSTFilePath) throws IOException {
|
||||
byte[] buf = Files.readAllBytes(Paths.get(FSTFilePath));
|
||||
DataInput in = new ByteArrayDataInput(buf);
|
||||
FST<BytesRef> fst = new FST<>(in, ByteSequenceOutputs.getSingleton());
|
||||
BytesRefFSTEnum<BytesRef> fstEnum = new BytesRefFSTEnum<>(fst);
|
||||
int binarySearchArcCount = 0, directAddressingArcCount = 0, listArcCount = 0;
|
||||
while(fstEnum.next() != null) {
|
||||
if (fstEnum.arcs[fstEnum.upto].bytesPerArc() == 0) {
|
||||
listArcCount ++;
|
||||
} else if (fstEnum.arcs[fstEnum.upto].nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
|
||||
directAddressingArcCount ++;
|
||||
} else {
|
||||
binarySearchArcCount ++;
|
||||
}
|
||||
}
|
||||
System.out.println("direct addressing arcs = " + directAddressingArcCount
|
||||
+ ", binary search arcs = " + binarySearchArcCount
|
||||
+ " list arcs = " + listArcCount);
|
||||
}
|
||||
|
||||
private static void measureFSTOversizing(String wordsFilePath) throws Exception {
|
||||
final int MAX_NUM_WORDS = 1000000;
|
||||
|
||||
// Read real english words.
|
||||
List<BytesRef> wordList = new ArrayList<>();
|
||||
try (BufferedReader reader = Files.newBufferedReader(Paths.get(wordsFilePath))) {
|
||||
while (wordList.size() < MAX_NUM_WORDS) {
|
||||
String word = reader.readLine();
|
||||
if (word == null) {
|
||||
break;
|
||||
}
|
||||
wordList.add(new BytesRef(word));
|
||||
}
|
||||
}
|
||||
Collections.sort(wordList);
|
||||
|
||||
// Disable direct addressing and measure the FST size.
|
||||
Builder<Object> builder = createBuilder(-1f);
|
||||
FST<Object> fst = buildFST(wordList, builder);
|
||||
long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed();
|
||||
|
||||
// Enable direct addressing and measure the FST size.
|
||||
builder = createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
|
||||
fst = buildFST(wordList, builder);
|
||||
long ramBytesUsed = fst.ramBytesUsed();
|
||||
|
||||
// Compute the size increase in percents.
|
||||
double directAddressingMemoryIncreasePercent = ((double) ramBytesUsed / ramBytesUsedNoDirectAddressing - 1) * 100;
|
||||
|
||||
printStats(builder, ramBytesUsed, directAddressingMemoryIncreasePercent);
|
||||
}
|
||||
}
|
|
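For reference (not from the patch): the Builder javadoc points at this class as the evaluation tool. With a suitable test classpath, its main() can be driven programmatically like this; the paths are placeholders.

    // main() accepts either flag, as parsed above, and declares "throws Exception".
    TestFstDirectAddressing.main("-countFSTArcs", "/path/to/fst.bin");
    TestFstDirectAddressing.main("-measureFSTOversizing", "/path/to/english-words.txt");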
@ -27,10 +27,10 @@ import org.apache.lucene.util.LuceneTestCase;
|
|||
public class TestUtil extends LuceneTestCase {
|
||||
|
||||
public void testBinarySearch() throws Exception {
|
||||
// Creates a node with 8 arcs spanning (z-A) = 57 chars that will be encoded as a sparse array (no gaps)
|
||||
// requiring binary search
|
||||
// Create a node with 8 arcs spanning (z-A) and ensure it is encoded as a packed array
|
||||
// requiring binary search.
|
||||
List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T", "z");
|
||||
FST<Object> fst = buildFST(letters, true);
|
||||
FST<Object> fst = buildFST(letters, true, false);
|
||||
FST.Arc<Object> arc = fst.getFirstArc(new FST.Arc<>());
|
||||
arc = fst.readFirstTargetArc(arc, arc, fst.getBytesReader());
|
||||
for (int i = 0; i < letters.size(); i++) {
|
||||
|
@ -47,21 +47,21 @@ public class TestUtil extends LuceneTestCase {
|
|||
|
||||
public void testReadCeilArcPackedArray() throws Exception {
|
||||
List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T", "z");
|
||||
verifyReadCeilArc(letters, true);
|
||||
verifyReadCeilArc(letters, true, false);
|
||||
}
|
||||
|
||||
public void testReadCeilArcArrayWithGaps() throws Exception {
|
||||
List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T");
|
||||
verifyReadCeilArc(letters, true);
|
||||
verifyReadCeilArc(letters, true, true);
|
||||
}
|
||||
|
||||
public void testReadCeilArcList() throws Exception {
|
||||
List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T", "z");
|
||||
verifyReadCeilArc(letters, false);
|
||||
verifyReadCeilArc(letters, false, false);
|
||||
}
|
||||
|
||||
private void verifyReadCeilArc(List<String> letters, boolean allowArrayArcs) throws Exception {
|
||||
FST<Object> fst = buildFST(letters, allowArrayArcs);
|
||||
private void verifyReadCeilArc(List<String> letters, boolean allowArrayArcs, boolean allowDirectAddressing) throws Exception {
|
||||
FST<Object> fst = buildFST(letters, allowArrayArcs, allowDirectAddressing);
|
||||
FST.Arc<Object> first = fst.getFirstArc(new FST.Arc<>());
|
||||
FST.Arc<Object> arc = new FST.Arc<>();
|
||||
FST.BytesReader in = fst.getBytesReader();
|
||||
|
@ -81,9 +81,12 @@ public class TestUtil extends LuceneTestCase {
|
|||
assertNull(Util.readCeilArc('Z', fst, arc, arc, in));
|
||||
}
|
||||
|
||||
private FST<Object> buildFST(List<String> words, boolean allowArrayArcs) throws Exception {
|
||||
private FST<Object> buildFST(List<String> words, boolean allowArrayArcs, boolean allowDirectAddressing) throws Exception {
|
||||
final Outputs<Object> outputs = NoOutputs.getSingleton();
|
||||
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, allowArrayArcs, 15);
|
||||
if (!allowDirectAddressing) {
|
||||
b.setDirectAddressingMaxOversizingFactor(-1f);
|
||||
}
|
||||
|
||||
for (String word : words) {
|
||||
b.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput());
|
||||
|
|