mirror of https://github.com/apache/lucene.git
LUCENE-8920: Reduce the memory used by direct addressing of arcs (#980)
This commit is contained in:
parent
010fb0b994
commit
5dd9c4c04b
Binary file not shown.
Binary file not shown.
|
@ -1084,7 +1084,7 @@ public final class OrdsSegmentTermsEnum extends BaseTermsEnum {
|
||||||
result.grow(1+upto);
|
result.grow(1+upto);
|
||||||
fr.index.readFirstRealTargetArc(arc.target(), arc, fstReader);
|
fr.index.readFirstRealTargetArc(arc.target(), arc, fstReader);
|
||||||
|
|
||||||
if (arc.bytesPerArc() != 0 && arc.arcIdx() > Integer.MIN_VALUE) {
|
if (arc.bytesPerArc() != 0 && arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
|
||||||
// System.out.println(" array arcs");
|
// System.out.println(" array arcs");
|
||||||
int low = 0;
|
int low = 0;
|
||||||
int high = arc.numArcs() -1;
|
int high = arc.numArcs() -1;
|
||||||
|
|
|
@ -178,4 +178,129 @@ public final class BitUtil {
|
||||||
return ((l >>> 1) ^ -(l & 1));
|
return ((l >>> 1) ^ -(l & 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns whether the bit at given zero-based index is set.
|
||||||
|
* <br>Example: bitIndex 66 means the third bit on the right of the second long.
|
||||||
|
*
|
||||||
|
* @param bits The bits stored in an array of long for efficiency.
|
||||||
|
* @param numLongs The number of longs in {@code bits} to consider.
|
||||||
|
* @param bitIndex The bit zero-based index. It must be greater than or equal to 0,
|
||||||
|
* and strictly less than {@code numLongs * Long.SIZE}.
|
||||||
|
*/
|
||||||
|
public static boolean isBitSet(long[] bits, int numLongs, int bitIndex) {
|
||||||
|
assert numLongs >= 0 && numLongs <= bits.length && bitIndex >= 0 && bitIndex < numLongs * Long.SIZE
|
||||||
|
: "bitIndex=" + bitIndex + " numLongs=" + numLongs + " bits.length=" + bits.length;
|
||||||
|
return (bits[bitIndex / Long.SIZE] & (1L << bitIndex)) != 0; // Shifts are mod 64.
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Counts all bits set in the provided longs.
|
||||||
|
*
|
||||||
|
* @param bits The bits stored in an array of long for efficiency.
|
||||||
|
* @param numLongs The number of longs in {@code bits} to consider.
|
||||||
|
*/
|
||||||
|
public static int countBits(long[] bits, int numLongs) {
|
||||||
|
assert numLongs >= 0 && numLongs <= bits.length
|
||||||
|
: "numLongs=" + numLongs + " bits.length=" + bits.length;
|
||||||
|
int bitCount = 0;
|
||||||
|
for (int i = 0; i < numLongs; i++) {
|
||||||
|
bitCount += Long.bitCount(bits[i]);
|
||||||
|
}
|
||||||
|
return bitCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Counts the bits set up to the given bit zero-based index, exclusive.
|
||||||
|
* <br>In other words, how many 1s there are up to the bit at the given index excluded.
|
||||||
|
* <br>Example: bitIndex 66 means the third bit on the right of the second long.
|
||||||
|
*
|
||||||
|
* @param bits The bits stored in an array of long for efficiency.
|
||||||
|
* @param numLongs The number of longs in {@code bits} to consider.
|
||||||
|
* @param bitIndex The bit zero-based index, exclusive. It must be greater than or equal to 0,
|
||||||
|
* and less than or equal to {@code numLongs * Long.SIZE}.
|
||||||
|
*/
|
||||||
|
public static int countBitsUpTo(long[] bits, int numLongs, int bitIndex) {
|
||||||
|
assert numLongs >= 0 && numLongs <= bits.length && bitIndex >= 0 && bitIndex <= numLongs * Long.SIZE
|
||||||
|
: "bitIndex=" + bitIndex + " numLongs=" + numLongs + " bits.length=" + bits.length;
|
||||||
|
int bitCount = 0;
|
||||||
|
int lastLong = bitIndex / Long.SIZE;
|
||||||
|
for (int i = 0; i < lastLong; i++) {
|
||||||
|
// Count the bits set for all plain longs.
|
||||||
|
bitCount += Long.bitCount(bits[i]);
|
||||||
|
}
|
||||||
|
if (lastLong < numLongs) {
|
||||||
|
// Prepare a mask with 1s on the right up to bitIndex exclusive.
|
||||||
|
long mask = (1L << bitIndex) - 1L; // Shifts are mod 64.
|
||||||
|
// Count the bits set only within the mask part, so up to bitIndex exclusive.
|
||||||
|
bitCount += Long.bitCount(bits[lastLong] & mask);
|
||||||
|
}
|
||||||
|
return bitCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the index of the next bit set following the given bit zero-based index.
|
||||||
|
* <br>For example with bits 100011:
|
||||||
|
* the next bit set after index=-1 is at index=0;
|
||||||
|
* the next bit set after index=0 is at index=1;
|
||||||
|
* the next bit set after index=1 is at index=5;
|
||||||
|
* there is no next bit set after index=5.
|
||||||
|
*
|
||||||
|
* @param bits The bits stored in an array of long for efficiency.
|
||||||
|
* @param numLongs The number of longs in {@code bits} to consider.
|
||||||
|
* @param bitIndex The bit zero-based index. It must be greater than or equal to -1,
|
||||||
|
* and strictly less than {@code numLongs * Long.SIZE}.
|
||||||
|
* @return The zero-based index of the next bit set after the provided {@code bitIndex};
|
||||||
|
* or -1 if none.
|
||||||
|
*/
|
||||||
|
public static int nextBitSet(long[] bits, int numLongs, int bitIndex) {
|
||||||
|
assert numLongs >= 0 && numLongs <= bits.length && bitIndex >= -1 && bitIndex < numLongs * Long.SIZE
|
||||||
|
: "bitIndex=" + bitIndex + " numLongs=" + numLongs + " bits.length=" + bits.length;
|
||||||
|
int longIndex = bitIndex / Long.SIZE;
|
||||||
|
// Prepare a mask with 1s on the left down to bitIndex exclusive.
|
||||||
|
long mask = -(1L << (bitIndex + 1)); // Shifts are mod 64.
|
||||||
|
long l = mask == -1 && bitIndex != -1 ? 0 : bits[longIndex] & mask;
|
||||||
|
while (l == 0) {
|
||||||
|
if (++longIndex == numLongs) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
l = bits[longIndex];
|
||||||
|
}
|
||||||
|
return Long.numberOfTrailingZeros(l) + longIndex * 64;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the index of the previous bit set preceding the given bit zero-based index.
|
||||||
|
* <br>For example with bits 100011:
|
||||||
|
* there is no previous bit set before index=0;
|
||||||
|
* the previous bit set before index=1 is at index=0;
|
||||||
|
* the previous bit set before index=5 is at index=1;
|
||||||
|
* the previous bit set before index=64 is at index=5.
|
||||||
|
*
|
||||||
|
* @param bits The bits stored in an array of long for efficiency.
|
||||||
|
* @param numLongs The number of longs in {@code bits} to consider.
|
||||||
|
* @param bitIndex The bit zero-based index. It must be greater than or equal to 0,
|
||||||
|
* and less than or equal to {@code numLongs * Long.SIZE}.
|
||||||
|
* @return The zero-based index of the previous bit set before the provided {@code bitIndex};
|
||||||
|
* or -1 if none.
|
||||||
|
*/
|
||||||
|
public static int previousBitSet(long[] bits, int numLongs, int bitIndex) {
|
||||||
|
assert numLongs >= 0 && numLongs <= bits.length && bitIndex >= 0 && bitIndex <= numLongs * Long.SIZE
|
||||||
|
: "bitIndex=" + bitIndex + " numLongs=" + numLongs + " bits.length=" + bits.length;
|
||||||
|
int longIndex = bitIndex / Long.SIZE;
|
||||||
|
long l;
|
||||||
|
if (longIndex == numLongs) {
|
||||||
|
l = 0;
|
||||||
|
} else {
|
||||||
|
// Prepare a mask with 1s on the right up to bitIndex exclusive.
|
||||||
|
long mask = (1L << bitIndex) - 1L; // Shifts are mod 64.
|
||||||
|
l = bits[longIndex] & mask;
|
||||||
|
}
|
||||||
|
while (l == 0) {
|
||||||
|
if (longIndex-- == 0) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
l = bits[longIndex];
|
||||||
|
}
|
||||||
|
return 63 - Long.numberOfLeadingZeros(l) + longIndex * 64;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.util.fst;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
|
@ -50,6 +51,30 @@ import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
|
||||||
|
|
||||||
public class Builder<T> {
|
public class Builder<T> {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default oversizing factor used to decide whether to encode a node with direct addressing or binary search.
|
||||||
|
* Default is 1: ensure no oversizing on average.
|
||||||
|
* <p>
|
||||||
|
* This factor does not determine whether to encode a node with a list of variable length arcs or with
|
||||||
|
* fixed length arcs. It only determines the effective encoding of a node that is already known to be
|
||||||
|
* encoded with fixed length arcs.
|
||||||
|
* See {@code FST.shouldExpandNodeWithFixedLengthArcs()}
|
||||||
|
* and {@code FST.shouldExpandNodeWithDirectAddressing()}.
|
||||||
|
* <p>
|
||||||
|
* For English words we measured 217K nodes, only 3.27% of nodes are encoded with fixed length arcs,
|
||||||
|
* and 99.99% of them with direct addressing. Overall FST memory reduced by 1.67%.
|
||||||
|
* <p>
|
||||||
|
* For worst case we measured 168K nodes, 50% of them are encoded with fixed length arcs,
|
||||||
|
* and 14% of them with direct addressing. Overall FST memory reduced by 0.8%.
|
||||||
|
* <p>
|
||||||
|
* Use {@code TestFstDirectAddressing.main()}
|
||||||
|
* and {@code TestFstDirectAddressing.testWorstCaseForDirectAddressing()}
|
||||||
|
* to evaluate a change.
|
||||||
|
*
|
||||||
|
* @see #setDirectAddressingMaxOversizingFactor
|
||||||
|
*/
|
||||||
|
static final float DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR = 1.0f;
|
||||||
|
|
||||||
private final NodeHash<T> dedupHash;
|
private final NodeHash<T> dedupHash;
|
||||||
final FST<T> fst;
|
final FST<T> fst;
|
||||||
private final T NO_OUTPUT;
|
private final T NO_OUTPUT;
|
||||||
|
@ -83,12 +108,18 @@ public class Builder<T> {
|
||||||
long lastFrozenNode;
|
long lastFrozenNode;
|
||||||
|
|
||||||
// Reused temporarily while building the FST:
|
// Reused temporarily while building the FST:
|
||||||
int[] reusedBytesPerArc = new int[4];
|
int[] numBytesPerArc = new int[4];
|
||||||
|
int[] numLabelBytesPerArc = new int[numBytesPerArc.length];
|
||||||
|
final FixedLengthArcsBuffer fixedLengthArcsBuffer = new FixedLengthArcsBuffer();
|
||||||
|
|
||||||
long arcCount;
|
long arcCount;
|
||||||
long nodeCount;
|
long nodeCount;
|
||||||
|
long binarySearchNodeCount;
|
||||||
|
long directAddressingNodeCount;
|
||||||
|
|
||||||
boolean allowArrayArcs;
|
boolean allowFixedLengthArcs;
|
||||||
|
float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR;
|
||||||
|
long directAddressingExpansionCredit;
|
||||||
|
|
||||||
BytesStore bytes;
|
BytesStore bytes;
|
||||||
|
|
||||||
|
@ -138,9 +169,9 @@ public class Builder<T> {
|
||||||
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
|
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
|
||||||
* singleton output object.
|
* singleton output object.
|
||||||
*
|
*
|
||||||
* @param allowArrayArcs Pass false to disable the array arc optimization
|
* @param allowFixedLengthArcs Pass false to disable the fixed length arc optimization (binary search or
|
||||||
* while building the FST; this will make the resulting
|
* direct addressing) while building the FST; this will make the resulting FST smaller but slower to
|
||||||
* FST smaller but slower to traverse.
|
* traverse.
|
||||||
*
|
*
|
||||||
* @param bytesPageBits How many bits wide to make each
|
* @param bytesPageBits How many bits wide to make each
|
||||||
* byte[] block in the BytesStore; if you know the FST
|
* byte[] block in the BytesStore; if you know the FST
|
||||||
|
@ -149,12 +180,12 @@ public class Builder<T> {
|
||||||
*/
|
*/
|
||||||
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
|
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
|
||||||
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
|
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
|
||||||
boolean allowArrayArcs, int bytesPageBits) {
|
boolean allowFixedLengthArcs, int bytesPageBits) {
|
||||||
this.minSuffixCount1 = minSuffixCount1;
|
this.minSuffixCount1 = minSuffixCount1;
|
||||||
this.minSuffixCount2 = minSuffixCount2;
|
this.minSuffixCount2 = minSuffixCount2;
|
||||||
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
|
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
|
||||||
this.shareMaxTailLength = shareMaxTailLength;
|
this.shareMaxTailLength = shareMaxTailLength;
|
||||||
this.allowArrayArcs = allowArrayArcs;
|
this.allowFixedLengthArcs = allowFixedLengthArcs;
|
||||||
fst = new FST<>(inputType, outputs, bytesPageBits);
|
fst = new FST<>(inputType, outputs, bytesPageBits);
|
||||||
bytes = fst.bytes;
|
bytes = fst.bytes;
|
||||||
assert bytes != null;
|
assert bytes != null;
|
||||||
|
@ -173,6 +204,27 @@ public class Builder<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Overrides the default maximum oversizing of fixed array allowed to enable direct addressing
|
||||||
|
* of arcs instead of binary search.
|
||||||
|
* <p>
|
||||||
|
* Setting this factor to a negative value (e.g. -1) effectively disables direct addressing,
|
||||||
|
* only binary search nodes will be created.
|
||||||
|
*
|
||||||
|
* @see #DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR
|
||||||
|
*/
|
||||||
|
public Builder<T> setDirectAddressingMaxOversizingFactor(float factor) {
|
||||||
|
directAddressingMaxOversizingFactor = factor;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @see #setDirectAddressingMaxOversizingFactor(float)
|
||||||
|
*/
|
||||||
|
public float getDirectAddressingMaxOversizingFactor() {
|
||||||
|
return directAddressingMaxOversizingFactor;
|
||||||
|
}
|
||||||
|
|
||||||
public long getTermCount() {
|
public long getTermCount() {
|
||||||
return frontier[0].inputCount;
|
return frontier[0].inputCount;
|
||||||
}
|
}
|
||||||
|
@ -639,4 +691,52 @@ public class Builder<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reusable buffer for building nodes with fixed length arcs (binary search or direct addressing).
|
||||||
|
*/
|
||||||
|
static class FixedLengthArcsBuffer {
|
||||||
|
|
||||||
|
// Initial capacity is the max length required for the header of a node with fixed length arcs:
|
||||||
|
// header(byte) + numArcs(vint) + numBytes(vint)
|
||||||
|
private byte[] bytes = new byte[11];
|
||||||
|
private final ByteArrayDataOutput bado = new ByteArrayDataOutput(bytes);
|
||||||
|
|
||||||
|
/** Ensures the capacity of the internal byte array. Enlarges it if needed. */
|
||||||
|
FixedLengthArcsBuffer ensureCapacity(int capacity) {
|
||||||
|
if (bytes.length < capacity) {
|
||||||
|
bytes = new byte[ArrayUtil.oversize(capacity, Byte.BYTES)];
|
||||||
|
bado.reset(bytes);
|
||||||
|
}
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
FixedLengthArcsBuffer resetPosition() {
|
||||||
|
bado.reset(bytes);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
FixedLengthArcsBuffer writeByte(byte b) {
|
||||||
|
bado.writeByte(b);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
FixedLengthArcsBuffer writeVInt(int i) {
|
||||||
|
try {
|
||||||
|
bado.writeVInt(i);
|
||||||
|
} catch (IOException e) { // Never thrown.
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
int getPosition() {
|
||||||
|
return bado.getPosition();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Gets the internal byte array. */
|
||||||
|
byte[] getBytes() {
|
||||||
|
return bytes;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -239,6 +239,27 @@ class BytesStore extends DataOutput implements Accountable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Copies bytes from this store to a target byte array. */
|
||||||
|
public void copyBytes(long src, byte[] dest, int offset, int len) {
|
||||||
|
int blockIndex = (int) (src >> blockBits);
|
||||||
|
int upto = (int) (src & blockMask);
|
||||||
|
byte[] block = blocks.get(blockIndex);
|
||||||
|
while (len > 0) {
|
||||||
|
int chunk = blockSize - upto;
|
||||||
|
if (len <= chunk) {
|
||||||
|
System.arraycopy(block, upto, dest, offset, len);
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
System.arraycopy(block, upto, dest, offset, chunk);
|
||||||
|
blockIndex++;
|
||||||
|
block = blocks.get(blockIndex);
|
||||||
|
upto = 0;
|
||||||
|
len -= chunk;
|
||||||
|
offset += chunk;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/** Writes an int at the absolute position without
|
/** Writes an int at the absolute position without
|
||||||
* changing the current pointer. */
|
* changing the current pointer. */
|
||||||
public void writeInt(long pos, int value) {
|
public void writeInt(long pos, int value) {
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -138,12 +138,13 @@ abstract class FSTEnum<T> {
|
||||||
while(arc != null) {
|
while(arc != null) {
|
||||||
int targetLabel = getTargetLabel();
|
int targetLabel = getTargetLabel();
|
||||||
//System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") vs targetLabel=" + targetLabel);
|
//System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") vs targetLabel=" + targetLabel);
|
||||||
if (arc.bytesPerArc() != 0 && arc.label() != -1) {
|
if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) {
|
||||||
// Arcs are in an array
|
// Arcs are in an array
|
||||||
final FST.BytesReader in = fst.getBytesReader();
|
final FST.BytesReader in = fst.getBytesReader();
|
||||||
if (arc.arcIdx() == Integer.MIN_VALUE) {
|
if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
|
||||||
arc = doSeekCeilArrayWithGaps(arc, targetLabel, in);
|
arc = doSeekCeilArrayDirectAddressing(arc, targetLabel, in);
|
||||||
} else {
|
} else {
|
||||||
|
assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH;
|
||||||
arc = doSeekCeilArrayPacked(arc, targetLabel, in);
|
arc = doSeekCeilArrayPacked(arc, targetLabel, in);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -152,17 +153,12 @@ abstract class FSTEnum<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private FST.Arc<T> doSeekCeilArrayWithGaps(final FST.Arc<T> arc, final int targetLabel, final FST.BytesReader in) throws IOException {
|
private FST.Arc<T> doSeekCeilArrayDirectAddressing(final FST.Arc<T> arc, final int targetLabel, final FST.BytesReader in) throws IOException {
|
||||||
// The array is addressed directly by label and may contain holes.
|
// The array is addressed directly by label, with presence bits to compute the actual arc offset.
|
||||||
|
|
||||||
in.setPosition(arc.posArcsStart());
|
int targetIndex = targetLabel - arc.firstLabel();
|
||||||
in.skipBytes(1);
|
if (targetIndex >= arc.numArcs()) {
|
||||||
int firstLabel = fst.readLabel(in);
|
// Target is beyond the last arc, out of label range.
|
||||||
int arcOffset = targetLabel - firstLabel;
|
|
||||||
if (arcOffset >= arc.numArcs()) {
|
|
||||||
// target is beyond the last arc
|
|
||||||
fst.readArcAtPosition(arc, in, arc.posArcsStart() - (arc.numArcs() - 1) * arc.bytesPerArc());
|
|
||||||
assert arc.isLast();
|
|
||||||
// Dead end (target is after the last arc);
|
// Dead end (target is after the last arc);
|
||||||
// rollback to last fork then push
|
// rollback to last fork then push
|
||||||
upto--;
|
upto--;
|
||||||
|
@ -180,17 +176,13 @@ abstract class FSTEnum<T> {
|
||||||
upto--;
|
upto--;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// TODO: if firstLabel == targetLabel
|
if (targetIndex < 0) {
|
||||||
long pos;
|
targetIndex = -1;
|
||||||
if (arcOffset >= 0) {
|
} else if (arc.bitTable().isBitSet(targetIndex)) {
|
||||||
pos = arc.posArcsStart() - (arc.bytesPerArc() * arcOffset);
|
fst.readArcByDirectAddressing(arc, in, targetIndex);
|
||||||
} else {
|
assert arc.label() == targetLabel;
|
||||||
pos = arc.posArcsStart();
|
|
||||||
}
|
|
||||||
fst.readArcAtPosition(arc, in, pos);
|
|
||||||
if (arc.label() == targetLabel) {
|
|
||||||
// found -- copy pasta from below
|
// found -- copy pasta from below
|
||||||
output[upto] = fst.outputs.add(output[upto-1], arc.output());
|
output[upto] = fst.outputs.add(output[upto - 1], arc.output());
|
||||||
if (targetLabel == FST.END_LABEL) {
|
if (targetLabel == FST.END_LABEL) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
@ -198,7 +190,10 @@ abstract class FSTEnum<T> {
|
||||||
incr();
|
incr();
|
||||||
return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
|
return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
|
||||||
}
|
}
|
||||||
// not found, return the next highest
|
// Not found, return the next arc (ceil).
|
||||||
|
int ceilIndex = arc.bitTable().nextBitSet(targetIndex);
|
||||||
|
assert ceilIndex != -1;
|
||||||
|
fst.readArcByDirectAddressing(arc, in, ceilIndex);
|
||||||
assert arc.label() > targetLabel;
|
assert arc.label() > targetLabel;
|
||||||
pushFirst();
|
pushFirst();
|
||||||
return null;
|
return null;
|
||||||
|
@ -319,9 +314,10 @@ abstract class FSTEnum<T> {
|
||||||
if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) {
|
if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) {
|
||||||
// Arcs are in an array
|
// Arcs are in an array
|
||||||
final FST.BytesReader in = fst.getBytesReader();
|
final FST.BytesReader in = fst.getBytesReader();
|
||||||
if (arc.arcIdx() == Integer.MIN_VALUE) {
|
if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
|
||||||
arc = doSeekFloorArrayWithGaps(arc, targetLabel, in);
|
arc = doSeekFloorArrayDirectAddressing(arc, targetLabel, in);
|
||||||
} else {
|
} else {
|
||||||
|
assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH;
|
||||||
arc = doSeekFloorArrayPacked(arc, targetLabel, in);
|
arc = doSeekFloorArrayPacked(arc, targetLabel, in);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -330,46 +326,25 @@ abstract class FSTEnum<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private FST.Arc<T> doSeekFloorArrayWithGaps(FST.Arc<T> arc, int targetLabel, final FST.BytesReader in) throws IOException {
|
private FST.Arc<T> doSeekFloorArrayDirectAddressing(FST.Arc<T> arc, int targetLabel, FST.BytesReader in) throws IOException {
|
||||||
// The array is addressed directly by label and may contain holes.
|
// The array is addressed directly by label, with presence bits to compute the actual arc offset.
|
||||||
in.setPosition(arc.posArcsStart());
|
|
||||||
in.skipBytes(1);
|
int targetIndex = targetLabel - arc.firstLabel();
|
||||||
int firstLabel = fst.readLabel(in);
|
if (targetIndex < 0) {
|
||||||
int targetOffset = targetLabel - firstLabel;
|
// Before first arc.
|
||||||
if (targetOffset < 0) {
|
return backtrackToFloorArc(arc, targetLabel, in);
|
||||||
//System.out.println(" before first"); Very first arc is after our target TODO: if each
|
} else if (targetIndex >= arc.numArcs()) {
|
||||||
// arc could somehow read the arc just before, we can save this re-scan. The ceil case
|
// After last arc.
|
||||||
// doesn't need this because it reads the next arc instead:
|
fst.readArcByDirectAddressing(arc, in, arc.numArcs() - 1);
|
||||||
while(true) {
|
assert arc.label() < targetLabel;
|
||||||
// First, walk backwards until we find a first arc
|
assert arc.isLast();
|
||||||
// that's before our target label:
|
pushLast();
|
||||||
fst.readFirstTargetArc(getArc(upto-1), arc, fstReader);
|
return null;
|
||||||
if (arc.label() < targetLabel) {
|
|
||||||
// Then, scan forwards to the arc just before
|
|
||||||
// the targetLabel:
|
|
||||||
while(!arc.isLast() && fst.readNextArcLabel(arc, in) < targetLabel) {
|
|
||||||
fst.readNextArc(arc, fstReader);
|
|
||||||
}
|
|
||||||
pushLast();
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
upto--;
|
|
||||||
if (upto == 0) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
targetLabel = getTargetLabel();
|
|
||||||
arc = getArc(upto);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
if (targetOffset >= arc.numArcs()) {
|
// Within label range.
|
||||||
fst.readArcAtPosition(arc, in, arc.posArcsStart() - arc.bytesPerArc() * (arc.numArcs() - 1));
|
if (arc.bitTable().isBitSet(targetIndex)) {
|
||||||
assert arc.isLast();
|
fst.readArcByDirectAddressing(arc, in, targetIndex);
|
||||||
assert arc.label() < targetLabel: "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel;
|
assert arc.label() == targetLabel;
|
||||||
pushLast();
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
fst.readArcAtPosition(arc, in, arc.posArcsStart() - arc.bytesPerArc() * targetOffset);
|
|
||||||
if (arc.label() == targetLabel) {
|
|
||||||
// found -- copy pasta from below
|
// found -- copy pasta from below
|
||||||
output[upto] = fst.outputs.add(output[upto-1], arc.output());
|
output[upto] = fst.outputs.add(output[upto-1], arc.output());
|
||||||
if (targetLabel == FST.END_LABEL) {
|
if (targetLabel == FST.END_LABEL) {
|
||||||
|
@ -379,18 +354,99 @@ abstract class FSTEnum<T> {
|
||||||
incr();
|
incr();
|
||||||
return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
|
return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
|
||||||
}
|
}
|
||||||
// Scan backwards to find a floor arc that is not missing
|
// Scan backwards to find a floor arc.
|
||||||
for (long arcOffset = arc.posArcsStart() - targetOffset * arc.bytesPerArc(); arcOffset <= arc.posArcsStart(); arcOffset += arc.bytesPerArc()) {
|
int floorIndex = arc.bitTable().previousBitSet(targetIndex);
|
||||||
// TODO: we can do better here by skipping missing arcs
|
assert floorIndex != -1;
|
||||||
fst.readArcAtPosition(arc, in, arcOffset);
|
fst.readArcByDirectAddressing(arc, in, floorIndex);
|
||||||
if (arc.label() < targetLabel) {
|
assert arc.label() < targetLabel;
|
||||||
assert arc.isLast() || fst.readNextArcLabel(arc, in) > targetLabel;
|
assert arc.isLast() || fst.readNextArcLabel(arc, in) > targetLabel;
|
||||||
pushLast();
|
pushLast();
|
||||||
return null;
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Backtracks until it finds a node whose first arc is before our target label.
|
||||||
|
* Then on the node, finds the arc just before the targetLabel.
|
||||||
|
*
|
||||||
|
* @return null to continue the seek floor recursion loop.
|
||||||
|
*/
|
||||||
|
private FST.Arc<T> backtrackToFloorArc(FST.Arc<T> arc, int targetLabel, final FST.BytesReader in) throws IOException {
|
||||||
|
while (true) {
|
||||||
|
// First, walk backwards until we find a node whose first arc is before our target label.
|
||||||
|
fst.readFirstTargetArc(getArc(upto-1), arc, fstReader);
|
||||||
|
if (arc.label() < targetLabel) {
|
||||||
|
// Then on this node, find the arc just before the targetLabel.
|
||||||
|
if (!arc.isLast()) {
|
||||||
|
if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) {
|
||||||
|
if (arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
|
||||||
|
findNextFloorArcBinarySearch(arc, targetLabel, in);
|
||||||
|
} else {
|
||||||
|
assert arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING;
|
||||||
|
findNextFloorArcDirectAddressing(arc, targetLabel, in);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
while (!arc.isLast() && fst.readNextArcLabel(arc, in) < targetLabel) {
|
||||||
|
fst.readNextArc(arc, fstReader);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert arc.label() < targetLabel;
|
||||||
|
assert arc.isLast() || fst.readNextArcLabel(arc, in) >= targetLabel;
|
||||||
|
pushLast();
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
upto--;
|
||||||
|
if (upto == 0) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
targetLabel = getTargetLabel();
|
||||||
|
arc = getArc(upto);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Finds and reads an arc on the current node whose label is strictly less than the given label.
|
||||||
|
* Skips the first arc, finds next floor arc; or none if the floor arc is the first
|
||||||
|
* arc itself (in this case it has already been read).
|
||||||
|
* <p>
|
||||||
|
* Precondition: the given arc is the first arc of the node.
|
||||||
|
*/
|
||||||
|
private void findNextFloorArcDirectAddressing(FST.Arc<T> arc, int targetLabel, final FST.BytesReader in) throws IOException {
|
||||||
|
assert arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING;
|
||||||
|
assert arc.label() != FST.END_LABEL;
|
||||||
|
assert arc.label() == arc.firstLabel();
|
||||||
|
if (arc.numArcs() > 1) {
|
||||||
|
int targetIndex = targetLabel - arc.firstLabel();
|
||||||
|
assert targetIndex >= 0;
|
||||||
|
if (targetIndex >= arc.numArcs()) {
|
||||||
|
// Beyond last arc. Take last arc.
|
||||||
|
fst.readArcByDirectAddressing(arc, in, arc.numArcs() - 1);
|
||||||
|
} else {
|
||||||
|
// Take the preceding arc, even if the target is present.
|
||||||
|
int floorIndex = arc.bitTable().previousBitSet(targetIndex);
|
||||||
|
if (floorIndex > 0) {
|
||||||
|
fst.readArcByDirectAddressing(arc, in, floorIndex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
assert false: "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel;
|
}
|
||||||
return arc; // unreachable
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Same as {@link #findNextFloorArcDirectAddressing} for binary search node.
|
||||||
|
*/
|
||||||
|
private void findNextFloorArcBinarySearch(FST.Arc<T> arc, int targetLabel, FST.BytesReader in) throws IOException {
|
||||||
|
assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH;
|
||||||
|
assert arc.label() != FST.END_LABEL;
|
||||||
|
assert arc.arcIdx() == 0;
|
||||||
|
if (arc.numArcs() > 1) {
|
||||||
|
int idx = Util.binarySearch(fst, arc, targetLabel);
|
||||||
|
assert idx != -1;
|
||||||
|
if (idx > 1) {
|
||||||
|
fst.readArcByIndex(arc, in, idx - 1);
|
||||||
|
} else if (idx < -2) {
|
||||||
|
fst.readArcByIndex(arc, in, -2 - idx);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -412,34 +468,10 @@ abstract class FSTEnum<T> {
|
||||||
incr();
|
incr();
|
||||||
return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
|
return fst.readFirstTargetArc(arc, getArc(upto), fstReader);
|
||||||
} else if (idx == -1) {
|
} else if (idx == -1) {
|
||||||
//System.out.println(" before first");
|
// Before first arc.
|
||||||
// Very first arc is after our target
|
return backtrackToFloorArc(arc, targetLabel, in);
|
||||||
// TODO: if each arc could somehow read the arc just
|
|
||||||
// before, we can save this re-scan. The ceil case
|
|
||||||
// doesn't need this because it reads the next arc
|
|
||||||
// instead:
|
|
||||||
while(true) {
|
|
||||||
// First, walk backwards until we find a first arc
|
|
||||||
// that's before our target label:
|
|
||||||
fst.readFirstTargetArc(getArc(upto-1), arc, fstReader);
|
|
||||||
if (arc.label() < targetLabel) {
|
|
||||||
// Then, scan forwards to the arc just before
|
|
||||||
// the targetLabel:
|
|
||||||
while(!arc.isLast() && fst.readNextArcLabel(arc, in) < targetLabel) {
|
|
||||||
fst.readNextArc(arc, fstReader);
|
|
||||||
}
|
|
||||||
pushLast();
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
upto--;
|
|
||||||
if (upto == 0) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
targetLabel = getTargetLabel();
|
|
||||||
arc = getArc(upto);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
// There is a floor arc; idx will be {@code -1 - (floor + 1)}.
|
// There is a floor arc; idx will be (-1 - (floor + 1)).
|
||||||
fst.readArcByIndex(arc, in, -2 - idx);
|
fst.readArcByIndex(arc, in, -2 - idx);
|
||||||
assert arc.isLast() || fst.readNextArcLabel(arc, in) > targetLabel;
|
assert arc.isLast() || fst.readNextArcLabel(arc, in) > targetLabel;
|
||||||
assert arc.label() < targetLabel: "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel;
|
assert arc.label() < targetLabel: "arc.label=" + arc.label() + " vs targetLabel=" + targetLabel;
|
||||||
|
|
|
@ -41,9 +41,22 @@ final class NodeHash<T> {
|
||||||
|
|
||||||
private boolean nodesEqual(Builder.UnCompiledNode<T> node, long address) throws IOException {
|
private boolean nodesEqual(Builder.UnCompiledNode<T> node, long address) throws IOException {
|
||||||
fst.readFirstRealTargetArc(address, scratchArc, in);
|
fst.readFirstRealTargetArc(address, scratchArc, in);
|
||||||
if (scratchArc.isPackedArray() && node.numArcs != scratchArc.numArcs()) {
|
|
||||||
return false;
|
// Fail fast for a node with fixed length arcs.
|
||||||
|
if (scratchArc.bytesPerArc() != 0) {
|
||||||
|
if (scratchArc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
|
||||||
|
if (node.numArcs != scratchArc.numArcs()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
assert scratchArc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING;
|
||||||
|
if ((node.arcs[node.numArcs - 1].label - node.arcs[0].label + 1) != scratchArc.numArcs()
|
||||||
|
|| node.numArcs != scratchArc.bitTable().countBits()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int arcUpto=0; arcUpto < node.numArcs; arcUpto++) {
|
for(int arcUpto=0; arcUpto < node.numArcs; arcUpto++) {
|
||||||
final Builder.Arc<T> arc = node.arcs[arcUpto];
|
final Builder.Arc<T> arc = node.arcs[arcUpto];
|
||||||
if (arc.label != scratchArc.label() ||
|
if (arc.label != scratchArc.label() ||
|
||||||
|
|
|
@ -151,7 +151,7 @@ public final class Util {
|
||||||
|
|
||||||
fst.readFirstRealTargetArc(arc.target(), arc, in);
|
fst.readFirstRealTargetArc(arc.target(), arc, in);
|
||||||
|
|
||||||
if (arc.bytesPerArc() != 0 && arc.arcIdx() > Integer.MIN_VALUE) {
|
if (arc.bytesPerArc() != 0 && arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
|
||||||
|
|
||||||
int low = 0;
|
int low = 0;
|
||||||
int high = arc.numArcs() -1;
|
int high = arc.numArcs() -1;
|
||||||
|
@ -940,18 +940,27 @@ public final class Util {
|
||||||
}
|
}
|
||||||
fst.readFirstTargetArc(follow, arc, in);
|
fst.readFirstTargetArc(follow, arc, in);
|
||||||
if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) {
|
if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) {
|
||||||
if (arc.arcIdx() == Integer.MIN_VALUE) {
|
if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
|
||||||
// Arcs are in an array-with-gaps
|
// Fixed length arcs in a direct addressing node.
|
||||||
int offset = label - arc.label();
|
int targetIndex = label - arc.label();
|
||||||
if (offset >= arc.numArcs()) {
|
if (targetIndex >= arc.numArcs()) {
|
||||||
return null;
|
return null;
|
||||||
} else if (offset < 0) {
|
} else if (targetIndex < 0) {
|
||||||
return arc;
|
return arc;
|
||||||
} else {
|
} else {
|
||||||
return fst.readArcAtPosition(arc, in, arc.posArcsStart() - offset * arc.bytesPerArc());
|
if (arc.bitTable().isBitSet(targetIndex)) {
|
||||||
|
fst.readArcByDirectAddressing(arc, in, targetIndex);
|
||||||
|
assert arc.label() == label;
|
||||||
|
} else {
|
||||||
|
int ceilIndex = arc.bitTable().nextBitSet(targetIndex);
|
||||||
|
assert ceilIndex != -1;
|
||||||
|
fst.readArcByDirectAddressing(arc, in, ceilIndex);
|
||||||
|
assert arc.label() > label;
|
||||||
|
}
|
||||||
|
return arc;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Arcs are packed array -- use binary search to find the target.
|
// Fixed length arcs in a binary search node.
|
||||||
int idx = binarySearch(fst, arc, label);
|
int idx = binarySearch(fst, arc, label);
|
||||||
if (idx >= 0) {
|
if (idx >= 0) {
|
||||||
return fst.readArcByIndex(arc, in, idx);
|
return fst.readArcByIndex(arc, in, idx);
|
||||||
|
@ -964,7 +973,8 @@ public final class Util {
|
||||||
return fst.readArcByIndex(arc, in , idx);
|
return fst.readArcByIndex(arc, in , idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Linear scan
|
// Variable length arcs in a linear scan list,
|
||||||
|
// or special arc with label == FST.END_LABEL.
|
||||||
fst.readFirstRealTargetArc(follow.target(), arc, in);
|
fst.readFirstRealTargetArc(follow.target(), arc, in);
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
|
@ -995,6 +1005,7 @@ public final class Util {
|
||||||
* @throws IOException when the FST reader does
|
* @throws IOException when the FST reader does
|
||||||
*/
|
*/
|
||||||
static <T> int binarySearch(FST<T> fst, FST.Arc<T> arc, int targetLabel) throws IOException {
|
static <T> int binarySearch(FST<T> fst, FST.Arc<T> arc, int targetLabel) throws IOException {
|
||||||
|
assert arc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH : "Arc is not encoded as packed array for binary search (nodeFlags=" + arc.nodeFlags() + ")";
|
||||||
BytesReader in = fst.getBytesReader();
|
BytesReader in = fst.getBytesReader();
|
||||||
int low = arc.arcIdx();
|
int low = arc.arcIdx();
|
||||||
int mid = 0;
|
int mid = 0;
|
||||||
|
|
|
@ -0,0 +1,85 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.util;
|
||||||
|
|
||||||
|
public class TestBitUtil extends LuceneTestCase {
|
||||||
|
|
||||||
|
public void testNextBitSet() {
|
||||||
|
for (int i = 0; i < 10000; i++) {
|
||||||
|
long[] bits = buildRandomBits();
|
||||||
|
int numLong = bits.length - 1;
|
||||||
|
|
||||||
|
// Verify nextBitSet with countBitsUpTo for all bit indexes.
|
||||||
|
for (int bitIndex = -1; bitIndex < 64 * numLong; bitIndex++) {
|
||||||
|
int nextIndex = BitUtil.nextBitSet(bits, numLong, bitIndex);
|
||||||
|
if (nextIndex == -1) {
|
||||||
|
assertEquals("No next bit set, so expected no bit count diff"
|
||||||
|
+ " (i=" + i + " bitIndex=" + bitIndex + ")",
|
||||||
|
BitUtil.countBitsUpTo(bits, numLong, bitIndex + 1), BitUtil.countBits(bits, numLong));
|
||||||
|
} else {
|
||||||
|
assertTrue("Expected next bit set at nextIndex=" + nextIndex
|
||||||
|
+ " (i=" + i + " bitIndex=" + bitIndex + ")",
|
||||||
|
BitUtil.isBitSet(bits, numLong, nextIndex));
|
||||||
|
assertEquals("Next bit set at nextIndex=" + nextIndex
|
||||||
|
+ " so expected bit count diff of 1"
|
||||||
|
+ " (i=" + i + " bitIndex=" + bitIndex + ")",
|
||||||
|
BitUtil.countBitsUpTo(bits, numLong, bitIndex + 1) + 1,
|
||||||
|
BitUtil.countBitsUpTo(bits, numLong, nextIndex + 1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testPreviousBitSet() {
|
||||||
|
for (int i = 0; i < 10000; i++) {
|
||||||
|
long[] bits = buildRandomBits();
|
||||||
|
int numLong = bits.length - 1;
|
||||||
|
|
||||||
|
// Verify previousBitSet with countBitsUpTo for all bit indexes.
|
||||||
|
for (int bitIndex = 0; bitIndex <= 64 * numLong; bitIndex++) {
|
||||||
|
int previousIndex = BitUtil.previousBitSet(bits, numLong, bitIndex);
|
||||||
|
if (previousIndex == -1) {
|
||||||
|
assertEquals("No previous bit set, so expected bit count 0"
|
||||||
|
+ " (i=" + i + " bitIndex=" + bitIndex + ")",
|
||||||
|
0, BitUtil.countBitsUpTo(bits, numLong, bitIndex));
|
||||||
|
} else {
|
||||||
|
assertTrue("Expected previous bit set at previousIndex=" + previousIndex
|
||||||
|
+ " (i=" + i + " bitIndex=" + bitIndex + ")",
|
||||||
|
BitUtil.isBitSet(bits, numLong, previousIndex));
|
||||||
|
int bitCount = BitUtil.countBitsUpTo(bits, numLong, Math.min(bitIndex + 1, numLong * Long.SIZE));
|
||||||
|
int expectedPreviousBitCount = bitIndex < numLong * Long.SIZE && BitUtil.isBitSet(bits, numLong, bitIndex) ?
|
||||||
|
bitCount - 1 : bitCount;
|
||||||
|
assertEquals("Previous bit set at previousIndex=" + previousIndex
|
||||||
|
+ " with current bitCount=" + bitCount
|
||||||
|
+ " so expected previousBitCount=" + expectedPreviousBitCount
|
||||||
|
+ " (i=" + i + " bitIndex=" + bitIndex + ")",
|
||||||
|
expectedPreviousBitCount, BitUtil.countBitsUpTo(bits, numLong, previousIndex + 1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private long[] buildRandomBits() {
|
||||||
|
long[] bits = new long[random().nextInt(3) + 2];
|
||||||
|
for (int j = 0; j < bits.length; j++) {
|
||||||
|
// Bias towards zeros which require special logic.
|
||||||
|
bits[j] = random().nextInt(4) == 0 ? 0L : random().nextLong();
|
||||||
|
}
|
||||||
|
return bits;
|
||||||
|
}
|
||||||
|
}
|
|
@ -303,7 +303,7 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
public void testBigSet() throws IOException {
|
public void testBigSet() throws IOException {
|
||||||
testRandomWords(TestUtil.nextInt(random(), 50000, 60000), 1);
|
testRandomWords(TestUtil.nextInt(random(), 50000, 60000), 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Build FST for all unique terms in the test line docs
|
// Build FST for all unique terms in the test line docs
|
||||||
// file, up until a doc limit
|
// file, up until a doc limit
|
||||||
public void testRealTerms() throws Exception {
|
public void testRealTerms() throws Exception {
|
||||||
|
@ -1078,10 +1078,10 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
int children = verifyStateAndBelow(fst, new FST.Arc<>().copyFrom(arc), depth + 1);
|
int children = verifyStateAndBelow(fst, new FST.Arc<>().copyFrom(arc), depth + 1);
|
||||||
|
|
||||||
assertEquals(
|
assertEquals(
|
||||||
expanded,
|
(depth <= FST.FIXED_LENGTH_ARC_SHALLOW_DEPTH &&
|
||||||
(depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE &&
|
children >= FST.FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS) ||
|
||||||
children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) ||
|
children >= FST.FIXED_LENGTH_ARC_DEEP_NUM_ARCS,
|
||||||
children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP);
|
expanded);
|
||||||
if (arc.isLast()) break;
|
if (arc.isLast()) break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1092,8 +1092,8 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sanity check.
|
// Sanity check.
|
||||||
assertTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP);
|
assertTrue(FST.FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS < FST.FIXED_LENGTH_ARC_DEEP_NUM_ARCS);
|
||||||
assertTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0);
|
assertTrue(FST.FIXED_LENGTH_ARC_SHALLOW_DEPTH >= 0);
|
||||||
|
|
||||||
SyntheticData s = new SyntheticData();
|
SyntheticData s = new SyntheticData();
|
||||||
|
|
||||||
|
@ -1635,4 +1635,23 @@ public class TestFSTs extends LuceneTestCase {
|
||||||
// expected
|
// expected
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testSimpleDepth() throws Exception {
|
||||||
|
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||||
|
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
|
|
||||||
|
BytesRef ab = new BytesRef("ab");
|
||||||
|
BytesRef ac = new BytesRef("ac");
|
||||||
|
BytesRef bd = new BytesRef("bd");
|
||||||
|
|
||||||
|
builder.add(Util.toIntsRef(ab, new IntsRefBuilder()), 3L);
|
||||||
|
builder.add(Util.toIntsRef(ac, new IntsRefBuilder()), 5L);
|
||||||
|
builder.add(Util.toIntsRef(bd, new IntsRefBuilder()), 7L);
|
||||||
|
|
||||||
|
FST<Long> fst = builder.finish();
|
||||||
|
|
||||||
|
assertEquals(3, (long) Util.get(fst, ab));
|
||||||
|
assertEquals(5, (long) Util.get(fst, ac));
|
||||||
|
assertEquals(7, (long) Util.get(fst, bd));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,106 +0,0 @@
|
||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.util.fst;
|
|
||||||
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Paths;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Locale;
|
|
||||||
|
|
||||||
import org.apache.lucene.store.ByteArrayDataInput;
|
|
||||||
import org.apache.lucene.store.DataInput;
|
|
||||||
import org.apache.lucene.util.BytesRef;
|
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
|
||||||
|
|
||||||
|
|
||||||
public class TestFstDirect extends LuceneTestCase {
|
|
||||||
|
|
||||||
public void testDenseWithGap() throws Exception {
|
|
||||||
List<String> words = Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm");
|
|
||||||
List<BytesRef> entries = new ArrayList<>();
|
|
||||||
for (String word : words) {
|
|
||||||
entries.add(new BytesRef(word.getBytes("ascii")));
|
|
||||||
}
|
|
||||||
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(buildFST(entries));
|
|
||||||
for (BytesRef entry : entries) {
|
|
||||||
assertNotNull(entry.utf8ToString() + " not found", fstEnum.seekExact(entry));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testDeDupTails() throws Exception {
|
|
||||||
List<BytesRef> entries = new ArrayList<>();
|
|
||||||
for (int i = 0; i < 1000000; i += 4) {
|
|
||||||
byte[] b = new byte[3];
|
|
||||||
int val = i;
|
|
||||||
for (int j = b.length - 1; j >= 0; --j) {
|
|
||||||
b[j] = (byte) (val & 0xff);
|
|
||||||
val >>= 8;
|
|
||||||
}
|
|
||||||
entries.add(new BytesRef(b));
|
|
||||||
}
|
|
||||||
long size = buildFST(entries).ramBytesUsed();
|
|
||||||
// Size is 1664 when we use only list-encoding. We were previously failing to ever de-dup
|
|
||||||
// arrays-with-gaps, which led this case to blow up.
|
|
||||||
assertTrue(size < 3000);
|
|
||||||
//printf("fst size = %d bytes", size);
|
|
||||||
}
|
|
||||||
|
|
||||||
private FST<Object> buildFST(List<BytesRef> entries) throws Exception {
|
|
||||||
final Outputs<Object> outputs = NoOutputs.getSingleton();
|
|
||||||
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
|
|
||||||
BytesRef last = null;
|
|
||||||
for (BytesRef entry : entries) {
|
|
||||||
if (entry.equals(last) == false) {
|
|
||||||
b.add(Util.toIntsRef(entry, new IntsRefBuilder()), outputs.getNoOutput());
|
|
||||||
}
|
|
||||||
last = entry;
|
|
||||||
}
|
|
||||||
FST<Object> fst = b.finish();
|
|
||||||
return fst;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void printf(String format, Object ... values) {
|
|
||||||
System.out.println(String.format(Locale.ROOT, format, values));
|
|
||||||
}
|
|
||||||
|
|
||||||
private static long nsToMs(long ns) {
|
|
||||||
return ns / 1_000_000;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void main(String... args) throws Exception {
|
|
||||||
byte[] buf = Files.readAllBytes(Paths.get(args[0]));
|
|
||||||
DataInput in = new ByteArrayDataInput(buf);
|
|
||||||
FST<BytesRef> fst = new FST<>(in, ByteSequenceOutputs.getSingleton());
|
|
||||||
BytesRefFSTEnum<BytesRef> fstEnum = new BytesRefFSTEnum<>(fst);
|
|
||||||
int sparseArrayArcCount = 0, directArrayArcCount = 0, listArcCount = 0;
|
|
||||||
while(fstEnum.next() != null) {
|
|
||||||
if (fstEnum.arcs[fstEnum.upto].bytesPerArc() == 0) {
|
|
||||||
listArcCount ++;
|
|
||||||
} else if (fstEnum.arcs[fstEnum.upto].arcIdx() == Integer.MIN_VALUE) {
|
|
||||||
directArrayArcCount ++;
|
|
||||||
} else {
|
|
||||||
sparseArrayArcCount ++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
System.out.println("direct arcs = " + directArrayArcCount + ", sparse arcs = " + sparseArrayArcCount +
|
|
||||||
" list arcs = " + listArcCount);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -0,0 +1,212 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.util.fst;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.store.ByteArrayDataInput;
|
||||||
|
import org.apache.lucene.store.DataInput;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
|
public class TestFstDirectAddressing extends LuceneTestCase {
|
||||||
|
|
||||||
|
public void testDenseWithGap() throws Exception {
|
||||||
|
List<String> words = Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm");
|
||||||
|
List<BytesRef> entries = new ArrayList<>();
|
||||||
|
for (String word : words) {
|
||||||
|
entries.add(new BytesRef(word.getBytes(StandardCharsets.US_ASCII)));
|
||||||
|
}
|
||||||
|
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(buildFST(entries));
|
||||||
|
for (BytesRef entry : entries) {
|
||||||
|
assertNotNull(entry.utf8ToString() + " not found", fstEnum.seekExact(entry));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testDeDupTails() throws Exception {
|
||||||
|
List<BytesRef> entries = new ArrayList<>();
|
||||||
|
for (int i = 0; i < 1000000; i += 4) {
|
||||||
|
byte[] b = new byte[3];
|
||||||
|
int val = i;
|
||||||
|
for (int j = b.length - 1; j >= 0; --j) {
|
||||||
|
b[j] = (byte) (val & 0xff);
|
||||||
|
val >>= 8;
|
||||||
|
}
|
||||||
|
entries.add(new BytesRef(b));
|
||||||
|
}
|
||||||
|
long size = buildFST(entries).ramBytesUsed();
|
||||||
|
// Size is 1648 when we use only list-encoding. We were previously failing to ever de-dup
|
||||||
|
// direct addressing, which led this case to blow up.
|
||||||
|
assertTrue(size <= 1080);
|
||||||
|
//printf("fst size = %d bytes", size);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testWorstCaseForDirectAddressing() throws Exception {
|
||||||
|
// This test will fail if there is more than 1% memory increase with direct addressing in this worst case.
|
||||||
|
final double MEMORY_INCREASE_LIMIT_PERCENT = 1d;
|
||||||
|
final int NUM_WORDS = 1000000;
|
||||||
|
|
||||||
|
// Generate words with specially crafted bytes.
|
||||||
|
Set<BytesRef> wordSet = new HashSet<>();
|
||||||
|
for (int i = 0; i < NUM_WORDS; ++i) {
|
||||||
|
byte[] b = new byte[5];
|
||||||
|
random().nextBytes(b);
|
||||||
|
for (int j = 0; j < b.length; ++j) {
|
||||||
|
b[j] &= 0xfc; // Make this byte a multiple of 4.
|
||||||
|
}
|
||||||
|
wordSet.add(new BytesRef(b));
|
||||||
|
}
|
||||||
|
List<BytesRef> wordList = new ArrayList<>(wordSet);
|
||||||
|
Collections.sort(wordList);
|
||||||
|
|
||||||
|
// Disable direct addressing and measure the FST size.
|
||||||
|
Builder<Object> builder = createBuilder(-1f);
|
||||||
|
FST<Object> fst = buildFST(wordList, builder);
|
||||||
|
long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed();
|
||||||
|
|
||||||
|
// Enable direct addressing and measure the FST size.
|
||||||
|
builder = createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
|
||||||
|
fst = buildFST(wordList, builder);
|
||||||
|
long ramBytesUsed = fst.ramBytesUsed();
|
||||||
|
|
||||||
|
// Compute the size increase in percents.
|
||||||
|
double directAddressingMemoryIncreasePercent = ((double) ramBytesUsed / ramBytesUsedNoDirectAddressing - 1) * 100;
|
||||||
|
|
||||||
|
// printStats(builder, ramBytesUsed, directAddressingMemoryIncreasePercent);
|
||||||
|
|
||||||
|
// Verify the FST size does not exceed the limit.
|
||||||
|
assertTrue("FST size exceeds limit, size = " + ramBytesUsed
|
||||||
|
+ ", increase = " + directAddressingMemoryIncreasePercent + " %"
|
||||||
|
+ ", limit = " + MEMORY_INCREASE_LIMIT_PERCENT + " %",
|
||||||
|
directAddressingMemoryIncreasePercent < MEMORY_INCREASE_LIMIT_PERCENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void printStats(Builder<Object> builder, long ramBytesUsed, double directAddressingMemoryIncreasePercent) {
|
||||||
|
System.out.println("directAddressingMaxOversizingFactor = " + builder.getDirectAddressingMaxOversizingFactor());
|
||||||
|
System.out.println("ramBytesUsed = "
|
||||||
|
+ String.format(Locale.ENGLISH, "%.2f MB", ramBytesUsed / 1024d / 1024d)
|
||||||
|
+ String.format(Locale.ENGLISH, " (%.2f %% increase with direct addressing)", directAddressingMemoryIncreasePercent));
|
||||||
|
System.out.println("num nodes = " + builder.nodeCount);
|
||||||
|
long fixedLengthArcNodeCount = builder.directAddressingNodeCount + builder.binarySearchNodeCount;
|
||||||
|
System.out.println("num fixed-length-arc nodes = " + fixedLengthArcNodeCount
|
||||||
|
+ String.format(Locale.ENGLISH, " (%.2f %% of all nodes)",
|
||||||
|
((double) fixedLengthArcNodeCount / builder.nodeCount * 100)));
|
||||||
|
System.out.println("num binary-search nodes = " + (builder.binarySearchNodeCount)
|
||||||
|
+ String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)",
|
||||||
|
((double) (builder.binarySearchNodeCount) / fixedLengthArcNodeCount * 100)));
|
||||||
|
System.out.println("num direct-addressing nodes = " + (builder.directAddressingNodeCount)
|
||||||
|
+ String.format(Locale.ENGLISH, " (%.2f %% of fixed-length-arc nodes)",
|
||||||
|
((double) (builder.directAddressingNodeCount) / fixedLengthArcNodeCount * 100)));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Builder<Object> createBuilder(float directAddressingMaxOversizingFactor) {
|
||||||
|
return new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, NoOutputs.getSingleton(), true, 15)
|
||||||
|
.setDirectAddressingMaxOversizingFactor(directAddressingMaxOversizingFactor);
|
||||||
|
}
|
||||||
|
|
||||||
|
private FST<Object> buildFST(List<BytesRef> entries) throws Exception {
|
||||||
|
return buildFST(entries, createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static FST<Object> buildFST(List<BytesRef> entries, Builder<Object> builder) throws Exception {
|
||||||
|
BytesRef last = null;
|
||||||
|
for (BytesRef entry : entries) {
|
||||||
|
if (entry.equals(last) == false) {
|
||||||
|
builder.add(Util.toIntsRef(entry, new IntsRefBuilder()), NoOutputs.getSingleton().getNoOutput());
|
||||||
|
}
|
||||||
|
last = entry;
|
||||||
|
}
|
||||||
|
return builder.finish();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String... args) throws Exception {
|
||||||
|
if (args.length < 2) {
|
||||||
|
throw new IllegalArgumentException("Missing argument");
|
||||||
|
}
|
||||||
|
if (args[0].equals("-countFSTArcs")) {
|
||||||
|
countFSTArcs(args[1]);
|
||||||
|
} else if (args[0].equals("-measureFSTOversizing")) {
|
||||||
|
measureFSTOversizing(args[1]);
|
||||||
|
} else {
|
||||||
|
throw new IllegalArgumentException("Invalid argument " + args[0]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void countFSTArcs(String FSTFilePath) throws IOException {
|
||||||
|
byte[] buf = Files.readAllBytes(Paths.get(FSTFilePath));
|
||||||
|
DataInput in = new ByteArrayDataInput(buf);
|
||||||
|
FST<BytesRef> fst = new FST<>(in, ByteSequenceOutputs.getSingleton());
|
||||||
|
BytesRefFSTEnum<BytesRef> fstEnum = new BytesRefFSTEnum<>(fst);
|
||||||
|
int binarySearchArcCount = 0, directAddressingArcCount = 0, listArcCount = 0;
|
||||||
|
while(fstEnum.next() != null) {
|
||||||
|
if (fstEnum.arcs[fstEnum.upto].bytesPerArc() == 0) {
|
||||||
|
listArcCount ++;
|
||||||
|
} else if (fstEnum.arcs[fstEnum.upto].nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
|
||||||
|
directAddressingArcCount ++;
|
||||||
|
} else {
|
||||||
|
binarySearchArcCount ++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
System.out.println("direct addressing arcs = " + directAddressingArcCount
|
||||||
|
+ ", binary search arcs = " + binarySearchArcCount
|
||||||
|
+ " list arcs = " + listArcCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void measureFSTOversizing(String wordsFilePath) throws Exception {
|
||||||
|
final int MAX_NUM_WORDS = 1000000;
|
||||||
|
|
||||||
|
// Read real english words.
|
||||||
|
List<BytesRef> wordList = new ArrayList<>();
|
||||||
|
try (BufferedReader reader = Files.newBufferedReader(Paths.get(wordsFilePath))) {
|
||||||
|
while (wordList.size() < MAX_NUM_WORDS) {
|
||||||
|
String word = reader.readLine();
|
||||||
|
if (word == null) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
wordList.add(new BytesRef(word));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Collections.sort(wordList);
|
||||||
|
|
||||||
|
// Disable direct addressing and measure the FST size.
|
||||||
|
Builder<Object> builder = createBuilder(-1f);
|
||||||
|
FST<Object> fst = buildFST(wordList, builder);
|
||||||
|
long ramBytesUsedNoDirectAddressing = fst.ramBytesUsed();
|
||||||
|
|
||||||
|
// Enable direct addressing and measure the FST size.
|
||||||
|
builder = createBuilder(Builder.DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR);
|
||||||
|
fst = buildFST(wordList, builder);
|
||||||
|
long ramBytesUsed = fst.ramBytesUsed();
|
||||||
|
|
||||||
|
// Compute the size increase in percents.
|
||||||
|
double directAddressingMemoryIncreasePercent = ((double) ramBytesUsed / ramBytesUsedNoDirectAddressing - 1) * 100;
|
||||||
|
|
||||||
|
printStats(builder, ramBytesUsed, directAddressingMemoryIncreasePercent);
|
||||||
|
}
|
||||||
|
}
|
|
@ -27,10 +27,10 @@ import org.apache.lucene.util.LuceneTestCase;
|
||||||
public class TestUtil extends LuceneTestCase {
|
public class TestUtil extends LuceneTestCase {
|
||||||
|
|
||||||
public void testBinarySearch() throws Exception {
|
public void testBinarySearch() throws Exception {
|
||||||
// Creates a node with 8 arcs spanning (z-A) = 57 chars that will be encoded as a sparse array (no gaps)
|
// Create a node with 8 arcs spanning (z-A) and ensure it is encoded as a packed array
|
||||||
// requiring binary search
|
// requiring binary search.
|
||||||
List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T", "z");
|
List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T", "z");
|
||||||
FST<Object> fst = buildFST(letters, true);
|
FST<Object> fst = buildFST(letters, true, false);
|
||||||
FST.Arc<Object> arc = fst.getFirstArc(new FST.Arc<>());
|
FST.Arc<Object> arc = fst.getFirstArc(new FST.Arc<>());
|
||||||
arc = fst.readFirstTargetArc(arc, arc, fst.getBytesReader());
|
arc = fst.readFirstTargetArc(arc, arc, fst.getBytesReader());
|
||||||
for (int i = 0; i < letters.size(); i++) {
|
for (int i = 0; i < letters.size(); i++) {
|
||||||
|
@ -47,21 +47,21 @@ public class TestUtil extends LuceneTestCase {
|
||||||
|
|
||||||
public void testReadCeilArcPackedArray() throws Exception {
|
public void testReadCeilArcPackedArray() throws Exception {
|
||||||
List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T", "z");
|
List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T", "z");
|
||||||
verifyReadCeilArc(letters, true);
|
verifyReadCeilArc(letters, true, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testReadCeilArcArrayWithGaps() throws Exception {
|
public void testReadCeilArcArrayWithGaps() throws Exception {
|
||||||
List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T");
|
List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T");
|
||||||
verifyReadCeilArc(letters, true);
|
verifyReadCeilArc(letters, true, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testReadCeilArcList() throws Exception {
|
public void testReadCeilArcList() throws Exception {
|
||||||
List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T", "z");
|
List<String> letters = Arrays.asList("A", "E", "J", "K", "L", "O", "T", "z");
|
||||||
verifyReadCeilArc(letters, false);
|
verifyReadCeilArc(letters, false, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void verifyReadCeilArc(List<String> letters, boolean allowArrayArcs) throws Exception {
|
private void verifyReadCeilArc(List<String> letters, boolean allowArrayArcs, boolean allowDirectAddressing) throws Exception {
|
||||||
FST<Object> fst = buildFST(letters, allowArrayArcs);
|
FST<Object> fst = buildFST(letters, allowArrayArcs, allowDirectAddressing);
|
||||||
FST.Arc<Object> first = fst.getFirstArc(new FST.Arc<>());
|
FST.Arc<Object> first = fst.getFirstArc(new FST.Arc<>());
|
||||||
FST.Arc<Object> arc = new FST.Arc<>();
|
FST.Arc<Object> arc = new FST.Arc<>();
|
||||||
FST.BytesReader in = fst.getBytesReader();
|
FST.BytesReader in = fst.getBytesReader();
|
||||||
|
@ -81,9 +81,12 @@ public class TestUtil extends LuceneTestCase {
|
||||||
assertNull(Util.readCeilArc('Z', fst, arc, arc, in));
|
assertNull(Util.readCeilArc('Z', fst, arc, arc, in));
|
||||||
}
|
}
|
||||||
|
|
||||||
private FST<Object> buildFST(List<String> words, boolean allowArrayArcs) throws Exception {
|
private FST<Object> buildFST(List<String> words, boolean allowArrayArcs, boolean allowDirectAddressing) throws Exception {
|
||||||
final Outputs<Object> outputs = NoOutputs.getSingleton();
|
final Outputs<Object> outputs = NoOutputs.getSingleton();
|
||||||
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, allowArrayArcs, 15);
|
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, allowArrayArcs, 15);
|
||||||
|
if (!allowDirectAddressing) {
|
||||||
|
b.setDirectAddressingMaxOversizingFactor(-1f);
|
||||||
|
}
|
||||||
|
|
||||||
for (String word : words) {
|
for (String word : words) {
|
||||||
b.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput());
|
b.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput());
|
||||||
|
|
Loading…
Reference in New Issue