mirror of https://github.com/apache/lucene.git
Bound the RAM used by the NodeHash (sharing common suffixes) during FST compilation (#12633)
* tweak comments; change if to switch
* remove old SOPs, minor comment styling, fixed silly performance bug on rehash using the wrong bitsRequired (count vs node)
* first raw cut; some nocommits added; some tests fail
* tests pass!
* fix silly fallback hash bug
* remove SOPs; add some temporary debugging metrics
* add temporary tool to test FST performance across differing NodeHash sizes
* remove (now deleted) shouldShareNonSingletonNodes call from Lucene90BlockTreeTermsWriter
* add simple tool to render results table to GitHub MD
* add simple temporary tool to iterate all terms from a provided luceneutil wikipedia index and build an FST from them
* first cut at using packed ints for hash table again
* add some nocommits; tweak test_all_sizes.py to new RAM usage approach; when half of the double barrel is full, allocate new primary hash at full size to save cost of continuously rehashing for a large FST
* switch to limit suffix hash by RAM usage not count (more intuitive for users); clean up some stale nocommits
* switch to more intuitive approximate RAM (mb) limit for allowed size of NodeHash
* nuke a few nocommits; a few more remain
* remove DO_PRINT_HASH_RAM
* no more FST pruning
* remove final nocommit: randomly change allowed NodeHash suffix RAM size in TestFSTs.testRealTerms
* remove SOP
* tidy
* delete temp utility tools
* remove dead (FST pruning) code
* add CHANGES entry; fix one missed fst.addNode -> fstCompiler.addNode during merge conflict resolution
* remove a mal-formed nocommit
* fold PR feedback
* fold feedback
* add gradle help test details on how to specify heap size for the test JVM; fix bogus assert (uncovered by Test2BFST); add TODO to Test2BFST anticipating building massive FSTs in small bounded RAM
* suppress sysout checks for Test2BFSTs; add helpful comment showing how to run it directly
* tidy
This commit is contained in:
parent 0d8a3e6c4f
commit afb2a60751
@@ -133,6 +133,15 @@ specifying the project and test task or a fully qualified task path. Example:

gradlew -p lucene/core test -Ptests.verbose=true --tests "TestDemo"

Larger heap size
--------------------------

By default tests run with a 512 MB max heap. But some tests (monster/nightly)
need more heap. Use "-Dtests.heapsize" for this:

gradlew -p lucene/core test --tests "Test2BFST" -Dtests.heapsize=32g

Run GUI tests headlessly with Xvfb (Linux only)
-----------------------------------------------

@@ -82,6 +82,13 @@ Improvements

* GITHUB#12447: Hunspell: speed up the dictionary enumeration on suggestion (Peter Gromov)

* GITHUB#12542: FSTCompiler can now approximately limit how much RAM it uses to share
  suffixes during FST construction using the suffixRAMLimitMB method. Larger values
  result in a more minimal FST (more common suffixes are shared). Pass
  Double.POSITIVE_INFINITY to use as much RAM as is needed to create a purely
  minimal FST. Inspired by this Rust FST implementation:
  https://blog.burntsushi.net/transducers (Mike McCandless)
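
For example, a minimal usage sketch (not part of this change; the inputs and the 16 MB value are
illustrative), based on the Builder API introduced in this commit:

    import org.apache.lucene.util.IntsRefBuilder;
    import org.apache.lucene.util.fst.FST;
    import org.apache.lucene.util.fst.FSTCompiler;
    import org.apache.lucene.util.fst.PositiveIntOutputs;
    import org.apache.lucene.util.fst.Util;

    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    FSTCompiler<Long> fstCompiler =
        new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs)
            // ~16 MB suffix cache; use Double.POSITIVE_INFINITY for a fully minimal FST,
            // or 0 to disable suffix sharing entirely
            .suffixRAMLimitMB(16.0)
            .build();
    // inputs must be added in sorted order
    fstCompiler.add(Util.toUTF32("station", new IntsRefBuilder()), 17L);
    fstCompiler.add(Util.toUTF32("stations", new IntsRefBuilder()), 18L);
    FST<Long> fst = fstCompiler.compile();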

Optimizations
---------------------

@ -478,9 +478,7 @@ public final class Lucene40BlockTreeTermsWriter extends FieldsConsumer {
|
|||
|
||||
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||
final FSTCompiler<BytesRef> fstCompiler =
|
||||
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
|
||||
.shouldShareNonSingletonNodes(false)
|
||||
.build();
|
||||
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" compile index for prefix=" + prefix);
|
||||
// }
|
||||
|
|
|
@ -395,9 +395,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
|
|||
}
|
||||
|
||||
final FSTCompiler<Output> fstCompiler =
|
||||
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS)
|
||||
.shouldShareNonSingletonNodes(false)
|
||||
.build();
|
||||
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS).build();
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" compile index for prefix=" + prefix);
|
||||
// }
|
||||
|
|
|
@ -521,10 +521,7 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer {
|
|||
|
||||
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||
final FSTCompiler<BytesRef> fstCompiler =
|
||||
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
|
||||
.shouldShareNonSingletonNodes(false)
|
||||
.bytesPageBits(pageBits)
|
||||
.build();
|
||||
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).bytesPageBits(pageBits).build();
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" compile index for prefix=" + prefix);
|
||||
// }
|
||||
|
|
|
@ -83,13 +83,16 @@ public final class FST<T> implements Accountable {
|
|||
|
||||
static final int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5;
|
||||
|
||||
/** Value of the arc flags to declare a node with fixed length arcs designed for binary search. */
|
||||
/**
|
||||
* Value of the arc flags to declare a node with fixed length (sparse) arcs designed for binary
|
||||
* search.
|
||||
*/
|
||||
// We use this as a marker because this one flag is illegal by itself.
|
||||
public static final byte ARCS_FOR_BINARY_SEARCH = BIT_ARC_HAS_FINAL_OUTPUT;
|
||||
|
||||
/**
|
||||
* Value of the arc flags to declare a node with fixed length arcs and bit table designed for
|
||||
* direct addressing.
|
||||
* Value of the arc flags to declare a node with fixed length dense arcs and bit table designed
|
||||
* for direct addressing.
|
||||
*/
|
||||
static final byte ARCS_FOR_DIRECT_ADDRESSING = 1 << 6;
|
||||
|
||||
|
@ -751,11 +754,9 @@ public final class FST<T> implements Accountable {
|
|||
private void readFirstArcInfo(long nodeAddress, Arc<T> arc, final BytesReader in)
|
||||
throws IOException {
|
||||
in.setPosition(nodeAddress);
|
||||
// System.out.println(" flags=" + arc.flags);
|
||||
|
||||
byte flags = arc.nodeFlags = in.readByte();
|
||||
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) {
|
||||
// System.out.println(" fixed length arc");
|
||||
// Special arc which is actually a node header for fixed length arcs.
|
||||
arc.numArcs = in.readVInt();
|
||||
arc.bytesPerArc = in.readVInt();
|
||||
|
@ -766,8 +767,6 @@ public final class FST<T> implements Accountable {
|
|||
arc.presenceIndex = -1;
|
||||
}
|
||||
arc.posArcsStart = in.getPosition();
|
||||
// System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + "
|
||||
// arcsStart=" + pos);
|
||||
} else {
|
||||
arc.nextArc = nodeAddress;
|
||||
arc.bytesPerArc = 0;
|
||||
|
@ -830,14 +829,12 @@ public final class FST<T> implements Accountable {
|
|||
}
|
||||
}
|
||||
} else {
|
||||
if (arc.bytesPerArc() != 0) {
|
||||
// System.out.println(" nextArc real array");
|
||||
// Arcs have fixed length.
|
||||
if (arc.nodeFlags() == ARCS_FOR_BINARY_SEARCH) {
|
||||
switch (arc.nodeFlags()) {
|
||||
case ARCS_FOR_BINARY_SEARCH:
|
||||
// Point to next arc, -1 to skip arc flags.
|
||||
in.setPosition(arc.posArcsStart() - (1 + arc.arcIdx()) * (long) arc.bytesPerArc() - 1);
|
||||
} else {
|
||||
assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING;
|
||||
break;
|
||||
case ARCS_FOR_DIRECT_ADDRESSING:
|
||||
// Direct addressing node. The label is not stored but rather inferred
|
||||
// based on first label and arc index in the range.
|
||||
assert BitTable.assertIsValid(arc, in);
|
||||
|
@ -845,12 +842,14 @@ public final class FST<T> implements Accountable {
|
|||
int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in);
|
||||
assert nextIndex != -1;
|
||||
return arc.firstLabel() + nextIndex;
|
||||
}
|
||||
} else {
|
||||
// Arcs have variable length.
|
||||
// System.out.println(" nextArc real list");
|
||||
// Position to next arc, -1 to skip flags.
|
||||
in.setPosition(arc.nextArc() - 1);
|
||||
default:
|
||||
// Variable length arcs - linear search.
|
||||
assert arc.bytesPerArc() == 0;
|
||||
// Arcs have variable length.
|
||||
// System.out.println(" nextArc real list");
|
||||
// Position to next arc, -1 to skip flags.
|
||||
in.setPosition(arc.nextArc() - 1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return readLabel(in);
|
||||
|
|
|
@ -89,18 +89,6 @@ public class FSTCompiler<T> {
|
|||
|
||||
// private static final boolean DEBUG = true;
|
||||
|
||||
// simplistic pruning: we prune node (and all following
|
||||
// nodes) if less than this number of terms go through it:
|
||||
private final int minSuffixCount1;
|
||||
|
||||
// better pruning: we prune node (and all following
|
||||
// nodes) if the prior node has less than this number of
|
||||
// terms go through it:
|
||||
private final int minSuffixCount2;
|
||||
|
||||
private final boolean doShareNonSingletonNodes;
|
||||
private final int shareMaxTailLength;
|
||||
|
||||
private final IntsRefBuilder lastInput = new IntsRefBuilder();
|
||||
|
||||
// NOTE: cutting this over to ArrayList instead loses ~6%
|
||||
|
@ -135,32 +123,27 @@ public class FSTCompiler<T> {
|
|||
* Instantiates an FST/FSA builder with default settings and pruning options turned off. For more
|
||||
* tuning and tweaking, see {@link Builder}.
|
||||
*/
|
||||
// TODO: remove this? Builder API should be the only entry point?
|
||||
public FSTCompiler(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
|
||||
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15, 1f);
|
||||
this(inputType, 32.0, outputs, true, 15, 1f);
|
||||
}
|
||||
|
||||
private FSTCompiler(
|
||||
FST.INPUT_TYPE inputType,
|
||||
int minSuffixCount1,
|
||||
int minSuffixCount2,
|
||||
boolean doShareSuffix,
|
||||
boolean doShareNonSingletonNodes,
|
||||
int shareMaxTailLength,
|
||||
double suffixRAMLimitMB,
|
||||
Outputs<T> outputs,
|
||||
boolean allowFixedLengthArcs,
|
||||
int bytesPageBits,
|
||||
float directAddressingMaxOversizingFactor) {
|
||||
this.minSuffixCount1 = minSuffixCount1;
|
||||
this.minSuffixCount2 = minSuffixCount2;
|
||||
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
|
||||
this.shareMaxTailLength = shareMaxTailLength;
|
||||
this.allowFixedLengthArcs = allowFixedLengthArcs;
|
||||
this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor;
|
||||
fst = new FST<>(inputType, outputs, bytesPageBits);
|
||||
bytes = fst.bytes;
|
||||
assert bytes != null;
|
||||
if (doShareSuffix) {
|
||||
dedupHash = new NodeHash<>(fst, bytes.getReverseReader(false));
|
||||
if (suffixRAMLimitMB < 0) {
|
||||
throw new IllegalArgumentException("ramLimitMB must be >= 0; got: " + suffixRAMLimitMB);
|
||||
} else if (suffixRAMLimitMB > 0) {
|
||||
dedupHash = new NodeHash<>(fst, suffixRAMLimitMB, bytes.getReverseReader(false));
|
||||
} else {
|
||||
dedupHash = null;
|
||||
}
|
||||
|
@ -184,11 +167,7 @@ public class FSTCompiler<T> {
|
|||
|
||||
private final INPUT_TYPE inputType;
|
||||
private final Outputs<T> outputs;
|
||||
private int minSuffixCount1;
|
||||
private int minSuffixCount2;
|
||||
private boolean shouldShareSuffix = true;
|
||||
private boolean shouldShareNonSingletonNodes = true;
|
||||
private int shareMaxTailLength = Integer.MAX_VALUE;
|
||||
private double suffixRAMLimitMB = 32.0;
|
||||
private boolean allowFixedLengthArcs = true;
|
||||
private int bytesPageBits = 15;
|
||||
private float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR;
|
||||
|
@ -207,59 +186,26 @@ public class FSTCompiler<T> {
|
|||
}
|
||||
|
||||
/**
|
||||
* If pruning the input graph during construction, this threshold is used for telling if a node
|
||||
* is kept or pruned. If transition_count(node) >= minSuffixCount1, the node is kept.
|
||||
* The approximate maximum amount of RAM (in MB) to use holding the suffix cache, which enables
|
||||
* the FST to share common suffixes. Pass {@link Double#POSITIVE_INFINITY} to keep all suffixes
|
||||
* and create an exactly minimal FST. In this case, the amount of RAM actually used will be
|
||||
* bounded by the number of unique suffixes. If you pass a value smaller than the builder would
|
||||
* use, the least recently used suffixes will be discarded, thus reducing suffix sharing and
|
||||
* creating a non-minimal FST. In this case, the larger the limit, the closer the FST will be to
|
||||
* its true minimal size, with diminishing returns as you increase the limit. Pass {@code 0} to
|
||||
* disable suffix sharing entirely, but note that the resulting FST can be substantially larger
|
||||
* than the minimal FST.
|
||||
*
|
||||
* <p>Default = 0.
|
||||
*/
|
||||
public Builder<T> minSuffixCount1(int minSuffixCount1) {
|
||||
this.minSuffixCount1 = minSuffixCount1;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Better pruning: we prune node (and all following nodes) if the prior node has less than this
|
||||
* number of terms go through it.
|
||||
* <p>Note that this is not a precise limit. The current implementation uses hash tables to map
|
||||
* the suffixes, and approximates the rough overhead (unused slots) in the hash table.
|
||||
*
|
||||
* <p>Default = 0.
|
||||
* <p>Default = {@code 32.0} MB.
|
||||
*/
|
||||
public Builder<T> minSuffixCount2(int minSuffixCount2) {
|
||||
this.minSuffixCount2 = minSuffixCount2;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* If {@code true}, the shared suffixes will be compacted into unique paths. This requires an
|
||||
* additional RAM-intensive hash map for lookups in memory. Setting this parameter to {@code
|
||||
* false} creates a single suffix path for all input sequences. This will result in a larger
|
||||
* FST, but requires substantially less memory and CPU during building.
|
||||
*
|
||||
* <p>Default = {@code true}.
|
||||
*/
|
||||
public Builder<T> shouldShareSuffix(boolean shouldShareSuffix) {
|
||||
this.shouldShareSuffix = shouldShareSuffix;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Only used if {@code shouldShareSuffix} is true. Set this to true to ensure FST is fully
|
||||
* minimal, at cost of more CPU and more RAM during building.
|
||||
*
|
||||
* <p>Default = {@code true}.
|
||||
*/
|
||||
public Builder<T> shouldShareNonSingletonNodes(boolean shouldShareNonSingletonNodes) {
|
||||
this.shouldShareNonSingletonNodes = shouldShareNonSingletonNodes;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Only used if {@code shouldShareSuffix} is true. Set this to Integer.MAX_VALUE to ensure FST
|
||||
* is fully minimal, at cost of more CPU and more RAM during building.
|
||||
*
|
||||
* <p>Default = {@link Integer#MAX_VALUE}.
|
||||
*/
|
||||
public Builder<T> shareMaxTailLength(int shareMaxTailLength) {
|
||||
this.shareMaxTailLength = shareMaxTailLength;
|
||||
public Builder<T> suffixRAMLimitMB(double mb) {
|
||||
if (mb < 0) {
|
||||
throw new IllegalArgumentException("suffixRAMLimitMB must be >= 0; got: " + mb);
|
||||
}
|
||||
this.suffixRAMLimitMB = mb;
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -309,11 +255,7 @@ public class FSTCompiler<T> {
|
|||
FSTCompiler<T> fstCompiler =
|
||||
new FSTCompiler<>(
|
||||
inputType,
|
||||
minSuffixCount1,
|
||||
minSuffixCount2,
|
||||
shouldShareSuffix,
|
||||
shouldShareNonSingletonNodes,
|
||||
shareMaxTailLength,
|
||||
suffixRAMLimitMB,
|
||||
outputs,
|
||||
allowFixedLengthArcs,
|
||||
bytesPageBits,
|
||||
|
@ -346,9 +288,7 @@ public class FSTCompiler<T> {
|
|||
private CompiledNode compileNode(UnCompiledNode<T> nodeIn, int tailLength) throws IOException {
|
||||
final long node;
|
||||
long bytesPosStart = bytes.getPosition();
|
||||
if (dedupHash != null
|
||||
&& (doShareNonSingletonNodes || nodeIn.numArcs <= 1)
|
||||
&& tailLength <= shareMaxTailLength) {
|
||||
if (dedupHash != null) {
|
||||
if (nodeIn.numArcs == 0) {
|
||||
node = addNode(nodeIn);
|
||||
lastFrozenNode = node;
|
||||
|
@ -739,113 +679,36 @@ public class FSTCompiler<T> {
|
|||
}
|
||||
|
||||
private void freezeTail(int prefixLenPlus1) throws IOException {
|
||||
// System.out.println(" compileTail " + prefixLenPlus1);
|
||||
final int downTo = Math.max(1, prefixLenPlus1);
|
||||
for (int idx = lastInput.length(); idx >= downTo; idx--) {
|
||||
|
||||
boolean doPrune = false;
|
||||
boolean doCompile = false;
|
||||
final int downTo = Math.max(1, prefixLenPlus1);
|
||||
|
||||
for (int idx = lastInput.length(); idx >= downTo; idx--) {
|
||||
|
||||
final UnCompiledNode<T> node = frontier[idx];
|
||||
final UnCompiledNode<T> parent = frontier[idx - 1];
|
||||
|
||||
if (node.inputCount < minSuffixCount1) {
|
||||
doPrune = true;
|
||||
doCompile = true;
|
||||
} else if (idx > prefixLenPlus1) {
|
||||
// prune if parent's inputCount is less than suffixMinCount2
|
||||
if (parent.inputCount < minSuffixCount2
|
||||
|| (minSuffixCount2 == 1 && parent.inputCount == 1 && idx > 1)) {
|
||||
// my parent, about to be compiled, doesn't make the cut, so
|
||||
// I'm definitely pruned
|
||||
final T nextFinalOutput = node.output;
|
||||
|
||||
// if minSuffixCount2 is 1, we keep only up
|
||||
// until the 'distinguished edge', ie we keep only the
|
||||
// 'divergent' part of the FST. if my parent, about to be
|
||||
// compiled, has inputCount 1 then we are already past the
|
||||
// distinguished edge. NOTE: this only works if
|
||||
// the FST outputs are not "compressible" (simple
|
||||
// ords ARE compressible).
|
||||
doPrune = true;
|
||||
} else {
|
||||
// my parent, about to be compiled, does make the cut, so
|
||||
// I'm definitely not pruned
|
||||
doPrune = false;
|
||||
}
|
||||
doCompile = true;
|
||||
} else {
|
||||
// if pruning is disabled (count is 0) we can always
|
||||
// compile current node
|
||||
doCompile = minSuffixCount2 == 0;
|
||||
}
|
||||
// We "fake" the node as being final if it has no
|
||||
// outgoing arcs; in theory we could leave it
|
||||
// as non-final (the FST can represent this), but
|
||||
// FSTEnum, Util, etc., have trouble w/ non-final
|
||||
// dead-end states:
|
||||
|
||||
// System.out.println(" label=" + ((char) lastInput.ints[lastInput.offset+idx-1]) + " idx="
|
||||
// + idx + " inputCount=" + frontier[idx].inputCount + " doCompile=" + doCompile + " doPrune="
|
||||
// + doPrune);
|
||||
// TODO: is node.numArcs == 0 always false? we no longer prune any nodes from FST:
|
||||
final boolean isFinal = node.isFinal || node.numArcs == 0;
|
||||
|
||||
if (node.inputCount < minSuffixCount2
|
||||
|| (minSuffixCount2 == 1 && node.inputCount == 1 && idx > 1)) {
|
||||
// drop all arcs
|
||||
for (int arcIdx = 0; arcIdx < node.numArcs; arcIdx++) {
|
||||
@SuppressWarnings({"rawtypes", "unchecked"})
|
||||
final UnCompiledNode<T> target = (UnCompiledNode<T>) node.arcs[arcIdx].target;
|
||||
target.clear();
|
||||
}
|
||||
node.numArcs = 0;
|
||||
}
|
||||
|
||||
if (doPrune) {
|
||||
// this node doesn't make it -- deref it
|
||||
node.clear();
|
||||
parent.deleteLast(lastInput.intAt(idx - 1), node);
|
||||
} else {
|
||||
|
||||
if (minSuffixCount2 != 0) {
|
||||
compileAllTargets(node, lastInput.length() - idx);
|
||||
}
|
||||
final T nextFinalOutput = node.output;
|
||||
|
||||
// We "fake" the node as being final if it has no
|
||||
// outgoing arcs; in theory we could leave it
|
||||
// as non-final (the FST can represent this), but
|
||||
// FSTEnum, Util, etc., have trouble w/ non-final
|
||||
// dead-end states:
|
||||
final boolean isFinal = node.isFinal || node.numArcs == 0;
|
||||
|
||||
if (doCompile) {
|
||||
// this node makes it and we now compile it. first,
|
||||
// compile any targets that were previously
|
||||
// undecided:
|
||||
parent.replaceLast(
|
||||
lastInput.intAt(idx - 1),
|
||||
compileNode(node, 1 + lastInput.length() - idx),
|
||||
nextFinalOutput,
|
||||
isFinal);
|
||||
} else {
|
||||
// replaceLast just to install
|
||||
// nextFinalOutput/isFinal onto the arc
|
||||
parent.replaceLast(lastInput.intAt(idx - 1), node, nextFinalOutput, isFinal);
|
||||
// this node will stay in play for now, since we are
|
||||
// undecided on whether to prune it. later, it
|
||||
// will be either compiled or pruned, so we must
|
||||
// allocate a new node:
|
||||
frontier[idx] = new UnCompiledNode<>(this, idx);
|
||||
}
|
||||
}
|
||||
// this node makes it and we now compile it. first,
|
||||
// compile any targets that were previously
|
||||
// undecided:
|
||||
parent.replaceLast(
|
||||
lastInput.intAt(idx - 1),
|
||||
compileNode(node, 1 + lastInput.length() - idx),
|
||||
nextFinalOutput,
|
||||
isFinal);
|
||||
}
|
||||
}
|
||||
|
||||
// for debugging
|
||||
/*
|
||||
private String toString(BytesRef b) {
|
||||
try {
|
||||
return b.utf8ToString() + " " + b;
|
||||
} catch (Throwable t) {
|
||||
return b.toString();
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Add the next input/output pair. The provided input must be sorted after the previous one
|
||||
* according to {@link IntsRef#compareTo}. It's also OK to add the same input twice in a row with
|
||||
|
@ -987,20 +850,12 @@ public class FSTCompiler<T> {
|
|||
|
||||
// minimize nodes in the last word's suffix
|
||||
freezeTail(0);
|
||||
if (root.inputCount < minSuffixCount1
|
||||
|| root.inputCount < minSuffixCount2
|
||||
|| root.numArcs == 0) {
|
||||
if (root.numArcs == 0) {
|
||||
if (fst.emptyOutput == null) {
|
||||
return null;
|
||||
} else if (minSuffixCount1 > 0 || minSuffixCount2 > 0) {
|
||||
// empty string got pruned
|
||||
return null;
|
||||
}
|
||||
} else {
|
||||
if (minSuffixCount2 != 0) {
|
||||
compileAllTargets(root, lastInput.length());
|
||||
}
|
||||
}
|
||||
|
||||
// if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + "
|
||||
// root.output=" + root.output);
|
||||
fst.finish(compileNode(root, lastInput.length()).node);
|
||||
|
@ -1008,22 +863,6 @@ public class FSTCompiler<T> {
|
|||
return fst;
|
||||
}
|
||||
|
||||
private void compileAllTargets(UnCompiledNode<T> node, int tailLength) throws IOException {
|
||||
for (int arcIdx = 0; arcIdx < node.numArcs; arcIdx++) {
|
||||
final Arc<T> arc = node.arcs[arcIdx];
|
||||
if (!arc.target.isCompiled()) {
|
||||
// not yet compiled
|
||||
@SuppressWarnings({"rawtypes", "unchecked"})
|
||||
final UnCompiledNode<T> n = (UnCompiledNode<T>) arc.target;
|
||||
if (n.numArcs == 0) {
|
||||
// System.out.println("seg=" + segment + " FORCE final arc=" + (char) arc.label);
|
||||
arc.isFinal = n.isFinal = true;
|
||||
}
|
||||
arc.target = compileNode(n, tailLength - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Expert: holds a pending (seen but not yet serialized) arc. */
|
||||
static class Arc<T> {
|
||||
int label; // really an "unsigned" byte
|
||||
|
@ -1065,6 +904,9 @@ public class FSTCompiler<T> {
|
|||
// code here...
|
||||
T output;
|
||||
boolean isFinal;
|
||||
|
||||
// TODO: remove this tracking? we used to use it for confusingly pruning NodeHash, but
|
||||
// we switched to LRU by RAM usage instead:
|
||||
long inputCount;
|
||||
|
||||
/** This node's depth, starting from the automaton root. */
|
||||
|
|
|
@ -20,76 +20,160 @@ import java.io.IOException;
|
|||
import org.apache.lucene.util.packed.PackedInts;
|
||||
import org.apache.lucene.util.packed.PagedGrowableWriter;
|
||||
|
||||
// TODO: any way to make a reverse suffix lookup (msokolov's idea) instead of more costly hash?
|
||||
// hmmm, though, hash is not so wasteful
|
||||
// since it does not have to store value of each entry: the value is the node pointer in the FST.
|
||||
// actually, there is much to save
|
||||
// there -- we would not need any long per entry -- we'd be able to start at the FST end node and
|
||||
// work backwards from the transitions
|
||||
|
||||
// TODO: couldn't we prune naturally back until we see a transition with an output? it's highly
|
||||
// unlikely (mostly impossible) such suffixes can be shared?
|
||||
|
||||
// Used to dedup states (lookup already-frozen states)
|
||||
final class NodeHash<T> {
|
||||
|
||||
private PagedGrowableWriter table;
|
||||
private long count;
|
||||
private long mask;
|
||||
// primary table -- we add nodes into this until it uses up its share of the RAM limit, then
// we move it to fallback
|
||||
private PagedGrowableHash primaryTable;
|
||||
|
||||
// the most bytes the primary and fallback tables together are allowed to use; when the primary
// table fills its share, we move it to the fallback table
|
||||
private final long ramLimitBytes;
|
||||
|
||||
// fallback table. if we fallback and find the frozen node here, we promote it to primary table,
|
||||
// for a simplistic and lowish-RAM-overhead
|
||||
// (compared to e.g. LinkedHashMap) LRU behaviour. fallbackTable is read-only.
|
||||
private PagedGrowableHash fallbackTable;
|
||||
|
||||
private final FST<T> fst;
|
||||
private final FST.Arc<T> scratchArc = new FST.Arc<>();
|
||||
private final FST.BytesReader in;
|
||||
|
||||
public NodeHash(FST<T> fst, FST.BytesReader in) {
|
||||
table = new PagedGrowableWriter(16, 1 << 27, 8, PackedInts.COMPACT);
|
||||
mask = 15;
|
||||
/**
|
||||
* ramLimitMB is the max RAM we can use for recording suffixes. If we hit this limit, the least
|
||||
* recently used suffixes are discarded, and the FST is no longer minimal. Still, larger
|
||||
* ramLimitMB will make the FST smaller (closer to minimal).
|
||||
*/
|
||||
public NodeHash(FST<T> fst, double ramLimitMB, FST.BytesReader in) {
|
||||
if (ramLimitMB <= 0) {
|
||||
throw new IllegalArgumentException("ramLimitMB must be > 0; got: " + ramLimitMB);
|
||||
}
|
||||
double asBytes = ramLimitMB * 1024 * 1024;
|
||||
if (asBytes >= Long.MAX_VALUE) {
|
||||
// quietly truncate to Long.MAX_VALUE in bytes too
|
||||
ramLimitBytes = Long.MAX_VALUE;
|
||||
} else {
|
||||
ramLimitBytes = (long) asBytes;
|
||||
}
|
||||
|
||||
primaryTable = new PagedGrowableHash();
|
||||
this.fst = fst;
|
||||
this.in = in;
|
||||
}
|
||||
|
||||
private boolean nodesEqual(FSTCompiler.UnCompiledNode<T> node, long address) throws IOException {
|
||||
fst.readFirstRealTargetArc(address, scratchArc, in);
|
||||
|
||||
// Fail fast for a node with fixed length arcs.
|
||||
if (scratchArc.bytesPerArc() != 0) {
|
||||
if (scratchArc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
|
||||
if (node.numArcs != scratchArc.numArcs()) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
assert scratchArc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING;
|
||||
if ((node.arcs[node.numArcs - 1].label - node.arcs[0].label + 1) != scratchArc.numArcs()
|
||||
|| node.numArcs != FST.Arc.BitTable.countBits(scratchArc, in)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
private long getFallback(FSTCompiler.UnCompiledNode<T> nodeIn, long hash) throws IOException {
|
||||
if (fallbackTable == null) {
|
||||
// no fallback yet (primary table is not yet large enough to swap)
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (int arcUpto = 0; arcUpto < node.numArcs; arcUpto++) {
|
||||
final FSTCompiler.Arc<T> arc = node.arcs[arcUpto];
|
||||
if (arc.label != scratchArc.label()
|
||||
|| !arc.output.equals(scratchArc.output())
|
||||
|| ((FSTCompiler.CompiledNode) arc.target).node != scratchArc.target()
|
||||
|| !arc.nextFinalOutput.equals(scratchArc.nextFinalOutput())
|
||||
|| arc.isFinal != scratchArc.isFinal()) {
|
||||
return false;
|
||||
long pos = hash & fallbackTable.mask;
|
||||
int c = 0;
|
||||
while (true) {
|
||||
long node = fallbackTable.get(pos);
|
||||
if (node == 0) {
|
||||
// not found
|
||||
return 0;
|
||||
} else if (nodesEqual(nodeIn, node)) {
|
||||
// frozen version of this node is already here
|
||||
return node;
|
||||
}
|
||||
|
||||
if (scratchArc.isLast()) {
|
||||
if (arcUpto == node.numArcs - 1) {
|
||||
return true;
|
||||
// quadratic probe (but is it, really?)
|
||||
pos = (pos + (++c)) & fallbackTable.mask;
|
||||
}
|
||||
}
|
||||
|
||||
public long add(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn)
|
||||
throws IOException {
|
||||
|
||||
long hash = hash(nodeIn);
|
||||
|
||||
long pos = hash & primaryTable.mask;
|
||||
int c = 0;
|
||||
|
||||
while (true) {
|
||||
|
||||
long node = primaryTable.get(pos);
|
||||
if (node == 0) {
|
||||
// node is not in primary table; is it in fallback table?
|
||||
node = getFallback(nodeIn, hash);
|
||||
if (node != 0) {
|
||||
// it was already in fallback -- promote to primary
|
||||
primaryTable.set(pos, node);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
fst.readNextRealArc(scratchArc, in);
|
||||
}
|
||||
// not in fallback either -- freeze & add the incoming node
|
||||
|
||||
return false;
|
||||
// freeze & add
|
||||
node = fstCompiler.addNode(nodeIn);
|
||||
|
||||
// we use 0 as empty marker in hash table, so it better be impossible to get a frozen node
|
||||
// at 0:
|
||||
assert node != 0;
|
||||
|
||||
// confirm frozen hash and unfrozen hash are the same
|
||||
assert hash(node) == hash : "mismatch frozenHash=" + hash(node) + " vs hash=" + hash;
|
||||
|
||||
primaryTable.set(pos, node);
|
||||
}
|
||||
|
||||
// how many bytes would be used if we had "perfect" hashing:
|
||||
long ramBytesUsed = primaryTable.count * PackedInts.bitsRequired(node) / 8;
|
||||
|
||||
// NOTE: we could instead use the more precise RAM used, but this leads to unpredictable
|
||||
// quantized behavior due to 2X rehashing where for large ranges of the RAM limit, the
|
||||
// size of the FST does not change, and then suddenly when you cross a secret threshold,
|
||||
// it drops. With this approach (measuring "perfect" hash storage and approximating the
|
||||
// overhead), the behaviour is more strictly monotonic: larger RAM limits smoothly result
|
||||
// in smaller FSTs, even if the precise RAM used is not always under the limit.
|
||||
|
||||
// divide limit by 2 because fallback gets half the RAM and primary gets the other half
|
||||
// divide by 2 again to account for approximate hash table overhead halfway between 33.3%
|
||||
// and 66.7% occupancy = 50%
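// Worked example (illustrative): with the default 32 MB limit, ramLimitBytes is
// 32 * 1024 * 1024 = 33,554,432, so the primary table is moved to fallback once its
// "perfect" hash storage estimate reaches 33,554,432 / 4 = 8,388,608 bytes.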
|
||||
if (ramBytesUsed >= ramLimitBytes / (2 * 2)) {
|
||||
// time to fallback -- fallback is now used read-only to promote a node (suffix) to
|
||||
// primary if we encounter it again
|
||||
fallbackTable = primaryTable;
|
||||
// size primary table the same size to reduce rehash cost
|
||||
// TODO: we could clear & reuse the previous fallbackTable, instead of allocating a new
|
||||
// to reduce GC load
|
||||
primaryTable = new PagedGrowableHash(node, Math.max(16, primaryTable.entries.size()));
|
||||
} else if (primaryTable.count > primaryTable.entries.size() * (2f / 3)) {
|
||||
// rehash at 2/3 occupancy
|
||||
primaryTable.rehash(node);
|
||||
}
|
||||
|
||||
return node;
|
||||
|
||||
} else if (nodesEqual(nodeIn, node)) {
|
||||
// same node (in frozen form) is already in primary table
|
||||
return node;
|
||||
}
|
||||
|
||||
// quadratic probe (but is it, really?)
|
||||
pos = (pos + (++c)) & primaryTable.mask;
|
||||
}
|
||||
}
|
||||
|
||||
// hash code for an unfrozen node. This must be identical
|
||||
// to the frozen case (below)!!
|
||||
private long hash(FSTCompiler.UnCompiledNode<T> node) {
|
||||
final int PRIME = 31;
|
||||
// System.out.println("hash unfrozen");
|
||||
long h = 0;
|
||||
// TODO: maybe if number of arcs is high we can safely subsample?
|
||||
for (int arcIdx = 0; arcIdx < node.numArcs; arcIdx++) {
|
||||
final FSTCompiler.Arc<T> arc = node.arcs[arcIdx];
|
||||
// System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode)
|
||||
// arc.target).node + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + "
|
||||
// isFinal?=" + arc.isFinal);
|
||||
h = PRIME * h + arc.label;
|
||||
long n = ((FSTCompiler.CompiledNode) arc.target).node;
|
||||
h = PRIME * h + (int) (n ^ (n >> 32));
|
||||
|
@ -99,20 +183,18 @@ final class NodeHash<T> {
|
|||
h += 17;
|
||||
}
|
||||
}
|
||||
// System.out.println(" ret " + (h&Integer.MAX_VALUE));
|
||||
return h & Long.MAX_VALUE;
|
||||
|
||||
return h;
|
||||
}
|
||||
|
||||
// hash code for a frozen node
|
||||
// hash code for a frozen node. this must precisely match the hash computation of an unfrozen
|
||||
// node!
|
||||
private long hash(long node) throws IOException {
|
||||
final int PRIME = 31;
|
||||
// System.out.println("hash frozen node=" + node);
|
||||
|
||||
long h = 0;
|
||||
fst.readFirstRealTargetArc(node, scratchArc, in);
|
||||
while (true) {
|
||||
// System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" +
|
||||
// h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" +
|
||||
// scratchArc.flag(4) + " final?=" + scratchArc.isFinal() + " pos=" + in.getPosition());
|
||||
h = PRIME * h + scratchArc.label();
|
||||
h = PRIME * h + (int) (scratchArc.target() ^ (scratchArc.target() >> 32));
|
||||
h = PRIME * h + scratchArc.output().hashCode();
|
||||
|
@ -125,70 +207,129 @@ final class NodeHash<T> {
|
|||
}
|
||||
fst.readNextRealArc(scratchArc, in);
|
||||
}
|
||||
// System.out.println(" ret " + (h&Integer.MAX_VALUE));
|
||||
return h & Long.MAX_VALUE;
|
||||
|
||||
return h;
|
||||
}
|
||||
|
||||
public long add(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn)
|
||||
throws IOException {
|
||||
// System.out.println("hash: add count=" + count + " vs " + table.size() + " mask=" + mask);
|
||||
final long h = hash(nodeIn);
|
||||
long pos = h & mask;
|
||||
int c = 0;
|
||||
while (true) {
|
||||
final long v = table.get(pos);
|
||||
if (v == 0) {
|
||||
// freeze & add
|
||||
final long node = fstCompiler.addNode(nodeIn);
|
||||
// System.out.println(" now freeze node=" + node);
|
||||
assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h;
|
||||
count++;
|
||||
table.set(pos, node);
|
||||
// Rehash at 2/3 occupancy:
|
||||
if (count > 2 * table.size() / 3) {
|
||||
rehash(node);
|
||||
/**
|
||||
* Compares an unfrozen node (UnCompiledNode) with a frozen node at byte location address (long),
|
||||
* returning true if they are equal.
|
||||
*/
|
||||
private boolean nodesEqual(FSTCompiler.UnCompiledNode<T> node, long address) throws IOException {
|
||||
fst.readFirstRealTargetArc(address, scratchArc, in);
|
||||
|
||||
// fail fast for a node with fixed length arcs
|
||||
if (scratchArc.bytesPerArc() != 0) {
|
||||
assert node.numArcs > 0;
|
||||
// the frozen node uses fixed-width arc encoding (same number of bytes per arc), but may be
|
||||
// sparse or dense
|
||||
switch (scratchArc.nodeFlags()) {
|
||||
case FST.ARCS_FOR_BINARY_SEARCH:
|
||||
// sparse
|
||||
if (node.numArcs != scratchArc.numArcs()) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case FST.ARCS_FOR_DIRECT_ADDRESSING:
|
||||
// dense -- compare both the number of labels allocated in the array (some of which may
|
||||
// not actually be arcs), and the number of arcs
|
||||
if ((node.arcs[node.numArcs - 1].label - node.arcs[0].label + 1) != scratchArc.numArcs()
|
||||
|| node.numArcs != FST.Arc.BitTable.countBits(scratchArc, in)) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw new AssertionError("unhandled scratchArc.nodeFlag() " + scratchArc.nodeFlags());
|
||||
}
|
||||
}
|
||||
|
||||
// compare arc by arc to see if there is a difference
|
||||
for (int arcUpto = 0; arcUpto < node.numArcs; arcUpto++) {
|
||||
final FSTCompiler.Arc<T> arc = node.arcs[arcUpto];
|
||||
if (arc.label != scratchArc.label()
|
||||
|| arc.output.equals(scratchArc.output()) == false
|
||||
|| ((FSTCompiler.CompiledNode) arc.target).node != scratchArc.target()
|
||||
|| arc.nextFinalOutput.equals(scratchArc.nextFinalOutput()) == false
|
||||
|| arc.isFinal != scratchArc.isFinal()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (scratchArc.isLast()) {
|
||||
if (arcUpto == node.numArcs - 1) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
return node;
|
||||
} else if (nodesEqual(nodeIn, v)) {
|
||||
// same node is already here
|
||||
return v;
|
||||
}
|
||||
|
||||
// quadratic probe
|
||||
pos = (pos + (++c)) & mask;
|
||||
fst.readNextRealArc(scratchArc, in);
|
||||
}
|
||||
|
||||
// unfrozen node has fewer arcs than frozen node
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// called only by rehash
|
||||
private void addNew(long address) throws IOException {
|
||||
long pos = hash(address) & mask;
|
||||
int c = 0;
|
||||
while (true) {
|
||||
if (table.get(pos) == 0) {
|
||||
table.set(pos, address);
|
||||
break;
|
||||
}
|
||||
/** Inner class because it needs access to hash function and FST bytes. */
|
||||
private class PagedGrowableHash {
|
||||
private PagedGrowableWriter entries;
|
||||
private long count;
|
||||
private long mask;
|
||||
|
||||
// quadratic probe
|
||||
pos = (pos + (++c)) & mask;
|
||||
// 256K blocks, but note that the final block is sized only as needed so it won't use the full
|
||||
// block size when just a few elements were written to it
|
||||
private static final int BLOCK_SIZE_BYTES = 1 << 18;
|
||||
|
||||
public PagedGrowableHash() {
|
||||
entries = new PagedGrowableWriter(16, BLOCK_SIZE_BYTES, 8, PackedInts.COMPACT);
|
||||
mask = 15;
|
||||
}
|
||||
}
|
||||
|
||||
private void rehash(long lastNodeAddress) throws IOException {
|
||||
final PagedGrowableWriter oldTable = table;
|
||||
public PagedGrowableHash(long lastNodeAddress, long size) {
|
||||
entries =
|
||||
new PagedGrowableWriter(
|
||||
size, BLOCK_SIZE_BYTES, PackedInts.bitsRequired(lastNodeAddress), PackedInts.COMPACT);
|
||||
mask = size - 1;
|
||||
assert (mask & size) == 0 : "size must be a power-of-2; got size=" + size + " mask=" + mask;
|
||||
}
|
||||
|
||||
table =
|
||||
new PagedGrowableWriter(
|
||||
2 * oldTable.size(),
|
||||
1 << 27,
|
||||
PackedInts.bitsRequired(lastNodeAddress),
|
||||
PackedInts.COMPACT);
|
||||
mask = table.size() - 1;
|
||||
for (long idx = 0; idx < oldTable.size(); idx++) {
|
||||
final long address = oldTable.get(idx);
|
||||
if (address != 0) {
|
||||
addNew(address);
|
||||
public long get(long index) {
|
||||
return entries.get(index);
|
||||
}
|
||||
|
||||
public void set(long index, long pointer) throws IOException {
|
||||
entries.set(index, pointer);
|
||||
count++;
|
||||
}
|
||||
|
||||
private void rehash(long lastNodeAddress) throws IOException {
|
||||
// double hash table size on each rehash
|
||||
PagedGrowableWriter newEntries =
|
||||
new PagedGrowableWriter(
|
||||
2 * entries.size(),
|
||||
BLOCK_SIZE_BYTES,
|
||||
PackedInts.bitsRequired(lastNodeAddress),
|
||||
PackedInts.COMPACT);
|
||||
long newMask = newEntries.size() - 1;
|
||||
for (long idx = 0; idx < entries.size(); idx++) {
|
||||
long address = entries.get(idx);
|
||||
if (address != 0) {
|
||||
long pos = hash(address) & newMask;
|
||||
int c = 0;
|
||||
while (true) {
|
||||
if (newEntries.get(pos) == 0) {
|
||||
newEntries.set(pos, address);
|
||||
break;
|
||||
}
|
||||
|
||||
// quadratic probe
|
||||
pos = (pos + (++c)) & newMask;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mask = newMask;
|
||||
entries = newEntries;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -25,12 +25,20 @@ import org.apache.lucene.store.IndexInput;
|
|||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.MMapDirectory;
|
||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||
import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks;
|
||||
import org.apache.lucene.tests.util.TimeUnits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.junit.Ignore;
|
||||
|
||||
// TODO: soon we will be able to run this test with small heaps! Once #12633 and #12543
|
||||
// are done
|
||||
//
|
||||
// Run something like this:
|
||||
// ./gradlew test --tests Test2BFST -Dtests.heapsize=32g -Dtests.verbose=true --max-workers=1
|
||||
|
||||
@Ignore("Requires tons of heap to run (30 GB hits OOME but 35 GB passes after ~4.5 hours)")
|
||||
@SuppressSysoutChecks(bugUrl = "test prints helpful progress reports with time")
|
||||
@TimeoutSuite(millis = 100 * TimeUnits.HOUR)
|
||||
public class Test2BFST extends LuceneTestCase {
|
||||
|
||||
|
|
|
@ -82,6 +82,7 @@ import org.apache.lucene.util.fst.FST.Arc;
|
|||
import org.apache.lucene.util.fst.FST.BytesReader;
|
||||
import org.apache.lucene.util.fst.PairOutputs.Pair;
|
||||
import org.apache.lucene.util.fst.Util.Result;
|
||||
import org.junit.Ignore;
|
||||
|
||||
@SuppressCodecs({"SimpleText", "Direct"})
|
||||
public class TestFSTs extends LuceneTestCase {
|
||||
|
@ -141,7 +142,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
pairs.add(new FSTTester.InputOutput<>(term, NO_OUTPUT));
|
||||
}
|
||||
FSTTester<Object> tester = new FSTTester<>(random(), dir, inputMode, pairs, outputs);
|
||||
FST<Object> fst = tester.doTest(0, 0, false);
|
||||
FST<Object> fst = tester.doTest();
|
||||
assertNotNull(fst);
|
||||
assertEquals(22, tester.nodeCount);
|
||||
assertEquals(27, tester.arcCount);
|
||||
|
@ -155,7 +156,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
pairs.add(new FSTTester.InputOutput<>(terms2[idx], (long) idx));
|
||||
}
|
||||
FSTTester<Long> tester = new FSTTester<>(random(), dir, inputMode, pairs, outputs);
|
||||
final FST<Long> fst = tester.doTest(0, 0, false);
|
||||
final FST<Long> fst = tester.doTest();
|
||||
assertNotNull(fst);
|
||||
assertEquals(22, tester.nodeCount);
|
||||
assertEquals(27, tester.arcCount);
|
||||
|
@ -170,7 +171,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
pairs.add(new FSTTester.InputOutput<>(terms2[idx], output));
|
||||
}
|
||||
FSTTester<BytesRef> tester = new FSTTester<>(random(), dir, inputMode, pairs, outputs);
|
||||
final FST<BytesRef> fst = tester.doTest(0, 0, false);
|
||||
final FST<BytesRef> fst = tester.doTest();
|
||||
assertNotNull(fst);
|
||||
assertEquals(24, tester.nodeCount);
|
||||
assertEquals(30, tester.arcCount);
|
||||
|
@ -190,7 +191,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
for (IntsRef term : terms) {
|
||||
pairs.add(new FSTTester.InputOutput<>(term, NO_OUTPUT));
|
||||
}
|
||||
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest(true);
|
||||
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest();
|
||||
}
|
||||
|
||||
// PositiveIntOutput (ord)
|
||||
|
@ -200,7 +201,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
for (int idx = 0; idx < terms.length; idx++) {
|
||||
pairs.add(new FSTTester.InputOutput<>(terms[idx], (long) idx));
|
||||
}
|
||||
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest(true);
|
||||
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest();
|
||||
}
|
||||
|
||||
// PositiveIntOutput (random monotonically increasing positive number)
|
||||
|
@ -213,7 +214,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
lastOutput = value;
|
||||
pairs.add(new FSTTester.InputOutput<>(term, value));
|
||||
}
|
||||
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest(true);
|
||||
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest();
|
||||
}
|
||||
|
||||
// PositiveIntOutput (random positive number)
|
||||
|
@ -224,7 +225,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
pairs.add(
|
||||
new FSTTester.InputOutput<>(term, TestUtil.nextLong(random(), 0, Long.MAX_VALUE)));
|
||||
}
|
||||
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest(true);
|
||||
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest();
|
||||
}
|
||||
|
||||
// Pair<ord, (random monotonically increasing positive number>
|
||||
|
@ -240,7 +241,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
lastOutput = value;
|
||||
pairs.add(new FSTTester.InputOutput<>(terms[idx], outputs.newPair((long) idx, value)));
|
||||
}
|
||||
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest(true);
|
||||
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest();
|
||||
}
|
||||
|
||||
// Sequence-of-bytes
|
||||
|
@ -253,7 +254,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
random().nextInt(30) == 17 ? NO_OUTPUT : newBytesRef(Integer.toString(idx));
|
||||
pairs.add(new FSTTester.InputOutput<>(terms[idx], output));
|
||||
}
|
||||
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest(true);
|
||||
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest();
|
||||
}
|
||||
|
||||
// Sequence-of-ints
|
||||
|
@ -269,7 +270,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
}
|
||||
pairs.add(new FSTTester.InputOutput<>(terms[idx], output));
|
||||
}
|
||||
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest(true);
|
||||
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -336,7 +337,22 @@ public class TestFSTs extends LuceneTestCase {
|
|||
writer.close();
|
||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||
|
||||
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||
FSTCompiler.Builder<Long> builder = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||
|
||||
double suffixRAMLimitMB;
|
||||
|
||||
if (random().nextInt(10) == 4) {
|
||||
// no suffix sharing
|
||||
suffixRAMLimitMB = 0;
|
||||
} else if (random().nextInt(10) == 7) {
|
||||
// share all suffixes (minimal FST)
|
||||
suffixRAMLimitMB = Double.POSITIVE_INFINITY;
|
||||
} else {
|
||||
suffixRAMLimitMB = (random().nextDouble() + 0.01) * 10.0;
|
||||
}
|
||||
builder.suffixRAMLimitMB(suffixRAMLimitMB);
|
||||
|
||||
FSTCompiler<Long> fstCompiler = builder.build();
|
||||
|
||||
boolean storeOrd = random().nextBoolean();
|
||||
if (VERBOSE) {
|
||||
|
@ -505,12 +521,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
private final FSTCompiler<T> fstCompiler;
|
||||
|
||||
public VisitTerms(
|
||||
Path dirOut,
|
||||
Path wordsFileIn,
|
||||
int inputMode,
|
||||
int prune,
|
||||
Outputs<T> outputs,
|
||||
boolean noArcArrays) {
|
||||
Path dirOut, Path wordsFileIn, int inputMode, Outputs<T> outputs, boolean noArcArrays) {
|
||||
this.dirOut = dirOut;
|
||||
this.wordsFileIn = wordsFileIn;
|
||||
this.inputMode = inputMode;
|
||||
|
@ -519,8 +530,6 @@ public class TestFSTs extends LuceneTestCase {
|
|||
fstCompiler =
|
||||
new FSTCompiler.Builder<>(
|
||||
inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, outputs)
|
||||
.minSuffixCount2(prune)
|
||||
.shouldShareSuffix(prune == 0)
|
||||
.allowFixedLengthArcs(!noArcArrays)
|
||||
.build();
|
||||
}
|
||||
|
@ -564,10 +573,6 @@ public class TestFSTs extends LuceneTestCase {
|
|||
long tEnd = System.nanoTime();
|
||||
System.out.println(
|
||||
((tEnd - tMid) / (double) TimeUnit.SECONDS.toNanos(1)) + " sec to finish/pack");
|
||||
if (fst == null) {
|
||||
System.out.println("FST was fully pruned!");
|
||||
System.exit(0);
|
||||
}
|
||||
|
||||
if (dirOut == null) {
|
||||
return;
|
||||
|
@ -661,7 +666,6 @@ public class TestFSTs extends LuceneTestCase {
|
|||
// java -cp
|
||||
// ../build/codecs/classes/java:../test-framework/lib/randomizedtesting-runner-*.jar:../build/core/classes/test:../build/core/classes/test-framework:../build/core/classes/java:../build/test-framework/classes/java:../test-framework/lib/junit-4.10.jar org.apache.lucene.util.fst.TestFSTs /xold/tmp/allTerms3.txt out
|
||||
public static void main(String[] args) throws IOException {
|
||||
int prune = 0;
|
||||
int limit = Integer.MAX_VALUE;
|
||||
int inputMode = 0; // utf8
|
||||
boolean storeOrds = false;
|
||||
|
@ -673,10 +677,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
|
||||
int idx = 0;
|
||||
while (idx < args.length) {
|
||||
if (args[idx].equals("-prune")) {
|
||||
prune = Integer.parseInt(args[1 + idx]);
|
||||
idx++;
|
||||
} else if (args[idx].equals("-limit")) {
|
||||
if (args[idx].equals("-limit")) {
|
||||
limit = Integer.parseInt(args[1 + idx]);
|
||||
idx++;
|
||||
} else if (args[idx].equals("-utf8")) {
|
||||
|
@ -720,7 +721,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton();
|
||||
final PairOutputs<Long, Long> outputs = new PairOutputs<>(o1, o2);
|
||||
new VisitTerms<PairOutputs.Pair<Long, Long>>(
|
||||
dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
|
||||
dirOut, wordsFileIn, inputMode, outputs, noArcArrays) {
|
||||
Random rand;
|
||||
|
||||
@Override
|
||||
|
@ -734,7 +735,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
} else if (storeOrds) {
|
||||
// Store only ords
|
||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
|
||||
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, outputs, noArcArrays) {
|
||||
@Override
|
||||
public Long getOutput(IntsRef input, int ord) {
|
||||
return (long) ord;
|
||||
|
@ -743,7 +744,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
} else if (storeDocFreqs) {
|
||||
// Store only docFreq
|
||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
|
||||
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, outputs, noArcArrays) {
|
||||
Random rand;
|
||||
|
||||
@Override
|
||||
|
@ -758,7 +759,7 @@ public class TestFSTs extends LuceneTestCase {
|
|||
// Store nothing
|
||||
final NoOutputs outputs = NoOutputs.getSingleton();
|
||||
final Object NO_OUTPUT = outputs.getNoOutput();
|
||||
new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
|
||||
new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, outputs, noArcArrays) {
|
||||
@Override
|
||||
public Object getOutput(IntsRef input, int ord) {
|
||||
return NO_OUTPUT;
|
||||
|
@ -1160,19 +1161,20 @@ public class TestFSTs extends LuceneTestCase {
|
|||
s.verifyStateAndBelow(fst, arc, 1);
|
||||
}
|
||||
|
||||
@Ignore("not sure it's possible to get a final state output anymore w/o pruning?")
|
||||
public void testFinalOutputOnEndState() throws Exception {
|
||||
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
|
||||
|
||||
final FSTCompiler<Long> fstCompiler =
|
||||
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).minSuffixCount1(2).build();
|
||||
fstCompiler.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L);
|
||||
fstCompiler.add(Util.toUTF32("station", new IntsRefBuilder()), 10L);
|
||||
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).build();
|
||||
fstCompiler.add(Util.toUTF32("slat", new IntsRefBuilder()), 10L);
|
||||
fstCompiler.add(Util.toUTF32("st", new IntsRefBuilder()), 17L);
|
||||
final FST<Long> fst = fstCompiler.compile();
|
||||
// Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
|
||||
StringWriter w = new StringWriter();
|
||||
Util.toDot(fst, w, false, false);
|
||||
w.close();
|
||||
// System.out.println(w.toString());
|
||||
System.out.println(w.toString());
|
||||
assertTrue(w.toString().contains("label=\"t/[7]\""));
|
||||
}
|
||||
|
||||
|
|
|
@ -121,7 +121,7 @@ public class TestFSTsMisc extends LuceneTestCase {
|
|||
}
|
||||
return output1.equals(output2);
|
||||
}
|
||||
}.doTest(false);
|
||||
}.doTest();
|
||||
}
|
||||
|
||||
// ListOfOutputs(PositiveIntOutputs), generally but not
|
||||
|
@ -157,7 +157,7 @@ public class TestFSTsMisc extends LuceneTestCase {
|
|||
|
||||
pairs.add(new FSTTester.InputOutput<>(terms[idx], output));
|
||||
}
|
||||
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest(false);
|
||||
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -384,9 +384,7 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer {
|
|||
}
|
||||
|
||||
final FSTCompiler<Pair<BytesRef, Long>> fstCompiler =
|
||||
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS)
|
||||
.shouldShareNonSingletonNodes(false)
|
||||
.build();
|
||||
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS).build();
|
||||
// if (DEBUG) {
|
||||
// System.out.println(" compile index for prefix=" + prefix);
|
||||
// }
|
||||
|
|
|
@ -121,8 +121,8 @@ public class FSTCompletionBuilder {
|
|||
/** Scratch buffer for {@link #add(BytesRef, int)}. */
|
||||
private final BytesRefBuilder scratch = new BytesRefBuilder();
|
||||
|
||||
/** Max tail sharing length. */
|
||||
private final int shareMaxTailLength;
|
||||
/** Max tail sharing RAM (MB). */
|
||||
private final double suffixRAMLimitMB;
|
||||
|
||||
/**
|
||||
* Creates an {@link FSTCompletion} with default options: 10 buckets, exact match promoted to
|
||||
|
@ -130,7 +130,7 @@ public class FSTCompletionBuilder {
|
|||
* Comparator#naturalOrder()}.
|
||||
*/
|
||||
public FSTCompletionBuilder() {
|
||||
this(DEFAULT_BUCKETS, new InMemorySorter(Comparator.naturalOrder()), Integer.MAX_VALUE);
|
||||
this(DEFAULT_BUCKETS, new InMemorySorter(Comparator.naturalOrder()), Double.POSITIVE_INFINITY);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -141,13 +141,13 @@ public class FSTCompletionBuilder {
|
|||
* @param sorter {@link BytesRefSorter} used for re-sorting input for the automaton. For large
|
||||
* inputs, use on-disk sorting implementations. The sorter is closed automatically in {@link
|
||||
* #build()} if it implements {@link Closeable}.
|
||||
* @param shareMaxTailLength Max shared suffix sharing length.
|
||||
* @param suffixRAMLimitMB Max shared suffix RAM size (MB).
|
||||
* <p>See the description of this parameter in {@link
|
||||
* org.apache.lucene.util.fst.FSTCompiler.Builder}. In general, for very large inputs you'll
|
||||
* want to construct a non-minimal automaton which will be larger, but the construction will
|
||||
* take far less ram. For minimal automata, set it to {@link Integer#MAX_VALUE}.
|
||||
* take far less ram. For minimal automata, set it to {@link Double#MAX_VALUE}.
|
||||
*/
|
||||
public FSTCompletionBuilder(int buckets, BytesRefSorter sorter, int shareMaxTailLength) {
|
||||
public FSTCompletionBuilder(int buckets, BytesRefSorter sorter, double suffixRAMLimitMB) {
|
||||
if (buckets < 1 || buckets > 255) {
|
||||
throw new IllegalArgumentException("Buckets must be >= 1 and <= 255: " + buckets);
|
||||
}
|
||||
|
@ -158,7 +158,7 @@ public class FSTCompletionBuilder {
|
|||
|
||||
this.sorter = sorter;
|
||||
this.buckets = buckets;
|
||||
this.shareMaxTailLength = shareMaxTailLength;
|
||||
this.suffixRAMLimitMB = suffixRAMLimitMB;
|
||||
}
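
A minimal usage sketch (illustrative value, not part of this change): the third constructor
argument is now a RAM budget in MB rather than a max tail length, mirroring the class's own
default constructor above:

    FSTCompletionBuilder builder =
        new FSTCompletionBuilder(10, new InMemorySorter(Comparator.naturalOrder()), 64.0);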
|
||||
|
||||
/**
|
||||
|
@ -204,7 +204,7 @@ public class FSTCompletionBuilder {
|
|||
final Object empty = outputs.getNoOutput();
|
||||
final FSTCompiler<Object> fstCompiler =
|
||||
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
|
||||
.shareMaxTailLength(shareMaxTailLength)
|
||||
.suffixRAMLimitMB(suffixRAMLimitMB)
|
||||
.build();
|
||||
|
||||
BytesRefBuilder scratch = new BytesRefBuilder();
|
||||
|
|
|
@ -29,7 +29,6 @@ import java.nio.charset.Charset;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
|
@ -180,19 +179,6 @@ public class FSTTester<T> {
|
|||
}
|
||||
}
|
||||
|
||||
public void doTest(boolean testPruning) throws IOException {
|
||||
// no pruning
|
||||
doTest(0, 0, true);
|
||||
|
||||
if (testPruning) {
|
||||
// simple pruning
|
||||
doTest(TestUtil.nextInt(random, 1, 1 + pairs.size()), 0, true);
|
||||
|
||||
// leafy pruning
|
||||
doTest(0, TestUtil.nextInt(random, 1, 1 + pairs.size()), true);
|
||||
}
|
||||
}
|
||||
|
||||
// runs the term, returning the output, or null if term
|
||||
// isn't accepted. if prefixLength is non-null it must be
|
||||
// length 1 int array; prefixLength[0] is set to the length
|
||||
|
@ -267,21 +253,11 @@ public class FSTTester<T> {
    return output;
  }

  public FST<T> doTest(int prune1, int prune2, boolean allowRandomSuffixSharing)
      throws IOException {
    if (LuceneTestCase.VERBOSE) {
      System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2);
    }
  public FST<T> doTest() throws IOException {

    final FSTCompiler<T> fstCompiler =
        new FSTCompiler.Builder<>(
                inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, outputs)
            .minSuffixCount1(prune1)
            .minSuffixCount2(prune2)
            .shouldShareSuffix(prune1 == 0 && prune2 == 0)
            .shouldShareNonSingletonNodes(allowRandomSuffixSharing ? random.nextBoolean() : true)
            .shareMaxTailLength(
                allowRandomSuffixSharing ? TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE)
            .build();

    for (InputOutput<T> pair : pairs) {

@ -332,11 +308,7 @@ public class FSTTester<T> {
      }
    }

    if (prune1 == 0 && prune2 == 0) {
      verifyUnPruned(inputMode, fst);
    } else {
      verifyPruned(inputMode, fst, prune1, prune2);
    }
    verifyUnPruned(inputMode, fst);

    nodeCount = fstCompiler.getNodeCount();
    arcCount = fstCompiler.getArcCount();

@ -646,207 +618,4 @@ public class FSTTester<T> {
      }
    }
  }

  private static class CountMinOutput<T> {
    int count;
    T output;
    T finalOutput;
    boolean isLeaf = true;
    boolean isFinal;
  }

  // FST is pruned
  private void verifyPruned(int inputMode, FST<T> fst, int prune1, int prune2) throws IOException {

    if (LuceneTestCase.VERBOSE) {
      System.out.println("TEST: now verify pruned " + pairs.size() + " terms; outputs=" + outputs);
      for (InputOutput<T> pair : pairs) {
        System.out.println(
            " "
                + inputToString(inputMode, pair.input)
                + ": "
                + outputs.outputToString(pair.output));
      }
    }

    // To validate the FST, we brute-force compute all prefixes
    // in the terms, matched to their "common" outputs, prune that
    // set according to the prune thresholds, then assert the FST
    // matches that same set.

    // NOTE: Crazy RAM intensive!!

    // System.out.println("TEST: tally prefixes");

    // build all prefixes
    final Map<IntsRef, CountMinOutput<T>> prefixes = new HashMap<>();
    final IntsRefBuilder scratch = new IntsRefBuilder();
    for (InputOutput<T> pair : pairs) {
      scratch.copyInts(pair.input);
      for (int idx = 0; idx <= pair.input.length; idx++) {
        scratch.setLength(idx);
        CountMinOutput<T> cmo = prefixes.get(scratch.get());
        if (cmo == null) {
          cmo = new CountMinOutput<>();
          cmo.count = 1;
          cmo.output = pair.output;
          prefixes.put(scratch.toIntsRef(), cmo);
        } else {
          cmo.count++;
          T output1 = cmo.output;
          if (output1.equals(outputs.getNoOutput())) {
            output1 = outputs.getNoOutput();
          }
          T output2 = pair.output;
          if (output2.equals(outputs.getNoOutput())) {
            output2 = outputs.getNoOutput();
          }
          cmo.output = outputs.common(output1, output2);
        }
        if (idx == pair.input.length) {
          cmo.isFinal = true;
          cmo.finalOutput = cmo.output;
        }
      }
    }

    if (LuceneTestCase.VERBOSE) {
      System.out.println("TEST: now prune");
    }

    // prune 'em
    final Iterator<Map.Entry<IntsRef, CountMinOutput<T>>> it = prefixes.entrySet().iterator();
    while (it.hasNext()) {
      Map.Entry<IntsRef, CountMinOutput<T>> ent = it.next();
      final IntsRef prefix = ent.getKey();
      final CountMinOutput<T> cmo = ent.getValue();
      if (LuceneTestCase.VERBOSE) {
        System.out.println(
            " term prefix="
                + inputToString(inputMode, prefix, false)
                + " count="
                + cmo.count
                + " isLeaf="
                + cmo.isLeaf
                + " output="
                + outputs.outputToString(cmo.output)
                + " isFinal="
                + cmo.isFinal);
      }
      final boolean keep;
      if (prune1 > 0) {
        keep = cmo.count >= prune1;
      } else {
        assert prune2 > 0;
        if (prune2 > 1 && cmo.count >= prune2) {
          keep = true;
        } else if (prefix.length > 0) {
          // consult our parent
          scratch.setLength(prefix.length - 1);
          System.arraycopy(prefix.ints, prefix.offset, scratch.ints(), 0, scratch.length());
          final CountMinOutput<T> cmo2 = prefixes.get(scratch.get());
          // System.out.println(" parent count = " + (cmo2 == null ? -1 : cmo2.count));
          keep =
              cmo2 != null
                  && ((prune2 > 1 && cmo2.count >= prune2)
                      || (prune2 == 1 && (cmo2.count >= 2 || prefix.length <= 1)));
        } else {
          keep = cmo.count >= prune2;
        }
      }

      if (!keep) {
        it.remove();
        // System.out.println(" remove");
      } else {
        // clear isLeaf for all ancestors
        // System.out.println(" keep");
        scratch.copyInts(prefix);
        scratch.setLength(scratch.length() - 1);
        while (scratch.length() >= 0) {
          final CountMinOutput<T> cmo2 = prefixes.get(scratch.get());
          if (cmo2 != null) {
            // System.out.println(" clear isLeaf " + inputToString(inputMode, scratch));
            cmo2.isLeaf = false;
          }
          scratch.setLength(scratch.length() - 1);
        }
      }
    }

    if (LuceneTestCase.VERBOSE) {
      System.out.println("TEST: after prune");
      for (Map.Entry<IntsRef, CountMinOutput<T>> ent : prefixes.entrySet()) {
        System.out.println(
            " "
                + inputToString(inputMode, ent.getKey(), false)
                + ": isLeaf="
                + ent.getValue().isLeaf
                + " isFinal="
                + ent.getValue().isFinal);
        if (ent.getValue().isFinal) {
          System.out.println(
              " finalOutput=" + outputs.outputToString(ent.getValue().finalOutput));
        }
      }
    }

    if (prefixes.size() <= 1) {
      assertNull(fst);
      return;
    }

    assertNotNull(fst);

    // make sure FST only enums valid prefixes
    if (LuceneTestCase.VERBOSE) {
      System.out.println("TEST: check pruned enum");
    }
    IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<>(fst);
    IntsRefFSTEnum.InputOutput<T> current;
    while ((current = fstEnum.next()) != null) {
      if (LuceneTestCase.VERBOSE) {
        System.out.println(
            " fstEnum.next prefix="
                + inputToString(inputMode, current.input, false)
                + " output="
                + outputs.outputToString(current.output));
      }
      final CountMinOutput<T> cmo = prefixes.get(current.input);
      assertNotNull(cmo);
      assertTrue(cmo.isLeaf || cmo.isFinal);
      // if (cmo.isFinal && !cmo.isLeaf) {
      if (cmo.isFinal) {
        assertEquals(cmo.finalOutput, current.output);
      } else {
        assertEquals(cmo.output, current.output);
      }
    }

    // make sure all non-pruned prefixes are present in the FST
    if (LuceneTestCase.VERBOSE) {
      System.out.println("TEST: verify all prefixes");
    }
    final int[] stopNode = new int[1];
    for (Map.Entry<IntsRef, CountMinOutput<T>> ent : prefixes.entrySet()) {
      if (ent.getKey().length > 0) {
        final CountMinOutput<T> cmo = ent.getValue();
        final T output = run(fst, ent.getKey(), stopNode);
        if (LuceneTestCase.VERBOSE) {
          System.out.println(
              "TEST: verify prefix="
                  + inputToString(inputMode, ent.getKey(), false)
                  + " output="
                  + outputs.outputToString(cmo.output));
        }
        // if (cmo.isFinal && !cmo.isLeaf) {
        if (cmo.isFinal) {
          assertEquals(cmo.finalOutput, output);
        } else {
          assertEquals(cmo.output, output);
        }
        assertEquals(ent.getKey().length, stopNode[0]);
      }
    }
  }
}