Bound the RAM used by the NodeHash (sharing common suffixes) during FST compilation (#12633)

* tweak comments; change if to switch

* remove old SOPs, minor comment styling, fixed silly performance bug on rehash using the wrong bitsRequired (count vs node)

* first raw cut; some nocommits added; some tests fail

* tests pass!

* fix silly fallback hash bug

* remove SOPs; add some temporary debugging metrics

* add temporary tool to test FST performance across differing NodeHash sizes

* remove (now deleted) shouldShareNonSingletonNodes call from Lucene90BlockTreeTermsWriter

* add simple tool to render results table to GitHub MD

* add simple temporary tool to iterate all terms from a provided luceneutil wikipedia index and build an FST from them

* first cut at using packed ints for hash table again

* add some nocommits; tweak test_all_sizes.py to new RAM usage approach; when half of the double barrel is full, allocate new primary hash at full size to save cost of continuously rehashing for a large FST

* switch to limit suffix hash by RAM usage not count (more intuitive for users); clean up some stale nocommits

* switch to more intuitive approximate RAM (MB) limit for allowed size of NodeHash

* nuke a few nocommits; a few more remain

* remove DO_PRINT_HASH_RAM

* no more FST pruning

* remove final nocommit: randomly change allowed NodeHash suffix RAM size in TestFSTs.testRealTerms

* remove SOP

* tidy

* delete temp utility tools

* remove dead (FST pruning) code

* add CHANGES entry; fix one missed fst.addNode -> fstCompiler.addNode during merge conflict resolution

* remove a mal-formed nocommit

* fold PR feedback

* fold feedback

* add gradle help test details on how to specify heap size for the test JVM; fix bogus assert (uncovered by Test2BFST); add TODO to Test2BFST anticipating building massive FSTs in small bounded RAM

* suppress sysout checks for Test2BFSTs; add helpful comment showing how to run it directly

* tidy
Michael McCandless 2023-10-20 11:52:55 -04:00 committed by GitHub
parent 0d8a3e6c4f
commit afb2a60751
14 changed files with 390 additions and 622 deletions

View File

@ -133,6 +133,15 @@ specifying the project and test task or a fully qualified task path. Example:
gradlew -p lucene/core test -Ptests.verbose=true --tests "TestDemo"
Larger heap size
--------------------------
By default tests run with a 512 MB max heap. But some tests (monster/nightly)
need more heap. Use "-Dtests.heapsize" for this:
gradlew -p lucene/core test --tests "Test2BFST" -Dtests.heapsize=32g
Run GUI tests headlessly with Xvfb (Linux only)
-----------------------------------------------

View File

@ -82,6 +82,13 @@ Improvements
* GITHUB#12447: Hunspell: speed up the dictionary enumeration on suggestion (Peter Gromov)
* GITHUB#12542: FSTCompiler can now approximately limit how much RAM it uses to share
suffixes during FST construction using the suffixRAMLimitMB method. Larger values
result in a more minimal FST (more common suffixes are shared). Pass
Double.POSITIVE_INFINITY to use as much RAM as is needed to create a purely
minimal FST. Inspired by this Rust FST implementation:
https://blog.burntsushi.net/transducers (Mike McCandless)
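
For reference, a minimal sketch of the new option in use, assuming a Lucene version that
contains this change (the class name and values below are illustrative; inputs must be added
in sorted order):

import java.io.IOException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

public class SuffixRAMLimitDemo {
  public static void main(String[] args) throws IOException {
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    FSTCompiler<Long> compiler =
        new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
            // cap the suffix-sharing hash at ~16 MB; Double.POSITIVE_INFINITY
            // would keep all suffixes and produce a fully minimal FST
            .suffixRAMLimitMB(16.0)
            .build();
    IntsRefBuilder scratch = new IntsRefBuilder();
    // inputs must be added in sorted order
    compiler.add(Util.toIntsRef(new BytesRef("station"), scratch), 17L);
    compiler.add(Util.toIntsRef(new BytesRef("stations"), scratch), 18L);
    FST<Long> fst = compiler.compile();
    System.out.println("FST size: " + fst.ramBytesUsed() + " bytes");
  }
}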
Optimizations
---------------------

View File

@ -478,9 +478,7 @@ public final class Lucene40BlockTreeTermsWriter extends FieldsConsumer {
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
final FSTCompiler<BytesRef> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
.shouldShareNonSingletonNodes(false)
.build();
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).build();
// if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);
// }

View File

@ -395,9 +395,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
}
final FSTCompiler<Output> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS)
.shouldShareNonSingletonNodes(false)
.build();
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS).build();
// if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);
// }

View File

@ -521,10 +521,7 @@ public final class Lucene90BlockTreeTermsWriter extends FieldsConsumer {
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
final FSTCompiler<BytesRef> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
.shouldShareNonSingletonNodes(false)
.bytesPageBits(pageBits)
.build();
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs).bytesPageBits(pageBits).build();
// if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);
// }

View File

@ -83,13 +83,16 @@ public final class FST<T> implements Accountable {
static final int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5;
/** Value of the arc flags to declare a node with fixed length arcs designed for binary search. */
/**
* Value of the arc flags to declare a node with fixed length (sparse) arcs designed for binary
* search.
*/
// We use this as a marker because this one flag is illegal by itself.
public static final byte ARCS_FOR_BINARY_SEARCH = BIT_ARC_HAS_FINAL_OUTPUT;
/**
* Value of the arc flags to declare a node with fixed length arcs and bit table designed for
* direct addressing.
* Value of the arc flags to declare a node with fixed length dense arcs and bit table designed
* for direct addressing.
*/
static final byte ARCS_FOR_DIRECT_ADDRESSING = 1 << 6;
@ -751,11 +754,9 @@ public final class FST<T> implements Accountable {
private void readFirstArcInfo(long nodeAddress, Arc<T> arc, final BytesReader in)
throws IOException {
in.setPosition(nodeAddress);
// System.out.println(" flags=" + arc.flags);
byte flags = arc.nodeFlags = in.readByte();
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) {
// System.out.println(" fixed length arc");
// Special arc which is actually a node header for fixed length arcs.
arc.numArcs = in.readVInt();
arc.bytesPerArc = in.readVInt();
@ -766,8 +767,6 @@ public final class FST<T> implements Accountable {
arc.presenceIndex = -1;
}
arc.posArcsStart = in.getPosition();
// System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + "
// arcsStart=" + pos);
} else {
arc.nextArc = nodeAddress;
arc.bytesPerArc = 0;
@ -830,14 +829,12 @@ public final class FST<T> implements Accountable {
}
}
} else {
if (arc.bytesPerArc() != 0) {
// System.out.println(" nextArc real array");
// Arcs have fixed length.
if (arc.nodeFlags() == ARCS_FOR_BINARY_SEARCH) {
switch (arc.nodeFlags()) {
case ARCS_FOR_BINARY_SEARCH:
// Point to next arc, -1 to skip arc flags.
in.setPosition(arc.posArcsStart() - (1 + arc.arcIdx()) * (long) arc.bytesPerArc() - 1);
} else {
assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING;
break;
case ARCS_FOR_DIRECT_ADDRESSING:
// Direct addressing node. The label is not stored but rather inferred
// based on first label and arc index in the range.
assert BitTable.assertIsValid(arc, in);
@ -845,12 +842,14 @@ public final class FST<T> implements Accountable {
int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in);
assert nextIndex != -1;
return arc.firstLabel() + nextIndex;
}
} else {
// Arcs have variable length.
// System.out.println(" nextArc real list");
// Position to next arc, -1 to skip flags.
in.setPosition(arc.nextArc() - 1);
default:
// Variable length arcs - linear search.
assert arc.bytesPerArc() == 0;
// Arcs have variable length.
// System.out.println(" nextArc real list");
// Position to next arc, -1 to skip flags.
in.setPosition(arc.nextArc() - 1);
break;
}
}
return readLabel(in);
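
As an aside, the two fixed-length encodings this switch distinguishes can be illustrated with a
simplified standalone sketch (not Lucene's actual classes): binary-search nodes store each arc's
label in its fixed-width slot, while direct-addressing nodes store only a presence bit table and
infer the label from the first label plus the arc index.

import java.util.BitSet;

// Simplified illustration (not Lucene's actual API) of how the next label
// is found under the two fixed-length-arc encodings.
class FixedLengthArcs {
  // ARCS_FOR_BINARY_SEARCH (sparse): labels are stored per arc, so the
  // next label is simply read from the next fixed-width arc slot.
  static int nextLabelSparse(int[] storedLabels, int arcIdx) {
    return storedLabels[arcIdx + 1];
  }

  // ARCS_FOR_DIRECT_ADDRESSING (dense): labels are not stored; a bit table
  // records which labels in [firstLabel, firstLabel + range) are present,
  // and the label is inferred as firstLabel + index of the next set bit.
  static int nextLabelDense(BitSet presence, int firstLabel, int arcIdx) {
    int nextIndex = presence.nextSetBit(arcIdx + 1);
    assert nextIndex != -1 : "no next arc";
    return firstLabel + nextIndex;
  }
}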

View File

@ -89,18 +89,6 @@ public class FSTCompiler<T> {
// private static final boolean DEBUG = true;
// simplistic pruning: we prune node (and all following
// nodes) if less than this number of terms go through it:
private final int minSuffixCount1;
// better pruning: we prune node (and all following
// nodes) if the prior node has less than this number of
// terms go through it:
private final int minSuffixCount2;
private final boolean doShareNonSingletonNodes;
private final int shareMaxTailLength;
private final IntsRefBuilder lastInput = new IntsRefBuilder();
// NOTE: cutting this over to ArrayList instead loses ~6%
@ -135,32 +123,27 @@ public class FSTCompiler<T> {
* Instantiates an FST/FSA builder with default settings and pruning options turned off. For more
* tuning and tweaking, see {@link Builder}.
*/
// TODO: remove this? Builder API should be the only entry point?
public FSTCompiler(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15, 1f);
this(inputType, 32.0, outputs, true, 15, 1f);
}
private FSTCompiler(
FST.INPUT_TYPE inputType,
int minSuffixCount1,
int minSuffixCount2,
boolean doShareSuffix,
boolean doShareNonSingletonNodes,
int shareMaxTailLength,
double suffixRAMLimitMB,
Outputs<T> outputs,
boolean allowFixedLengthArcs,
int bytesPageBits,
float directAddressingMaxOversizingFactor) {
this.minSuffixCount1 = minSuffixCount1;
this.minSuffixCount2 = minSuffixCount2;
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
this.shareMaxTailLength = shareMaxTailLength;
this.allowFixedLengthArcs = allowFixedLengthArcs;
this.directAddressingMaxOversizingFactor = directAddressingMaxOversizingFactor;
fst = new FST<>(inputType, outputs, bytesPageBits);
bytes = fst.bytes;
assert bytes != null;
if (doShareSuffix) {
dedupHash = new NodeHash<>(fst, bytes.getReverseReader(false));
if (suffixRAMLimitMB < 0) {
throw new IllegalArgumentException("suffixRAMLimitMB must be >= 0; got: " + suffixRAMLimitMB);
} else if (suffixRAMLimitMB > 0) {
dedupHash = new NodeHash<>(fst, suffixRAMLimitMB, bytes.getReverseReader(false));
} else {
dedupHash = null;
}
@ -184,11 +167,7 @@ public class FSTCompiler<T> {
private final INPUT_TYPE inputType;
private final Outputs<T> outputs;
private int minSuffixCount1;
private int minSuffixCount2;
private boolean shouldShareSuffix = true;
private boolean shouldShareNonSingletonNodes = true;
private int shareMaxTailLength = Integer.MAX_VALUE;
private double suffixRAMLimitMB = 32.0;
private boolean allowFixedLengthArcs = true;
private int bytesPageBits = 15;
private float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR;
@ -207,59 +186,26 @@ public class FSTCompiler<T> {
}
/**
* If pruning the input graph during construction, this threshold is used for telling if a node
* is kept or pruned. If transition_count(node) &gt;= minSuffixCount1, the node is kept.
* The approximate maximum amount of RAM (in MB) to use holding the suffix cache, which enables
* the FST to share common suffixes. Pass {@link Double#POSITIVE_INFINITY} to keep all suffixes
* and create an exactly minimal FST. In this case, the amount of RAM actually used will be
* bounded by the number of unique suffixes. If you pass a value smaller than the builder would
* use, the least recently used suffixes will be discarded, thus reducing suffix sharing and
* creating a non-minimal FST. In this case, the larger the limit, the closer the FST will be to
* its true minimal size, with diminishing returns as you increase the limit. Pass {@code 0} to
* disable suffix sharing entirely, but note that the resulting FST can be substantially larger
* than the minimal FST.
*
* <p>Default = 0.
*/
public Builder<T> minSuffixCount1(int minSuffixCount1) {
this.minSuffixCount1 = minSuffixCount1;
return this;
}
/**
* Better pruning: we prune node (and all following nodes) if the prior node has less than this
* number of terms go through it.
* <p>Note that this is not a precise limit. The current implementation uses hash tables to map
* the suffixes, and approximates the rough overhead (unused slots) in the hash table.
*
* <p>Default = 0.
* <p>Default = {@code 32.0} MB.
*/
public Builder<T> minSuffixCount2(int minSuffixCount2) {
this.minSuffixCount2 = minSuffixCount2;
return this;
}
/**
* If {@code true}, the shared suffixes will be compacted into unique paths. This requires an
* additional RAM-intensive hash map for lookups in memory. Setting this parameter to {@code
* false} creates a single suffix path for all input sequences. This will result in a larger
* FST, but requires substantially less memory and CPU during building.
*
* <p>Default = {@code true}.
*/
public Builder<T> shouldShareSuffix(boolean shouldShareSuffix) {
this.shouldShareSuffix = shouldShareSuffix;
return this;
}
/**
* Only used if {@code shouldShareSuffix} is true. Set this to true to ensure FST is fully
* minimal, at cost of more CPU and more RAM during building.
*
* <p>Default = {@code true}.
*/
public Builder<T> shouldShareNonSingletonNodes(boolean shouldShareNonSingletonNodes) {
this.shouldShareNonSingletonNodes = shouldShareNonSingletonNodes;
return this;
}
/**
* Only used if {@code shouldShareSuffix} is true. Set this to Integer.MAX_VALUE to ensure FST
* is fully minimal, at cost of more CPU and more RAM during building.
*
* <p>Default = {@link Integer#MAX_VALUE}.
*/
public Builder<T> shareMaxTailLength(int shareMaxTailLength) {
this.shareMaxTailLength = shareMaxTailLength;
public Builder<T> suffixRAMLimitMB(double mb) {
if (mb < 0) {
throw new IllegalArgumentException("suffixRAMLimitMB must be >= 0; got: " + mb);
}
this.suffixRAMLimitMB = mb;
return this;
}
@ -309,11 +255,7 @@ public class FSTCompiler<T> {
FSTCompiler<T> fstCompiler =
new FSTCompiler<>(
inputType,
minSuffixCount1,
minSuffixCount2,
shouldShareSuffix,
shouldShareNonSingletonNodes,
shareMaxTailLength,
suffixRAMLimitMB,
outputs,
allowFixedLengthArcs,
bytesPageBits,
@ -346,9 +288,7 @@ public class FSTCompiler<T> {
private CompiledNode compileNode(UnCompiledNode<T> nodeIn, int tailLength) throws IOException {
final long node;
long bytesPosStart = bytes.getPosition();
if (dedupHash != null
&& (doShareNonSingletonNodes || nodeIn.numArcs <= 1)
&& tailLength <= shareMaxTailLength) {
if (dedupHash != null) {
if (nodeIn.numArcs == 0) {
node = addNode(nodeIn);
lastFrozenNode = node;
@ -739,113 +679,36 @@ public class FSTCompiler<T> {
}
private void freezeTail(int prefixLenPlus1) throws IOException {
// System.out.println(" compileTail " + prefixLenPlus1);
final int downTo = Math.max(1, prefixLenPlus1);
for (int idx = lastInput.length(); idx >= downTo; idx--) {
boolean doPrune = false;
boolean doCompile = false;
final int downTo = Math.max(1, prefixLenPlus1);
for (int idx = lastInput.length(); idx >= downTo; idx--) {
final UnCompiledNode<T> node = frontier[idx];
final UnCompiledNode<T> parent = frontier[idx - 1];
if (node.inputCount < minSuffixCount1) {
doPrune = true;
doCompile = true;
} else if (idx > prefixLenPlus1) {
// prune if parent's inputCount is less than suffixMinCount2
if (parent.inputCount < minSuffixCount2
|| (minSuffixCount2 == 1 && parent.inputCount == 1 && idx > 1)) {
// my parent, about to be compiled, doesn't make the cut, so
// I'm definitely pruned
final T nextFinalOutput = node.output;
// if minSuffixCount2 is 1, we keep only up
// until the 'distinguished edge', ie we keep only the
// 'divergent' part of the FST. if my parent, about to be
// compiled, has inputCount 1 then we are already past the
// distinguished edge. NOTE: this only works if
// the FST outputs are not "compressible" (simple
// ords ARE compressible).
doPrune = true;
} else {
// my parent, about to be compiled, does make the cut, so
// I'm definitely not pruned
doPrune = false;
}
doCompile = true;
} else {
// if pruning is disabled (count is 0) we can always
// compile current node
doCompile = minSuffixCount2 == 0;
}
// We "fake" the node as being final if it has no
// outgoing arcs; in theory we could leave it
// as non-final (the FST can represent this), but
// FSTEnum, Util, etc., have trouble w/ non-final
// dead-end states:
// System.out.println(" label=" + ((char) lastInput.ints[lastInput.offset+idx-1]) + " idx="
// + idx + " inputCount=" + frontier[idx].inputCount + " doCompile=" + doCompile + " doPrune="
// + doPrune);
// TODO: is node.numArcs == 0 always false? we no longer prune any nodes from FST:
final boolean isFinal = node.isFinal || node.numArcs == 0;
if (node.inputCount < minSuffixCount2
|| (minSuffixCount2 == 1 && node.inputCount == 1 && idx > 1)) {
// drop all arcs
for (int arcIdx = 0; arcIdx < node.numArcs; arcIdx++) {
@SuppressWarnings({"rawtypes", "unchecked"})
final UnCompiledNode<T> target = (UnCompiledNode<T>) node.arcs[arcIdx].target;
target.clear();
}
node.numArcs = 0;
}
if (doPrune) {
// this node doesn't make it -- deref it
node.clear();
parent.deleteLast(lastInput.intAt(idx - 1), node);
} else {
if (minSuffixCount2 != 0) {
compileAllTargets(node, lastInput.length() - idx);
}
final T nextFinalOutput = node.output;
// We "fake" the node as being final if it has no
// outgoing arcs; in theory we could leave it
// as non-final (the FST can represent this), but
// FSTEnum, Util, etc., have trouble w/ non-final
// dead-end states:
final boolean isFinal = node.isFinal || node.numArcs == 0;
if (doCompile) {
// this node makes it and we now compile it. first,
// compile any targets that were previously
// undecided:
parent.replaceLast(
lastInput.intAt(idx - 1),
compileNode(node, 1 + lastInput.length() - idx),
nextFinalOutput,
isFinal);
} else {
// replaceLast just to install
// nextFinalOutput/isFinal onto the arc
parent.replaceLast(lastInput.intAt(idx - 1), node, nextFinalOutput, isFinal);
// this node will stay in play for now, since we are
// undecided on whether to prune it. later, it
// will be either compiled or pruned, so we must
// allocate a new node:
frontier[idx] = new UnCompiledNode<>(this, idx);
}
}
// this node makes it and we now compile it. first,
// compile any targets that were previously
// undecided:
parent.replaceLast(
lastInput.intAt(idx - 1),
compileNode(node, 1 + lastInput.length() - idx),
nextFinalOutput,
isFinal);
}
}
// for debugging
/*
private String toString(BytesRef b) {
try {
return b.utf8ToString() + " " + b;
} catch (Throwable t) {
return b.toString();
}
}
*/
/**
* Add the next input/output pair. The provided input must be sorted after the previous one
* according to {@link IntsRef#compareTo}. It's also OK to add the same input twice in a row with
@ -987,20 +850,12 @@ public class FSTCompiler<T> {
// minimize nodes in the last word's suffix
freezeTail(0);
if (root.inputCount < minSuffixCount1
|| root.inputCount < minSuffixCount2
|| root.numArcs == 0) {
if (root.numArcs == 0) {
if (fst.emptyOutput == null) {
return null;
} else if (minSuffixCount1 > 0 || minSuffixCount2 > 0) {
// empty string got pruned
return null;
}
} else {
if (minSuffixCount2 != 0) {
compileAllTargets(root, lastInput.length());
}
}
// if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + "
// root.output=" + root.output);
fst.finish(compileNode(root, lastInput.length()).node);
@ -1008,22 +863,6 @@ public class FSTCompiler<T> {
return fst;
}
private void compileAllTargets(UnCompiledNode<T> node, int tailLength) throws IOException {
for (int arcIdx = 0; arcIdx < node.numArcs; arcIdx++) {
final Arc<T> arc = node.arcs[arcIdx];
if (!arc.target.isCompiled()) {
// not yet compiled
@SuppressWarnings({"rawtypes", "unchecked"})
final UnCompiledNode<T> n = (UnCompiledNode<T>) arc.target;
if (n.numArcs == 0) {
// System.out.println("seg=" + segment + " FORCE final arc=" + (char) arc.label);
arc.isFinal = n.isFinal = true;
}
arc.target = compileNode(n, tailLength - 1);
}
}
}
/** Expert: holds a pending (seen but not yet serialized) arc. */
static class Arc<T> {
int label; // really an "unsigned" byte
@ -1065,6 +904,9 @@ public class FSTCompiler<T> {
// code here...
T output;
boolean isFinal;
// TODO: remove this tracking? we used to use it for confusingly pruning NodeHash, but
// we switched to LRU by RAM usage instead:
long inputCount;
/** This node's depth, starting from the automaton root. */
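
To make the suffixRAMLimitMB tradeoff documented above concrete, here is a hedged sketch (the
class name and synthetic corpus are invented for illustration) that builds the same sorted input
under different limits; per the javadoc, larger limits should produce smaller, closer-to-minimal
FSTs, with diminishing returns:

import java.io.IOException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.NoOutputs;
import org.apache.lucene.util.fst.Util;

public class CompareSuffixLimits {
  // builds an FSA over synthetic sorted terms that share a long suffix,
  // returning the compiled FST's size in bytes
  static long build(double suffixRAMLimitMB) throws IOException {
    NoOutputs outputs = NoOutputs.getSingleton();
    FSTCompiler<Object> compiler =
        new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
            .suffixRAMLimitMB(suffixRAMLimitMB)
            .build();
    IntsRefBuilder scratch = new IntsRefBuilder();
    for (int i = 0; i < 500_000; i++) {
      // zero-padding keeps the terms in sorted order
      String term = String.format("%06d_commonsuffix", i);
      compiler.add(Util.toIntsRef(new BytesRef(term), scratch), outputs.getNoOutput());
    }
    return compiler.compile().ramBytesUsed();
  }

  public static void main(String[] args) throws IOException {
    System.out.println("0 (no sharing): " + build(0) + " bytes");
    System.out.println("1 MB limit:     " + build(1.0) + " bytes");
    System.out.println("minimal (inf):  " + build(Double.POSITIVE_INFINITY) + " bytes");
  }
}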

View File

@ -20,76 +20,160 @@ import java.io.IOException;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PagedGrowableWriter;
// TODO: any way to make a reverse suffix lookup (msokolov's idea) instead of more costly hash?
// hmmm, though, hash is not so wasteful
// since it does not have to store value of each entry: the value is the node pointer in the FST.
// actually, there is much to save
// there -- we would not need any long per entry -- we'd be able to start at the FST end node and
// work backwards from the transitions
// TODO: couldn't we prune naturally back until we see a transition with an output? it's highly
// unlikely (mostly impossible) such suffixes can be shared?
// Used to dedup states (lookup already-frozen states)
final class NodeHash<T> {
private PagedGrowableWriter table;
private long count;
private long mask;
// primary table -- we add nodes into this until its approximate RAM usage reaches half of the
// limit, then we move it to fallback
private PagedGrowableHash primaryTable;
// approximate max RAM (in bytes) we may use across the primary and fallback tables; when the
// primary table's estimated usage gets too large, it is moved to the fallback table
private final long ramLimitBytes;
// fallback table. if we fallback and find the frozen node here, we promote it to primary table,
// for a simplistic and lowish-RAM-overhead
// (compared to e.g. LinkedHashMap) LRU behaviour. fallbackTable is read-only.
private PagedGrowableHash fallbackTable;
private final FST<T> fst;
private final FST.Arc<T> scratchArc = new FST.Arc<>();
private final FST.BytesReader in;
public NodeHash(FST<T> fst, FST.BytesReader in) {
table = new PagedGrowableWriter(16, 1 << 27, 8, PackedInts.COMPACT);
mask = 15;
/**
* ramLimitMB is the max RAM we can use for recording suffixes. If we hit this limit, the least
* recently used suffixes are discarded, and the FST is no longer minimal. Still, larger
* ramLimitMB will make the FST smaller (closer to minimal).
*/
public NodeHash(FST<T> fst, double ramLimitMB, FST.BytesReader in) {
if (ramLimitMB <= 0) {
throw new IllegalArgumentException("ramLimitMB must be > 0; got: " + ramLimitMB);
}
double asBytes = ramLimitMB * 1024 * 1024;
if (asBytes >= Long.MAX_VALUE) {
// quietly truncate to Long.MAX_VALUE in bytes too
ramLimitBytes = Long.MAX_VALUE;
} else {
ramLimitBytes = (long) asBytes;
}
primaryTable = new PagedGrowableHash();
this.fst = fst;
this.in = in;
}
private boolean nodesEqual(FSTCompiler.UnCompiledNode<T> node, long address) throws IOException {
fst.readFirstRealTargetArc(address, scratchArc, in);
// Fail fast for a node with fixed length arcs.
if (scratchArc.bytesPerArc() != 0) {
if (scratchArc.nodeFlags() == FST.ARCS_FOR_BINARY_SEARCH) {
if (node.numArcs != scratchArc.numArcs()) {
return false;
}
} else {
assert scratchArc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING;
if ((node.arcs[node.numArcs - 1].label - node.arcs[0].label + 1) != scratchArc.numArcs()
|| node.numArcs != FST.Arc.BitTable.countBits(scratchArc, in)) {
return false;
}
}
private long getFallback(FSTCompiler.UnCompiledNode<T> nodeIn, long hash) throws IOException {
if (fallbackTable == null) {
// no fallback yet (primary table is not yet large enough to swap)
return 0;
}
for (int arcUpto = 0; arcUpto < node.numArcs; arcUpto++) {
final FSTCompiler.Arc<T> arc = node.arcs[arcUpto];
if (arc.label != scratchArc.label()
|| !arc.output.equals(scratchArc.output())
|| ((FSTCompiler.CompiledNode) arc.target).node != scratchArc.target()
|| !arc.nextFinalOutput.equals(scratchArc.nextFinalOutput())
|| arc.isFinal != scratchArc.isFinal()) {
return false;
long pos = hash & fallbackTable.mask;
int c = 0;
while (true) {
long node = fallbackTable.get(pos);
if (node == 0) {
// not found
return 0;
} else if (nodesEqual(nodeIn, node)) {
// frozen version of this node is already here
return node;
}
if (scratchArc.isLast()) {
if (arcUpto == node.numArcs - 1) {
return true;
// quadratic probe (but is it, really?)
pos = (pos + (++c)) & fallbackTable.mask;
}
}
public long add(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn)
throws IOException {
long hash = hash(nodeIn);
long pos = hash & primaryTable.mask;
int c = 0;
while (true) {
long node = primaryTable.get(pos);
if (node == 0) {
// node is not in primary table; is it in fallback table?
node = getFallback(nodeIn, hash);
if (node != 0) {
// it was already in fallback -- promote to primary
primaryTable.set(pos, node);
} else {
return false;
}
}
fst.readNextRealArc(scratchArc, in);
}
// not in fallback either -- freeze & add the incoming node
return false;
// freeze & add
node = fstCompiler.addNode(nodeIn);
// we use 0 as empty marker in hash table, so it better be impossible to get a frozen node
// at 0:
assert node != 0;
// confirm frozen hash and unfrozen hash are the same
assert hash(node) == hash : "mismatch frozenHash=" + hash(node) + " vs hash=" + hash;
primaryTable.set(pos, node);
}
// how many bytes would be used if we had "perfect" hashing:
long ramBytesUsed = primaryTable.count * PackedInts.bitsRequired(node) / 8;
// NOTE: we could instead use the more precise RAM used, but this leads to unpredictable
// quantized behavior due to 2X rehashing where for large ranges of the RAM limit, the
// size of the FST does not change, and then suddenly when you cross a secret threshold,
// it drops. With this approach (measuring "perfect" hash storage and approximating the
// overhead), the behaviour is more strictly monotonic: larger RAM limits smoothly result
// in smaller FSTs, even if the precise RAM used is not always under the limit.
// divide limit by 2 because fallback gets half the RAM and primary gets the other half
// divide by 2 again to account for approximate hash table overhead halfway between 33.3%
// and 66.7% occupancy = 50%
if (ramBytesUsed >= ramLimitBytes / (2 * 2)) {
// time to fallback -- fallback is now used read-only to promote a node (suffix) to
// primary if we encounter it again
fallbackTable = primaryTable;
// size primary table the same size to reduce rehash cost
// TODO: we could clear & reuse the previous fallbackTable, instead of allocating a new
// to reduce GC load
primaryTable = new PagedGrowableHash(node, Math.max(16, primaryTable.entries.size()));
} else if (primaryTable.count > primaryTable.entries.size() * (2f / 3)) {
// rehash at 2/3 occupancy
primaryTable.rehash(node);
}
return node;
} else if (nodesEqual(nodeIn, node)) {
// same node (in frozen form) is already in primary table
return node;
}
// quadratic probe (but is it, really?)
pos = (pos + (++c)) & primaryTable.mask;
}
}
// hash code for an unfrozen node. This must be identical
// to the frozen case (below)!!
private long hash(FSTCompiler.UnCompiledNode<T> node) {
final int PRIME = 31;
// System.out.println("hash unfrozen");
long h = 0;
// TODO: maybe if number of arcs is high we can safely subsample?
for (int arcIdx = 0; arcIdx < node.numArcs; arcIdx++) {
final FSTCompiler.Arc<T> arc = node.arcs[arcIdx];
// System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode)
// arc.target).node + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + "
// isFinal?=" + arc.isFinal);
h = PRIME * h + arc.label;
long n = ((FSTCompiler.CompiledNode) arc.target).node;
h = PRIME * h + (int) (n ^ (n >> 32));
@ -99,20 +183,18 @@ final class NodeHash<T> {
h += 17;
}
}
// System.out.println(" ret " + (h&Integer.MAX_VALUE));
return h & Long.MAX_VALUE;
return h;
}
// hash code for a frozen node
// hash code for a frozen node. this must precisely match the hash computation of an unfrozen
// node!
private long hash(long node) throws IOException {
final int PRIME = 31;
// System.out.println("hash frozen node=" + node);
long h = 0;
fst.readFirstRealTargetArc(node, scratchArc, in);
while (true) {
// System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" +
// h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" +
// scratchArc.flag(4) + " final?=" + scratchArc.isFinal() + " pos=" + in.getPosition());
h = PRIME * h + scratchArc.label();
h = PRIME * h + (int) (scratchArc.target() ^ (scratchArc.target() >> 32));
h = PRIME * h + scratchArc.output().hashCode();
@ -125,70 +207,129 @@ final class NodeHash<T> {
}
fst.readNextRealArc(scratchArc, in);
}
// System.out.println(" ret " + (h&Integer.MAX_VALUE));
return h & Long.MAX_VALUE;
return h;
}
public long add(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn)
throws IOException {
// System.out.println("hash: add count=" + count + " vs " + table.size() + " mask=" + mask);
final long h = hash(nodeIn);
long pos = h & mask;
int c = 0;
while (true) {
final long v = table.get(pos);
if (v == 0) {
// freeze & add
final long node = fstCompiler.addNode(nodeIn);
// System.out.println(" now freeze node=" + node);
assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h;
count++;
table.set(pos, node);
// Rehash at 2/3 occupancy:
if (count > 2 * table.size() / 3) {
rehash(node);
/**
* Compares an unfrozen node (UnCompiledNode) with a frozen node at byte location address (long),
* returning true if they are equal.
*/
private boolean nodesEqual(FSTCompiler.UnCompiledNode<T> node, long address) throws IOException {
fst.readFirstRealTargetArc(address, scratchArc, in);
// fail fast for a node with fixed length arcs
if (scratchArc.bytesPerArc() != 0) {
assert node.numArcs > 0;
// the frozen node uses fixed-width arc encoding (same number of bytes per arc), but may be
// sparse or dense
switch (scratchArc.nodeFlags()) {
case FST.ARCS_FOR_BINARY_SEARCH:
// sparse
if (node.numArcs != scratchArc.numArcs()) {
return false;
}
break;
case FST.ARCS_FOR_DIRECT_ADDRESSING:
// dense -- compare both the number of labels allocated in the array (some of which may
// not actually be arcs), and the number of arcs
if ((node.arcs[node.numArcs - 1].label - node.arcs[0].label + 1) != scratchArc.numArcs()
|| node.numArcs != FST.Arc.BitTable.countBits(scratchArc, in)) {
return false;
}
break;
default:
throw new AssertionError("unhandled scratchArc.nodeFlag() " + scratchArc.nodeFlags());
}
}
// compare arc by arc to see if there is a difference
for (int arcUpto = 0; arcUpto < node.numArcs; arcUpto++) {
final FSTCompiler.Arc<T> arc = node.arcs[arcUpto];
if (arc.label != scratchArc.label()
|| arc.output.equals(scratchArc.output()) == false
|| ((FSTCompiler.CompiledNode) arc.target).node != scratchArc.target()
|| arc.nextFinalOutput.equals(scratchArc.nextFinalOutput()) == false
|| arc.isFinal != scratchArc.isFinal()) {
return false;
}
if (scratchArc.isLast()) {
if (arcUpto == node.numArcs - 1) {
return true;
} else {
return false;
}
return node;
} else if (nodesEqual(nodeIn, v)) {
// same node is already here
return v;
}
// quadratic probe
pos = (pos + (++c)) & mask;
fst.readNextRealArc(scratchArc, in);
}
// unfrozen node has fewer arcs than frozen node
return false;
}
// called only by rehash
private void addNew(long address) throws IOException {
long pos = hash(address) & mask;
int c = 0;
while (true) {
if (table.get(pos) == 0) {
table.set(pos, address);
break;
}
/** Inner class because it needs access to hash function and FST bytes. */
private class PagedGrowableHash {
private PagedGrowableWriter entries;
private long count;
private long mask;
// quadratic probe
pos = (pos + (++c)) & mask;
// 256K blocks, but note that the final block is sized only as needed so it won't use the full
// block size when just a few elements were written to it
private static final int BLOCK_SIZE_BYTES = 1 << 18;
public PagedGrowableHash() {
entries = new PagedGrowableWriter(16, BLOCK_SIZE_BYTES, 8, PackedInts.COMPACT);
mask = 15;
}
}
private void rehash(long lastNodeAddress) throws IOException {
final PagedGrowableWriter oldTable = table;
public PagedGrowableHash(long lastNodeAddress, long size) {
entries =
new PagedGrowableWriter(
size, BLOCK_SIZE_BYTES, PackedInts.bitsRequired(lastNodeAddress), PackedInts.COMPACT);
mask = size - 1;
assert (mask & size) == 0 : "size must be a power-of-2; got size=" + size + " mask=" + mask;
}
table =
new PagedGrowableWriter(
2 * oldTable.size(),
1 << 27,
PackedInts.bitsRequired(lastNodeAddress),
PackedInts.COMPACT);
mask = table.size() - 1;
for (long idx = 0; idx < oldTable.size(); idx++) {
final long address = oldTable.get(idx);
if (address != 0) {
addNew(address);
public long get(long index) {
return entries.get(index);
}
public void set(long index, long pointer) throws IOException {
entries.set(index, pointer);
count++;
}
private void rehash(long lastNodeAddress) throws IOException {
// double hash table size on each rehash
PagedGrowableWriter newEntries =
new PagedGrowableWriter(
2 * entries.size(),
BLOCK_SIZE_BYTES,
PackedInts.bitsRequired(lastNodeAddress),
PackedInts.COMPACT);
long newMask = newEntries.size() - 1;
for (long idx = 0; idx < entries.size(); idx++) {
long address = entries.get(idx);
if (address != 0) {
long pos = hash(address) & newMask;
int c = 0;
while (true) {
if (newEntries.get(pos) == 0) {
newEntries.set(pos, address);
break;
}
// quadratic probe
pos = (pos + (++c)) & newMask;
}
}
}
mask = newMask;
entries = newEntries;
}
}
}
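
The double-barrel LRU scheme described in the comments above can be summarized with a simplified
standalone sketch (an ordinary HashSet stands in for the packed-ints table; the real NodeHash
compares an approximate RAM estimate, count * bitsRequired / 8, against ramLimitBytes / 4 rather
than counting entries):

import java.util.HashSet;
import java.util.Set;

// Simplified sketch of NodeHash's double-barrel LRU idea: entries go into a
// primary set until it holds roughly half the budget, then the primary
// becomes the read-only fallback and a fresh primary is started. Entries
// seen again while in fallback are promoted back to primary, giving
// approximate LRU eviction with very little bookkeeping.
class DoubleBarrelLRUSet<E> {
  private Set<E> primary = new HashSet<>();
  private Set<E> fallback = new HashSet<>();
  private final int halfBudget;

  DoubleBarrelLRUSet(int budget) {
    this.halfBudget = budget / 2;
  }

  /** Returns true if e was already present in primary or fallback. */
  boolean addIfAbsent(E e) {
    if (primary.contains(e)) {
      return true;
    }
    boolean seen = fallback.contains(e);
    // new entries and fallback hits both land in primary; fallback itself is
    // never written, so the least recently used half is dropped at each swap
    primary.add(e);
    if (primary.size() >= halfBudget) {
      fallback = primary;
      primary = new HashSet<>();
    }
    return seen;
  }
}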

View File

@ -25,12 +25,20 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks;
import org.apache.lucene.tests.util.TimeUnits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.junit.Ignore;
// TODO: soon we will be able to run this test with small heaps! Once #12633 and #12543
// are done
//
// Run something like this:
// ./gradlew test --tests Test2BFST -Dtests.heapsize=32g -Dtests.verbose=true --max-workers=1
@Ignore("Requires tons of heap to run (30 GB hits OOME but 35 GB passes after ~4.5 hours)")
@SuppressSysoutChecks(bugUrl = "test prints helpful progress reports with time")
@TimeoutSuite(millis = 100 * TimeUnits.HOUR)
public class Test2BFST extends LuceneTestCase {

View File

@ -82,6 +82,7 @@ import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.Util.Result;
import org.junit.Ignore;
@SuppressCodecs({"SimpleText", "Direct"})
public class TestFSTs extends LuceneTestCase {
@ -141,7 +142,7 @@ public class TestFSTs extends LuceneTestCase {
pairs.add(new FSTTester.InputOutput<>(term, NO_OUTPUT));
}
FSTTester<Object> tester = new FSTTester<>(random(), dir, inputMode, pairs, outputs);
FST<Object> fst = tester.doTest(0, 0, false);
FST<Object> fst = tester.doTest();
assertNotNull(fst);
assertEquals(22, tester.nodeCount);
assertEquals(27, tester.arcCount);
@ -155,7 +156,7 @@ public class TestFSTs extends LuceneTestCase {
pairs.add(new FSTTester.InputOutput<>(terms2[idx], (long) idx));
}
FSTTester<Long> tester = new FSTTester<>(random(), dir, inputMode, pairs, outputs);
final FST<Long> fst = tester.doTest(0, 0, false);
final FST<Long> fst = tester.doTest();
assertNotNull(fst);
assertEquals(22, tester.nodeCount);
assertEquals(27, tester.arcCount);
@ -170,7 +171,7 @@ public class TestFSTs extends LuceneTestCase {
pairs.add(new FSTTester.InputOutput<>(terms2[idx], output));
}
FSTTester<BytesRef> tester = new FSTTester<>(random(), dir, inputMode, pairs, outputs);
final FST<BytesRef> fst = tester.doTest(0, 0, false);
final FST<BytesRef> fst = tester.doTest();
assertNotNull(fst);
assertEquals(24, tester.nodeCount);
assertEquals(30, tester.arcCount);
@ -190,7 +191,7 @@ public class TestFSTs extends LuceneTestCase {
for (IntsRef term : terms) {
pairs.add(new FSTTester.InputOutput<>(term, NO_OUTPUT));
}
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest(true);
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest();
}
// PositiveIntOutput (ord)
@ -200,7 +201,7 @@ public class TestFSTs extends LuceneTestCase {
for (int idx = 0; idx < terms.length; idx++) {
pairs.add(new FSTTester.InputOutput<>(terms[idx], (long) idx));
}
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest(true);
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest();
}
// PositiveIntOutput (random monotonically increasing positive number)
@ -213,7 +214,7 @@ public class TestFSTs extends LuceneTestCase {
lastOutput = value;
pairs.add(new FSTTester.InputOutput<>(term, value));
}
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest(true);
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest();
}
// PositiveIntOutput (random positive number)
@ -224,7 +225,7 @@ public class TestFSTs extends LuceneTestCase {
pairs.add(
new FSTTester.InputOutput<>(term, TestUtil.nextLong(random(), 0, Long.MAX_VALUE)));
}
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest(true);
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest();
}
// Pair<ord, (random monotonically increasing positive number>
@ -240,7 +241,7 @@ public class TestFSTs extends LuceneTestCase {
lastOutput = value;
pairs.add(new FSTTester.InputOutput<>(terms[idx], outputs.newPair((long) idx, value)));
}
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest(true);
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest();
}
// Sequence-of-bytes
@ -253,7 +254,7 @@ public class TestFSTs extends LuceneTestCase {
random().nextInt(30) == 17 ? NO_OUTPUT : newBytesRef(Integer.toString(idx));
pairs.add(new FSTTester.InputOutput<>(terms[idx], output));
}
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest(true);
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest();
}
// Sequence-of-ints
@ -269,7 +270,7 @@ public class TestFSTs extends LuceneTestCase {
}
pairs.add(new FSTTester.InputOutput<>(terms[idx], output));
}
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest(true);
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest();
}
}
@ -336,7 +337,22 @@ public class TestFSTs extends LuceneTestCase {
writer.close();
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
FSTCompiler.Builder<Long> builder = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
double suffixRAMLimitMB;
if (random().nextInt(10) == 4) {
// no suffix sharing
suffixRAMLimitMB = 0;
} else if (random().nextInt(10) == 7) {
// share all suffixes (minimal FST)
suffixRAMLimitMB = Double.POSITIVE_INFINITY;
} else {
suffixRAMLimitMB = (random().nextDouble() + 0.01) * 10.0;
}
builder.suffixRAMLimitMB(suffixRAMLimitMB);
FSTCompiler<Long> fstCompiler = builder.build();
boolean storeOrd = random().nextBoolean();
if (VERBOSE) {
@ -505,12 +521,7 @@ public class TestFSTs extends LuceneTestCase {
private final FSTCompiler<T> fstCompiler;
public VisitTerms(
Path dirOut,
Path wordsFileIn,
int inputMode,
int prune,
Outputs<T> outputs,
boolean noArcArrays) {
Path dirOut, Path wordsFileIn, int inputMode, Outputs<T> outputs, boolean noArcArrays) {
this.dirOut = dirOut;
this.wordsFileIn = wordsFileIn;
this.inputMode = inputMode;
@ -519,8 +530,6 @@ public class TestFSTs extends LuceneTestCase {
fstCompiler =
new FSTCompiler.Builder<>(
inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, outputs)
.minSuffixCount2(prune)
.shouldShareSuffix(prune == 0)
.allowFixedLengthArcs(!noArcArrays)
.build();
}
@ -564,10 +573,6 @@ public class TestFSTs extends LuceneTestCase {
long tEnd = System.nanoTime();
System.out.println(
((tEnd - tMid) / (double) TimeUnit.SECONDS.toNanos(1)) + " sec to finish/pack");
if (fst == null) {
System.out.println("FST was fully pruned!");
System.exit(0);
}
if (dirOut == null) {
return;
@ -661,7 +666,6 @@ public class TestFSTs extends LuceneTestCase {
// java -cp
// ../build/codecs/classes/java:../test-framework/lib/randomizedtesting-runner-*.jar:../build/core/classes/test:../build/core/classes/test-framework:../build/core/classes/java:../build/test-framework/classes/java:../test-framework/lib/junit-4.10.jar org.apache.lucene.util.fst.TestFSTs /xold/tmp/allTerms3.txt out
public static void main(String[] args) throws IOException {
int prune = 0;
int limit = Integer.MAX_VALUE;
int inputMode = 0; // utf8
boolean storeOrds = false;
@ -673,10 +677,7 @@ public class TestFSTs extends LuceneTestCase {
int idx = 0;
while (idx < args.length) {
if (args[idx].equals("-prune")) {
prune = Integer.parseInt(args[1 + idx]);
idx++;
} else if (args[idx].equals("-limit")) {
if (args[idx].equals("-limit")) {
limit = Integer.parseInt(args[1 + idx]);
idx++;
} else if (args[idx].equals("-utf8")) {
@ -720,7 +721,7 @@ public class TestFSTs extends LuceneTestCase {
final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton();
final PairOutputs<Long, Long> outputs = new PairOutputs<>(o1, o2);
new VisitTerms<PairOutputs.Pair<Long, Long>>(
dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
dirOut, wordsFileIn, inputMode, outputs, noArcArrays) {
Random rand;
@Override
@ -734,7 +735,7 @@ public class TestFSTs extends LuceneTestCase {
} else if (storeOrds) {
// Store only ords
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, outputs, noArcArrays) {
@Override
public Long getOutput(IntsRef input, int ord) {
return (long) ord;
@ -743,7 +744,7 @@ public class TestFSTs extends LuceneTestCase {
} else if (storeDocFreqs) {
// Store only docFreq
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, outputs, noArcArrays) {
Random rand;
@Override
@ -758,7 +759,7 @@ public class TestFSTs extends LuceneTestCase {
// Store nothing
final NoOutputs outputs = NoOutputs.getSingleton();
final Object NO_OUTPUT = outputs.getNoOutput();
new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) {
new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, outputs, noArcArrays) {
@Override
public Object getOutput(IntsRef input, int ord) {
return NO_OUTPUT;
@ -1160,19 +1161,20 @@ public class TestFSTs extends LuceneTestCase {
s.verifyStateAndBelow(fst, arc, 1);
}
@Ignore("not sure it's possible to get a final state output anymore w/o pruning?")
public void testFinalOutputOnEndState() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
final FSTCompiler<Long> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).minSuffixCount1(2).build();
fstCompiler.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L);
fstCompiler.add(Util.toUTF32("station", new IntsRefBuilder()), 10L);
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE4, outputs).build();
fstCompiler.add(Util.toUTF32("slat", new IntsRefBuilder()), 10L);
fstCompiler.add(Util.toUTF32("st", new IntsRefBuilder()), 17L);
final FST<Long> fst = fstCompiler.compile();
// Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot"));
StringWriter w = new StringWriter();
Util.toDot(fst, w, false, false);
w.close();
// System.out.println(w.toString());
System.out.println(w.toString());
assertTrue(w.toString().contains("label=\"t/[7]\""));
}

View File

@ -121,7 +121,7 @@ public class TestFSTsMisc extends LuceneTestCase {
}
return output1.equals(output2);
}
}.doTest(false);
}.doTest();
}
// ListOfOutputs(PositiveIntOutputs), generally but not
@ -157,7 +157,7 @@ public class TestFSTsMisc extends LuceneTestCase {
pairs.add(new FSTTester.InputOutput<>(terms[idx], output));
}
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest(false);
new FSTTester<>(random(), dir, inputMode, pairs, outputs).doTest();
}
}

View File

@ -384,9 +384,7 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer {
}
final FSTCompiler<Pair<BytesRef, Long>> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS)
.shouldShareNonSingletonNodes(false)
.build();
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, FST_OUTPUTS).build();
// if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);
// }

View File

@ -121,8 +121,8 @@ public class FSTCompletionBuilder {
/** Scratch buffer for {@link #add(BytesRef, int)}. */
private final BytesRefBuilder scratch = new BytesRefBuilder();
/** Max tail sharing length. */
private final int shareMaxTailLength;
/** Max tail sharing RAM (MB). */
private final double suffixRAMLimitMB;
/**
* Creates an {@link FSTCompletion} with default options: 10 buckets, exact match promoted to
@ -130,7 +130,7 @@ public class FSTCompletionBuilder {
* Comparator#naturalOrder()}.
*/
public FSTCompletionBuilder() {
this(DEFAULT_BUCKETS, new InMemorySorter(Comparator.naturalOrder()), Integer.MAX_VALUE);
this(DEFAULT_BUCKETS, new InMemorySorter(Comparator.naturalOrder()), Double.POSITIVE_INFINITY);
}
/**
@ -141,13 +141,13 @@ public class FSTCompletionBuilder {
* @param sorter {@link BytesRefSorter} used for re-sorting input for the automaton. For large
* inputs, use on-disk sorting implementations. The sorter is closed automatically in {@link
* #build()} if it implements {@link Closeable}.
* @param shareMaxTailLength Max shared suffix sharing length.
* @param suffixRAMLimitMB Max shared suffix RAM size (MB).
* <p>See the description of this parameter in {@link
* org.apache.lucene.util.fst.FSTCompiler.Builder}. In general, for very large inputs you'll
* want to construct a non-minimal automaton which will be larger, but the construction will
* take far less RAM. For minimal automata, set it to {@link Double#POSITIVE_INFINITY}.
* take far less ram. For minimal automata, set it to {@link Double#MAX_VALUE}.
*/
public FSTCompletionBuilder(int buckets, BytesRefSorter sorter, int shareMaxTailLength) {
public FSTCompletionBuilder(int buckets, BytesRefSorter sorter, double suffixRAMLimitMB) {
if (buckets < 1 || buckets > 255) {
throw new IllegalArgumentException("Buckets must be >= 1 and <= 255: " + buckets);
}
@ -158,7 +158,7 @@ public class FSTCompletionBuilder {
this.sorter = sorter;
this.buckets = buckets;
this.shareMaxTailLength = shareMaxTailLength;
this.suffixRAMLimitMB = suffixRAMLimitMB;
}
/**
@ -204,7 +204,7 @@ public class FSTCompletionBuilder {
final Object empty = outputs.getNoOutput();
final FSTCompiler<Object> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
.shareMaxTailLength(shareMaxTailLength)
.suffixRAMLimitMB(suffixRAMLimitMB)
.build();
BytesRefBuilder scratch = new BytesRefBuilder();
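
A hedged usage sketch of the updated constructor (bucket count, weights, and the lookup call are
illustrative):

import java.io.IOException;
import java.util.Comparator;
import org.apache.lucene.search.suggest.InMemorySorter;
import org.apache.lucene.search.suggest.fst.FSTCompletion;
import org.apache.lucene.search.suggest.fst.FSTCompletionBuilder;
import org.apache.lucene.util.BytesRef;

public class SuggesterDemo {
  public static void main(String[] args) throws IOException {
    // 10 buckets, in-memory sorting, and a ~4 MB cap on the suffix hash;
    // pass Double.POSITIVE_INFINITY instead for a fully minimal automaton
    FSTCompletionBuilder builder =
        new FSTCompletionBuilder(10, new InMemorySorter(Comparator.naturalOrder()), 4.0);
    builder.add(new BytesRef("lucene"), 5);
    builder.add(new BytesRef("lucid"), 3);
    FSTCompletion completion = builder.build();
    System.out.println(completion.lookup("luc", 2));
  }
}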

View File

@ -29,7 +29,6 @@ import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
@ -180,19 +179,6 @@ public class FSTTester<T> {
}
}
public void doTest(boolean testPruning) throws IOException {
// no pruning
doTest(0, 0, true);
if (testPruning) {
// simple pruning
doTest(TestUtil.nextInt(random, 1, 1 + pairs.size()), 0, true);
// leafy pruning
doTest(0, TestUtil.nextInt(random, 1, 1 + pairs.size()), true);
}
}
// runs the term, returning the output, or null if term
// isn't accepted. if prefixLength is non-null it must be
// length 1 int array; prefixLength[0] is set to the length
@ -267,21 +253,11 @@ public class FSTTester<T> {
return output;
}
public FST<T> doTest(int prune1, int prune2, boolean allowRandomSuffixSharing)
throws IOException {
if (LuceneTestCase.VERBOSE) {
System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2);
}
public FST<T> doTest() throws IOException {
final FSTCompiler<T> fstCompiler =
new FSTCompiler.Builder<>(
inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, outputs)
.minSuffixCount1(prune1)
.minSuffixCount2(prune2)
.shouldShareSuffix(prune1 == 0 && prune2 == 0)
.shouldShareNonSingletonNodes(allowRandomSuffixSharing ? random.nextBoolean() : true)
.shareMaxTailLength(
allowRandomSuffixSharing ? TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE)
.build();
for (InputOutput<T> pair : pairs) {
@ -332,11 +308,7 @@ public class FSTTester<T> {
}
}
if (prune1 == 0 && prune2 == 0) {
verifyUnPruned(inputMode, fst);
} else {
verifyPruned(inputMode, fst, prune1, prune2);
}
verifyUnPruned(inputMode, fst);
nodeCount = fstCompiler.getNodeCount();
arcCount = fstCompiler.getArcCount();
@ -646,207 +618,4 @@ public class FSTTester<T> {
}
}
}
private static class CountMinOutput<T> {
int count;
T output;
T finalOutput;
boolean isLeaf = true;
boolean isFinal;
}
// FST is pruned
private void verifyPruned(int inputMode, FST<T> fst, int prune1, int prune2) throws IOException {
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: now verify pruned " + pairs.size() + " terms; outputs=" + outputs);
for (InputOutput<T> pair : pairs) {
System.out.println(
" "
+ inputToString(inputMode, pair.input)
+ ": "
+ outputs.outputToString(pair.output));
}
}
// To validate the FST, we brute-force compute all prefixes
// in the terms, matched to their "common" outputs, prune that
// set according to the prune thresholds, then assert the FST
// matches that same set.
// NOTE: Crazy RAM intensive!!
// System.out.println("TEST: tally prefixes");
// build all prefixes
final Map<IntsRef, CountMinOutput<T>> prefixes = new HashMap<>();
final IntsRefBuilder scratch = new IntsRefBuilder();
for (InputOutput<T> pair : pairs) {
scratch.copyInts(pair.input);
for (int idx = 0; idx <= pair.input.length; idx++) {
scratch.setLength(idx);
CountMinOutput<T> cmo = prefixes.get(scratch.get());
if (cmo == null) {
cmo = new CountMinOutput<>();
cmo.count = 1;
cmo.output = pair.output;
prefixes.put(scratch.toIntsRef(), cmo);
} else {
cmo.count++;
T output1 = cmo.output;
if (output1.equals(outputs.getNoOutput())) {
output1 = outputs.getNoOutput();
}
T output2 = pair.output;
if (output2.equals(outputs.getNoOutput())) {
output2 = outputs.getNoOutput();
}
cmo.output = outputs.common(output1, output2);
}
if (idx == pair.input.length) {
cmo.isFinal = true;
cmo.finalOutput = cmo.output;
}
}
}
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: now prune");
}
// prune 'em
final Iterator<Map.Entry<IntsRef, CountMinOutput<T>>> it = prefixes.entrySet().iterator();
while (it.hasNext()) {
Map.Entry<IntsRef, CountMinOutput<T>> ent = it.next();
final IntsRef prefix = ent.getKey();
final CountMinOutput<T> cmo = ent.getValue();
if (LuceneTestCase.VERBOSE) {
System.out.println(
" term prefix="
+ inputToString(inputMode, prefix, false)
+ " count="
+ cmo.count
+ " isLeaf="
+ cmo.isLeaf
+ " output="
+ outputs.outputToString(cmo.output)
+ " isFinal="
+ cmo.isFinal);
}
final boolean keep;
if (prune1 > 0) {
keep = cmo.count >= prune1;
} else {
assert prune2 > 0;
if (prune2 > 1 && cmo.count >= prune2) {
keep = true;
} else if (prefix.length > 0) {
// consult our parent
scratch.setLength(prefix.length - 1);
System.arraycopy(prefix.ints, prefix.offset, scratch.ints(), 0, scratch.length());
final CountMinOutput<T> cmo2 = prefixes.get(scratch.get());
// System.out.println(" parent count = " + (cmo2 == null ? -1 : cmo2.count));
keep =
cmo2 != null
&& ((prune2 > 1 && cmo2.count >= prune2)
|| (prune2 == 1 && (cmo2.count >= 2 || prefix.length <= 1)));
} else {
keep = cmo.count >= prune2;
}
}
if (!keep) {
it.remove();
// System.out.println(" remove");
} else {
// clear isLeaf for all ancestors
// System.out.println(" keep");
scratch.copyInts(prefix);
scratch.setLength(scratch.length() - 1);
while (scratch.length() >= 0) {
final CountMinOutput<T> cmo2 = prefixes.get(scratch.get());
if (cmo2 != null) {
// System.out.println(" clear isLeaf " + inputToString(inputMode, scratch));
cmo2.isLeaf = false;
}
scratch.setLength(scratch.length() - 1);
}
}
}
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: after prune");
for (Map.Entry<IntsRef, CountMinOutput<T>> ent : prefixes.entrySet()) {
System.out.println(
" "
+ inputToString(inputMode, ent.getKey(), false)
+ ": isLeaf="
+ ent.getValue().isLeaf
+ " isFinal="
+ ent.getValue().isFinal);
if (ent.getValue().isFinal) {
System.out.println(
" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput));
}
}
}
if (prefixes.size() <= 1) {
assertNull(fst);
return;
}
assertNotNull(fst);
// make sure FST only enums valid prefixes
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: check pruned enum");
}
IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<>(fst);
IntsRefFSTEnum.InputOutput<T> current;
while ((current = fstEnum.next()) != null) {
if (LuceneTestCase.VERBOSE) {
System.out.println(
" fstEnum.next prefix="
+ inputToString(inputMode, current.input, false)
+ " output="
+ outputs.outputToString(current.output));
}
final CountMinOutput<T> cmo = prefixes.get(current.input);
assertNotNull(cmo);
assertTrue(cmo.isLeaf || cmo.isFinal);
// if (cmo.isFinal && !cmo.isLeaf) {
if (cmo.isFinal) {
assertEquals(cmo.finalOutput, current.output);
} else {
assertEquals(cmo.output, current.output);
}
}
// make sure all non-pruned prefixes are present in the FST
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: verify all prefixes");
}
final int[] stopNode = new int[1];
for (Map.Entry<IntsRef, CountMinOutput<T>> ent : prefixes.entrySet()) {
if (ent.getKey().length > 0) {
final CountMinOutput<T> cmo = ent.getValue();
final T output = run(fst, ent.getKey(), stopNode);
if (LuceneTestCase.VERBOSE) {
System.out.println(
"TEST: verify prefix="
+ inputToString(inputMode, ent.getKey(), false)
+ " output="
+ outputs.outputToString(cmo.output));
}
// if (cmo.isFinal && !cmo.isLeaf) {
if (cmo.isFinal) {
assertEquals(cmo.finalOutput, output);
} else {
assertEquals(cmo.output, output);
}
assertEquals(ent.getKey().length, stopNode[0]);
}
}
}
}