LUCENE-3289: add options to FST Builder to tradeoff RAM/CPU used during build vs how small the resulting FST is

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1145292 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2011-07-11 18:53:13 +00:00
parent 94a47f4415
commit fbf9f4ccad
10 changed files with 74 additions and 43 deletions

View File

@ -532,6 +532,11 @@ Optimizations
directly if possible and merges separately written files on the fly instead
of during close. (Simon Willnauer, Robert Muir)
* LUCENE-3289: When building an FST you can now tune how aggressively
the FST should try to share common suffixes. Typically you can
greatly reduce RAM required during building, and CPU consumed, at
the cost of a somewhat larger FST. (Mike McCandless)
======================= Lucene 3.3.0 =======================
Changes in backwards compatibility policy

View File

@ -190,7 +190,7 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase {
if (indexDivisor > 1) {
// subsample
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst);
BytesRefFSTEnum.InputOutput<Long> result;
int count = indexDivisor;

View File

@ -222,9 +222,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
this.fieldInfo = fieldInfo;
fstOutputs = PositiveIntOutputs.getSingleton(true);
fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE1,
0, 0, true,
fstOutputs);
fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, fstOutputs);
indexStart = out.getFilePointer();
////System.out.println("VGW: field=" + fieldInfo.name);

View File

@ -478,9 +478,6 @@ class SimpleTextFieldsReader extends FieldsProducer {
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1,
0,
0,
true,
new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs,
new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs)));
IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();

View File

@ -62,6 +62,9 @@ public class Builder<T> {
// terms go through it:
private final int minSuffixCount2;
private final boolean doShareNonSingletonNodes;
private final int shareMaxTailLength;
private final IntsRef lastInput = new IntsRef();
// NOTE: cutting this over to ArrayList instead loses ~6%
@ -72,12 +75,11 @@ public class Builder<T> {
/**
* Instantiates an FST/FSA builder without any pruning. A shortcut
* to {@link #Builder(FST.INPUT_TYPE, int, int, boolean, Outputs)} with
* to {@link #Builder(FST.INPUT_TYPE, int, int, boolean, boolean, int, Outputs)} with
* pruning options turned off.
*/
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs)
{
this(inputType, 0, 0, true, outputs);
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs);
}
/**
@ -97,20 +99,34 @@ public class Builder<T> {
* @param minSuffixCount2
* (Note: only Mike McCandless knows what this one is really doing...)
*
* @param doMinSuffix
* @param doShareSuffix
* If <code>true</code>, the shared suffixes will be compacted into unique paths.
* This requires an additional hash map for lookups in memory. Setting this parameter to
* <code>false</code> creates a single path for all input sequences. This will result in a larger
* graph, but may require less memory and will speed up construction.
*
* @param doShareNonSingletonNodes
* Only used if doShareSuffix is true. Set this to
* true to ensure FST is fully minimal, at cost of more
* CPU and more RAM during building.
*
* @param shareMaxTailLength
* Only used if doShareSuffix is true. Set this to
* Integer.MAX_VALUE to ensure FST is fully minimal, at cost of more
* CPU and more RAM during building.
*
* @param outputs The output type for each input sequence. Applies only if building an FST. For
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
* singleton output object.
*/
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doMinSuffix, Outputs<T> outputs) {
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs) {
this.minSuffixCount1 = minSuffixCount1;
this.minSuffixCount2 = minSuffixCount2;
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
this.shareMaxTailLength = shareMaxTailLength;
fst = new FST<T>(inputType, outputs);
if (doMinSuffix) {
if (doShareSuffix) {
dedupHash = new NodeHash<T>(fst);
} else {
dedupHash = null;
@ -143,10 +159,9 @@ public class Builder<T> {
fst.setAllowArrayArcs(b);
}
private CompiledNode compileNode(UnCompiledNode<T> n) throws IOException {
private CompiledNode compileNode(UnCompiledNode<T> n, int tailLength) throws IOException {
final int address;
if (dedupHash != null) {
if (dedupHash != null && (doShareNonSingletonNodes || n.numArcs <= 1) && tailLength <= shareMaxTailLength) {
if (n.numArcs == 0) {
address = fst.addNode(n);
} else {
@ -221,7 +236,7 @@ public class Builder<T> {
} else {
if (minSuffixCount2 != 0) {
compileAllTargets(node);
compileAllTargets(node, lastInput.length-idx);
}
final T nextFinalOutput = node.output;
@ -237,7 +252,7 @@ public class Builder<T> {
// compile any targets that were previously
// undecided:
parent.replaceLast(lastInput.ints[lastInput.offset + idx-1],
compileNode(node),
compileNode(node, 1+lastInput.length-idx),
nextFinalOutput,
isFinal);
} else {
@ -428,22 +443,28 @@ public class Builder<T> {
// empty string got pruned
return null;
} else {
fst.finish(compileNode(frontier[0]).address);
fst.finish(compileNode(frontier[0], lastInput.length).address);
//System.out.println("compile addr = " + fst.getStartNode());
return fst;
}
} else {
if (minSuffixCount2 != 0) {
compileAllTargets(frontier[0]);
compileAllTargets(frontier[0], lastInput.length);
}
//System.out.println("NOW: " + frontier[0].numArcs);
fst.finish(compileNode(frontier[0]).address);
fst.finish(compileNode(frontier[0], lastInput.length).address);
}
/*
if (dedupHash != null) {
System.out.println("NH: " + dedupHash.count());
}
*/
return fst;
}
private void compileAllTargets(UnCompiledNode<T> node) throws IOException {
private void compileAllTargets(UnCompiledNode<T> node, int tailLength) throws IOException {
for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
final Arc<T> arc = node.arcs[arcIdx];
if (!arc.target.isCompiled()) {
@ -453,7 +474,7 @@ public class Builder<T> {
//System.out.println("seg=" + segment + " FORCE final arc=" + (char) arc.label);
arc.isFinal = n.isFinal = true;
}
arc.target = compileNode(n);
arc.target = compileNode(n, tailLength-1);
}
}
}

View File

@ -25,8 +25,12 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.fst.Builder.UnCompiledNode;
// TODO: if FST is pure prefix trie we can do a more compact
// job, ie, once we are at a 'suffix only', just store the
// completion labels as a string not as a series of arcs.
// NOTE: while the FST is able to represent a non-final
// dead-end state (NON_FINAL_END_NODE=0), the layres above
// dead-end state (NON_FINAL_END_NODE=0), the layers above
// (FSTEnum, Util) have problems with this!!
/** Represents an FST using a compact byte[] format.

View File

@ -164,4 +164,8 @@ final class NodeHash<T> {
}
}
}
public int count() {
return count;
}
}

View File

@ -150,7 +150,7 @@ public class TestFSTs extends LuceneTestCase {
for(IntsRef term : terms2) {
pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
}
FST<Object> fst = new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest(0, 0);
FST<Object> fst = new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
assertNotNull(fst);
assertEquals(22, fst.getNodeCount());
assertEquals(27, fst.getArcCount());
@ -163,7 +163,7 @@ public class TestFSTs extends LuceneTestCase {
for(int idx=0;idx<terms2.length;idx++) {
pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], outputs.get(idx)));
}
final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest(0, 0);
final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
assertNotNull(fst);
assertEquals(22, fst.getNodeCount());
assertEquals(27, fst.getArcCount());
@ -178,7 +178,7 @@ public class TestFSTs extends LuceneTestCase {
final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
pairs.add(new FSTTester.InputOutput<BytesRef>(terms2[idx], output));
}
final FST<BytesRef> fst = new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest(0, 0);
final FST<BytesRef> fst = new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
assertNotNull(fst);
assertEquals(24, fst.getNodeCount());
assertEquals(30, fst.getArcCount());
@ -359,14 +359,14 @@ public class TestFSTs extends LuceneTestCase {
public void doTest() throws IOException {
// no pruning
doTest(0, 0);
doTest(0, 0, true);
if (!(outputs instanceof UpToTwoPositiveIntOutputs)) {
// simple pruning
doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0);
doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0, true);
// leafy pruning
doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()));
doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()), true);
}
}
@ -446,14 +446,17 @@ public class TestFSTs extends LuceneTestCase {
}
FST<T> doTest(int prune1, int prune2) throws IOException {
FST<T> doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException {
if (VERBOSE) {
System.out.println("TEST: prune1=" + prune1 + " prune2=" + prune2);
}
final Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
prune1, prune2,
prune1==0 && prune2==0, outputs);
prune1==0 && prune2==0,
allowRandomSuffixSharing ? random.nextBoolean() : true,
allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
outputs);
for(InputOutput<T> pair : pairs) {
if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) {
@ -1017,7 +1020,7 @@ public class TestFSTs extends LuceneTestCase {
IndexReader r = IndexReader.open(writer, true);
writer.close();
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
boolean storeOrd = random.nextBoolean();
if (VERBOSE) {
@ -1145,7 +1148,7 @@ public class TestFSTs extends LuceneTestCase {
this.inputMode = inputMode;
this.outputs = outputs;
builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, outputs);
builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs);
}
protected abstract T getOutput(IntsRef input, int ord) throws IOException;
@ -1245,7 +1248,7 @@ public class TestFSTs extends LuceneTestCase {
}
}
// java -cp build/classes/test:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out
// java -cp build/classes/test:build/classes/java:build/classes/test-framework:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out
public static void main(String[] args) throws IOException {
int prune = 0;
int limit = Integer.MAX_VALUE;
@ -1351,7 +1354,7 @@ public class TestFSTs extends LuceneTestCase {
public void testSingleString() throws Exception {
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs);
b.add(new BytesRef("foobar"), outputs.getNoOutput());
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<Object>(b.finish());
assertNull(fstEnum.seekFloor(new BytesRef("foo")));
@ -1368,7 +1371,7 @@ public class TestFSTs extends LuceneTestCase {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
// Build an FST mapping BytesRef -> Long
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
final BytesRef a = new BytesRef("a");
final BytesRef b = new BytesRef("b");
@ -1413,7 +1416,7 @@ public class TestFSTs extends LuceneTestCase {
FST<Object> compile(String[] lines) throws IOException {
final NoOutputs outputs = NoOutputs.getSingleton();
final Object nothing = outputs.getNoOutput();
final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs);
int line = 0;
final BytesRef term = new BytesRef();
@ -1488,7 +1491,7 @@ public class TestFSTs extends LuceneTestCase {
public void testNonFinalStopNodes() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
final Long nothing = outputs.getNoOutput();
final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs);

View File

@ -244,7 +244,7 @@ public class SynonymMap {
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
// TODO: are we using the best sharing options?
org.apache.lucene.util.fst.Builder<BytesRef> builder =
new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs);
new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);
BytesRef scratch = new BytesRef(64);
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

View File

@ -450,8 +450,7 @@ public class FSTLookup extends Lookup {
// Build the automaton.
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Object empty = outputs.getNoOutput();
final Builder<Object> builder =
new Builder<Object>(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs);
final Builder<Object> builder = new Builder<Object>(FST.INPUT_TYPE.BYTE4, outputs);
final IntsRef scratchIntsRef = new IntsRef(10);
for (Entry e : entries) {
final int termLength = scratchIntsRef.length = e.term.length;