LUCENE-3289: add options to FST Builder to tradeoff RAM/CPU used during build vs how small the resulting FST is

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1145292 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2011-07-11 18:53:13 +00:00
parent 94a47f4415
commit fbf9f4ccad
10 changed files with 74 additions and 43 deletions

View File

@ -532,6 +532,11 @@ Optimizations
directly if possible and merges separately written files on the fly instead directly if possible and merges separately written files on the fly instead
of during close. (Simon Willnauer, Robert Muir) of during close. (Simon Willnauer, Robert Muir)
* LUCENE-3289: When building an FST you can now tune how aggressively
the FST should try to share common suffixes. Typically you can
greatly reduce RAM required during building, and CPU consumed, at
the cost of a somewhat larger FST. (Mike McCandless)
======================= Lucene 3.3.0 ======================= ======================= Lucene 3.3.0 =======================
Changes in backwards compatibility policy Changes in backwards compatibility policy

View File

@ -190,7 +190,7 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase {
if (indexDivisor > 1) { if (indexDivisor > 1) {
// subsample // subsample
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst); final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst);
BytesRefFSTEnum.InputOutput<Long> result; BytesRefFSTEnum.InputOutput<Long> result;
int count = indexDivisor; int count = indexDivisor;

View File

@ -222,9 +222,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException { public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
this.fieldInfo = fieldInfo; this.fieldInfo = fieldInfo;
fstOutputs = PositiveIntOutputs.getSingleton(true); fstOutputs = PositiveIntOutputs.getSingleton(true);
fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, fstOutputs);
0, 0, true,
fstOutputs);
indexStart = out.getFilePointer(); indexStart = out.getFilePointer();
////System.out.println("VGW: field=" + fieldInfo.name); ////System.out.println("VGW: field=" + fieldInfo.name);

View File

@ -478,9 +478,6 @@ class SimpleTextFieldsReader extends FieldsProducer {
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false); PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b; final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1, b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1,
0,
0,
true,
new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs, new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs,
new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs))); new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs)));
IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();

View File

@ -62,6 +62,9 @@ public class Builder<T> {
// terms go through it: // terms go through it:
private final int minSuffixCount2; private final int minSuffixCount2;
private final boolean doShareNonSingletonNodes;
private final int shareMaxTailLength;
private final IntsRef lastInput = new IntsRef(); private final IntsRef lastInput = new IntsRef();
// NOTE: cutting this over to ArrayList instead loses ~6% // NOTE: cutting this over to ArrayList instead loses ~6%
@ -72,12 +75,11 @@ public class Builder<T> {
/** /**
* Instantiates an FST/FSA builder without any pruning. A shortcut * Instantiates an FST/FSA builder without any pruning. A shortcut
* to {@link #Builder(FST.INPUT_TYPE, int, int, boolean, Outputs)} with * to {@link #Builder(FST.INPUT_TYPE, int, int, boolean, boolean, int, Outputs)} with
* pruning options turned off. * pruning options turned off.
*/ */
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
{ this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs);
this(inputType, 0, 0, true, outputs);
} }
/** /**
@ -97,20 +99,34 @@ public class Builder<T> {
* @param minSuffixCount2 * @param minSuffixCount2
* (Note: only Mike McCandless knows what this one is really doing...) * (Note: only Mike McCandless knows what this one is really doing...)
* *
* @param doMinSuffix * @param doShareSuffix
* If <code>true</code>, the shared suffixes will be compacted into unique paths. * If <code>true</code>, the shared suffixes will be compacted into unique paths.
* This requires an additional hash map for lookups in memory. Setting this parameter to * This requires an additional hash map for lookups in memory. Setting this parameter to
* <code>false</code> creates a single path for all input sequences. This will result in a larger * <code>false</code> creates a single path for all input sequences. This will result in a larger
* graph, but may require less memory and will speed up construction. * graph, but may require less memory and will speed up construction.
*
* @param doShareNonSingletonNodes
* Only used if doShareSuffix is true. Set this to
* true to ensure FST is fully minimal, at cost of more
* CPU and more RAM during building.
*
* @param shareMaxTailLength
* Only used if doShareSuffix is true. Set this to
* Integer.MAX_VALUE to ensure FST is fully minimal, at cost of more
* CPU and more RAM during building.
*
* @param outputs The output type for each input sequence. Applies only if building an FST. For * @param outputs The output type for each input sequence. Applies only if building an FST. For
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the * FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
* singleton output object. * singleton output object.
*/ */
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doMinSuffix, Outputs<T> outputs) { public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs) {
this.minSuffixCount1 = minSuffixCount1; this.minSuffixCount1 = minSuffixCount1;
this.minSuffixCount2 = minSuffixCount2; this.minSuffixCount2 = minSuffixCount2;
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
this.shareMaxTailLength = shareMaxTailLength;
fst = new FST<T>(inputType, outputs); fst = new FST<T>(inputType, outputs);
if (doMinSuffix) { if (doShareSuffix) {
dedupHash = new NodeHash<T>(fst); dedupHash = new NodeHash<T>(fst);
} else { } else {
dedupHash = null; dedupHash = null;
@ -143,10 +159,9 @@ public class Builder<T> {
fst.setAllowArrayArcs(b); fst.setAllowArrayArcs(b);
} }
private CompiledNode compileNode(UnCompiledNode<T> n) throws IOException { private CompiledNode compileNode(UnCompiledNode<T> n, int tailLength) throws IOException {
final int address; final int address;
if (dedupHash != null) { if (dedupHash != null && (doShareNonSingletonNodes || n.numArcs <= 1) && tailLength <= shareMaxTailLength) {
if (n.numArcs == 0) { if (n.numArcs == 0) {
address = fst.addNode(n); address = fst.addNode(n);
} else { } else {
@ -221,7 +236,7 @@ public class Builder<T> {
} else { } else {
if (minSuffixCount2 != 0) { if (minSuffixCount2 != 0) {
compileAllTargets(node); compileAllTargets(node, lastInput.length-idx);
} }
final T nextFinalOutput = node.output; final T nextFinalOutput = node.output;
@ -237,7 +252,7 @@ public class Builder<T> {
// compile any targets that were previously // compile any targets that were previously
// undecided: // undecided:
parent.replaceLast(lastInput.ints[lastInput.offset + idx-1], parent.replaceLast(lastInput.ints[lastInput.offset + idx-1],
compileNode(node), compileNode(node, 1+lastInput.length-idx),
nextFinalOutput, nextFinalOutput,
isFinal); isFinal);
} else { } else {
@ -428,22 +443,28 @@ public class Builder<T> {
// empty string got pruned // empty string got pruned
return null; return null;
} else { } else {
fst.finish(compileNode(frontier[0]).address); fst.finish(compileNode(frontier[0], lastInput.length).address);
//System.out.println("compile addr = " + fst.getStartNode()); //System.out.println("compile addr = " + fst.getStartNode());
return fst; return fst;
} }
} else { } else {
if (minSuffixCount2 != 0) { if (minSuffixCount2 != 0) {
compileAllTargets(frontier[0]); compileAllTargets(frontier[0], lastInput.length);
} }
//System.out.println("NOW: " + frontier[0].numArcs); //System.out.println("NOW: " + frontier[0].numArcs);
fst.finish(compileNode(frontier[0]).address); fst.finish(compileNode(frontier[0], lastInput.length).address);
} }
/*
if (dedupHash != null) {
System.out.println("NH: " + dedupHash.count());
}
*/
return fst; return fst;
} }
private void compileAllTargets(UnCompiledNode<T> node) throws IOException { private void compileAllTargets(UnCompiledNode<T> node, int tailLength) throws IOException {
for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) { for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
final Arc<T> arc = node.arcs[arcIdx]; final Arc<T> arc = node.arcs[arcIdx];
if (!arc.target.isCompiled()) { if (!arc.target.isCompiled()) {
@ -453,7 +474,7 @@ public class Builder<T> {
//System.out.println("seg=" + segment + " FORCE final arc=" + (char) arc.label); //System.out.println("seg=" + segment + " FORCE final arc=" + (char) arc.label);
arc.isFinal = n.isFinal = true; arc.isFinal = n.isFinal = true;
} }
arc.target = compileNode(n); arc.target = compileNode(n, tailLength-1);
} }
} }
} }

View File

@ -25,8 +25,12 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CodecUtil; import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.fst.Builder.UnCompiledNode; import org.apache.lucene.util.fst.Builder.UnCompiledNode;
// TODO: if FST is pure prefix trie we can do a more compact
// job, ie, once we are at a 'suffix only', just store the
// completion labels as a string not as a series of arcs.
// NOTE: while the FST is able to represent a non-final // NOTE: while the FST is able to represent a non-final
// dead-end state (NON_FINAL_END_NODE=0), the layres above // dead-end state (NON_FINAL_END_NODE=0), the layers above
// (FSTEnum, Util) have problems with this!! // (FSTEnum, Util) have problems with this!!
/** Represents an FST using a compact byte[] format. /** Represents an FST using a compact byte[] format.

View File

@ -164,4 +164,8 @@ final class NodeHash<T> {
} }
} }
} }
public int count() {
return count;
}
} }

View File

@ -150,7 +150,7 @@ public class TestFSTs extends LuceneTestCase {
for(IntsRef term : terms2) { for(IntsRef term : terms2) {
pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT)); pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
} }
FST<Object> fst = new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest(0, 0); FST<Object> fst = new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
assertNotNull(fst); assertNotNull(fst);
assertEquals(22, fst.getNodeCount()); assertEquals(22, fst.getNodeCount());
assertEquals(27, fst.getArcCount()); assertEquals(27, fst.getArcCount());
@ -163,7 +163,7 @@ public class TestFSTs extends LuceneTestCase {
for(int idx=0;idx<terms2.length;idx++) { for(int idx=0;idx<terms2.length;idx++) {
pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], outputs.get(idx))); pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], outputs.get(idx)));
} }
final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest(0, 0); final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
assertNotNull(fst); assertNotNull(fst);
assertEquals(22, fst.getNodeCount()); assertEquals(22, fst.getNodeCount());
assertEquals(27, fst.getArcCount()); assertEquals(27, fst.getArcCount());
@ -178,7 +178,7 @@ public class TestFSTs extends LuceneTestCase {
final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx)); final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
pairs.add(new FSTTester.InputOutput<BytesRef>(terms2[idx], output)); pairs.add(new FSTTester.InputOutput<BytesRef>(terms2[idx], output));
} }
final FST<BytesRef> fst = new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest(0, 0); final FST<BytesRef> fst = new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
assertNotNull(fst); assertNotNull(fst);
assertEquals(24, fst.getNodeCount()); assertEquals(24, fst.getNodeCount());
assertEquals(30, fst.getArcCount()); assertEquals(30, fst.getArcCount());
@ -359,14 +359,14 @@ public class TestFSTs extends LuceneTestCase {
public void doTest() throws IOException { public void doTest() throws IOException {
// no pruning // no pruning
doTest(0, 0); doTest(0, 0, true);
if (!(outputs instanceof UpToTwoPositiveIntOutputs)) { if (!(outputs instanceof UpToTwoPositiveIntOutputs)) {
// simple pruning // simple pruning
doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0); doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0, true);
// leafy pruning // leafy pruning
doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size())); doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()), true);
} }
} }
@ -446,14 +446,17 @@ public class TestFSTs extends LuceneTestCase {
} }
FST<T> doTest(int prune1, int prune2) throws IOException { FST<T> doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException {
if (VERBOSE) { if (VERBOSE) {
System.out.println("TEST: prune1=" + prune1 + " prune2=" + prune2); System.out.println("TEST: prune1=" + prune1 + " prune2=" + prune2);
} }
final Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, final Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
prune1, prune2, prune1, prune2,
prune1==0 && prune2==0, outputs); prune1==0 && prune2==0,
allowRandomSuffixSharing ? random.nextBoolean() : true,
allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
outputs);
for(InputOutput<T> pair : pairs) { for(InputOutput<T> pair : pairs) {
if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) { if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) {
@ -1017,7 +1020,7 @@ public class TestFSTs extends LuceneTestCase {
IndexReader r = IndexReader.open(writer, true); IndexReader r = IndexReader.open(writer, true);
writer.close(); writer.close();
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
boolean storeOrd = random.nextBoolean(); boolean storeOrd = random.nextBoolean();
if (VERBOSE) { if (VERBOSE) {
@ -1145,7 +1148,7 @@ public class TestFSTs extends LuceneTestCase {
this.inputMode = inputMode; this.inputMode = inputMode;
this.outputs = outputs; this.outputs = outputs;
builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, outputs); builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs);
} }
protected abstract T getOutput(IntsRef input, int ord) throws IOException; protected abstract T getOutput(IntsRef input, int ord) throws IOException;
@ -1245,7 +1248,7 @@ public class TestFSTs extends LuceneTestCase {
} }
} }
// java -cp build/classes/test:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out // java -cp build/classes/test:build/classes/java:build/classes/test-framework:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out
public static void main(String[] args) throws IOException { public static void main(String[] args) throws IOException {
int prune = 0; int prune = 0;
int limit = Integer.MAX_VALUE; int limit = Integer.MAX_VALUE;
@ -1351,7 +1354,7 @@ public class TestFSTs extends LuceneTestCase {
public void testSingleString() throws Exception { public void testSingleString() throws Exception {
final Outputs<Object> outputs = NoOutputs.getSingleton(); final Outputs<Object> outputs = NoOutputs.getSingleton();
final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs);
b.add(new BytesRef("foobar"), outputs.getNoOutput()); b.add(new BytesRef("foobar"), outputs.getNoOutput());
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<Object>(b.finish()); final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<Object>(b.finish());
assertNull(fstEnum.seekFloor(new BytesRef("foo"))); assertNull(fstEnum.seekFloor(new BytesRef("foo")));
@ -1368,7 +1371,7 @@ public class TestFSTs extends LuceneTestCase {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
// Build an FST mapping BytesRef -> Long // Build an FST mapping BytesRef -> Long
final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
final BytesRef a = new BytesRef("a"); final BytesRef a = new BytesRef("a");
final BytesRef b = new BytesRef("b"); final BytesRef b = new BytesRef("b");
@ -1413,7 +1416,7 @@ public class TestFSTs extends LuceneTestCase {
FST<Object> compile(String[] lines) throws IOException { FST<Object> compile(String[] lines) throws IOException {
final NoOutputs outputs = NoOutputs.getSingleton(); final NoOutputs outputs = NoOutputs.getSingleton();
final Object nothing = outputs.getNoOutput(); final Object nothing = outputs.getNoOutput();
final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs);
int line = 0; int line = 0;
final BytesRef term = new BytesRef(); final BytesRef term = new BytesRef();
@ -1488,7 +1491,7 @@ public class TestFSTs extends LuceneTestCase {
public void testNonFinalStopNodes() throws Exception { public void testNonFinalStopNodes() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
final Long nothing = outputs.getNoOutput(); final Long nothing = outputs.getNoOutput();
final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs); final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs);

View File

@ -244,7 +244,7 @@ public class SynonymMap {
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
// TODO: are we using the best sharing options? // TODO: are we using the best sharing options?
org.apache.lucene.util.fst.Builder<BytesRef> builder = org.apache.lucene.util.fst.Builder<BytesRef> builder =
new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs); new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);
BytesRef scratch = new BytesRef(64); BytesRef scratch = new BytesRef(64);
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(); ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

View File

@ -450,8 +450,7 @@ public class FSTLookup extends Lookup {
// Build the automaton. // Build the automaton.
final Outputs<Object> outputs = NoOutputs.getSingleton(); final Outputs<Object> outputs = NoOutputs.getSingleton();
final Object empty = outputs.getNoOutput(); final Object empty = outputs.getNoOutput();
final Builder<Object> builder = final Builder<Object> builder = new Builder<Object>(FST.INPUT_TYPE.BYTE4, outputs);
new Builder<Object>(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs);
final IntsRef scratchIntsRef = new IntsRef(10); final IntsRef scratchIntsRef = new IntsRef(10);
for (Entry e : entries) { for (Entry e : entries) {
final int termLength = scratchIntsRef.length = e.term.length; final int termLength = scratchIntsRef.length = e.term.length;