diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 574fc4571a2..458aecbe563 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -532,6 +532,11 @@ Optimizations directly if possible and merges separately written files on the fly instead of during close. (Simon Willnauer, Robert Muir) +* LUCENE-3289: When building an FST you can now tune how aggressively + the FST should try to share common suffixes. Typically you can + greatly reduce RAM required during building, and CPU consumed, at + the cost of a somewhat larger FST. (Mike McCandless) + ======================= Lucene 3.3.0 ======================= Changes in backwards compatibility policy diff --git a/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java b/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java index 0befc752163..14281a4923e 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java @@ -190,7 +190,7 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase { if (indexDivisor > 1) { // subsample final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); - final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, outputs); final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(fst); BytesRefFSTEnum.InputOutput result; int count = indexDivisor; diff --git a/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java index c8d3d3e32ca..c987c9fba69 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java @@ -222,9 +222,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase { public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException { this.fieldInfo = fieldInfo; fstOutputs = PositiveIntOutputs.getSingleton(true); - fstBuilder = new Builder(FST.INPUT_TYPE.BYTE1, - 0, 0, true, - fstOutputs); + fstBuilder = new Builder(FST.INPUT_TYPE.BYTE1, fstOutputs); indexStart = out.getFilePointer(); ////System.out.println("VGW: field=" + fieldInfo.name); diff --git a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java index e2a37f6b199..24ab068f0d7 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java @@ -478,9 +478,6 @@ class SimpleTextFieldsReader extends FieldsProducer { PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false); final Builder>> b; b = new Builder>>(FST.INPUT_TYPE.BYTE1, - 0, - 0, - true, new PairOutputs>(posIntOutputs, new PairOutputs(posIntOutputs, posIntOutputs))); IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); diff --git a/lucene/src/java/org/apache/lucene/util/fst/Builder.java b/lucene/src/java/org/apache/lucene/util/fst/Builder.java index 84245a6d577..0fb0b82727b 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/Builder.java +++ b/lucene/src/java/org/apache/lucene/util/fst/Builder.java @@ -62,6 +62,9 @@ public class Builder { // terms go through it: private final int minSuffixCount2; + private final boolean doShareNonSingletonNodes; + private final int shareMaxTailLength; + private final IntsRef lastInput = new IntsRef(); // NOTE: cutting this over to ArrayList instead loses ~6% @@ -72,12 +75,11 @@ public class Builder { /** * Instantiates an FST/FSA builder without any pruning. A shortcut - * to {@link #Builder(FST.INPUT_TYPE, int, int, boolean, Outputs)} with + * to {@link #Builder(FST.INPUT_TYPE, int, int, boolean, boolean, int, Outputs)} with * pruning options turned off. */ - public Builder(FST.INPUT_TYPE inputType, Outputs outputs) - { - this(inputType, 0, 0, true, outputs); + public Builder(FST.INPUT_TYPE inputType, Outputs outputs) { + this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs); } /** @@ -97,20 +99,34 @@ public class Builder { * @param minSuffixCount2 * (Note: only Mike McCandless knows what this one is really doing...) * - * @param doMinSuffix + * @param doShareSuffix * If true, the shared suffixes will be compacted into unique paths. * This requires an additional hash map for lookups in memory. Setting this parameter to * false creates a single path for all input sequences. This will result in a larger * graph, but may require less memory and will speed up construction. + * + * @param doShareNonSingletonNodes + * Only used if doShareSuffix is true. Set this to + * true to ensure FST is fully minimal, at cost of more + * CPU and more RAM during building. + * + * @param shareMaxTailLength + * Only used if doShareSuffix is true. Set this to + * Integer.MAX_VALUE to ensure FST is fully minimal, at cost of more + * CPU and more RAM during building. + * * @param outputs The output type for each input sequence. Applies only if building an FST. For * FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the * singleton output object. */ - public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doMinSuffix, Outputs outputs) { + public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix, + boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs outputs) { this.minSuffixCount1 = minSuffixCount1; this.minSuffixCount2 = minSuffixCount2; + this.doShareNonSingletonNodes = doShareNonSingletonNodes; + this.shareMaxTailLength = shareMaxTailLength; fst = new FST(inputType, outputs); - if (doMinSuffix) { + if (doShareSuffix) { dedupHash = new NodeHash(fst); } else { dedupHash = null; @@ -143,10 +159,9 @@ public class Builder { fst.setAllowArrayArcs(b); } - private CompiledNode compileNode(UnCompiledNode n) throws IOException { - + private CompiledNode compileNode(UnCompiledNode n, int tailLength) throws IOException { final int address; - if (dedupHash != null) { + if (dedupHash != null && (doShareNonSingletonNodes || n.numArcs <= 1) && tailLength <= shareMaxTailLength) { if (n.numArcs == 0) { address = fst.addNode(n); } else { @@ -221,7 +236,7 @@ public class Builder { } else { if (minSuffixCount2 != 0) { - compileAllTargets(node); + compileAllTargets(node, lastInput.length-idx); } final T nextFinalOutput = node.output; @@ -237,7 +252,7 @@ public class Builder { // compile any targets that were previously // undecided: parent.replaceLast(lastInput.ints[lastInput.offset + idx-1], - compileNode(node), + compileNode(node, 1+lastInput.length-idx), nextFinalOutput, isFinal); } else { @@ -428,22 +443,28 @@ public class Builder { // empty string got pruned return null; } else { - fst.finish(compileNode(frontier[0]).address); + fst.finish(compileNode(frontier[0], lastInput.length).address); //System.out.println("compile addr = " + fst.getStartNode()); return fst; } } else { if (minSuffixCount2 != 0) { - compileAllTargets(frontier[0]); + compileAllTargets(frontier[0], lastInput.length); } //System.out.println("NOW: " + frontier[0].numArcs); - fst.finish(compileNode(frontier[0]).address); + fst.finish(compileNode(frontier[0], lastInput.length).address); } + + /* + if (dedupHash != null) { + System.out.println("NH: " + dedupHash.count()); + } + */ return fst; } - private void compileAllTargets(UnCompiledNode node) throws IOException { + private void compileAllTargets(UnCompiledNode node, int tailLength) throws IOException { for(int arcIdx=0;arcIdx arc = node.arcs[arcIdx]; if (!arc.target.isCompiled()) { @@ -453,7 +474,7 @@ public class Builder { //System.out.println("seg=" + segment + " FORCE final arc=" + (char) arc.label); arc.isFinal = n.isFinal = true; } - arc.target = compileNode(n); + arc.target = compileNode(n, tailLength-1); } } } diff --git a/lucene/src/java/org/apache/lucene/util/fst/FST.java b/lucene/src/java/org/apache/lucene/util/fst/FST.java index a084b7a443e..3422382ac28 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/src/java/org/apache/lucene/util/fst/FST.java @@ -25,8 +25,12 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.CodecUtil; import org.apache.lucene.util.fst.Builder.UnCompiledNode; +// TODO: if FST is pure prefix trie we can do a more compact +// job, ie, once we are at a 'suffix only', just store the +// completion labels as a string not as a series of arcs. + // NOTE: while the FST is able to represent a non-final -// dead-end state (NON_FINAL_END_NODE=0), the layres above +// dead-end state (NON_FINAL_END_NODE=0), the layers above // (FSTEnum, Util) have problems with this!! /** Represents an FST using a compact byte[] format. diff --git a/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java b/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java index 276aa997214..0d854b722f1 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java +++ b/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java @@ -164,4 +164,8 @@ final class NodeHash { } } } + + public int count() { + return count; + } } diff --git a/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java index e298a5d86c1..3efb42b58b4 100644 --- a/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java +++ b/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java @@ -150,7 +150,7 @@ public class TestFSTs extends LuceneTestCase { for(IntsRef term : terms2) { pairs.add(new FSTTester.InputOutput(term, NO_OUTPUT)); } - FST fst = new FSTTester(random, dir, inputMode, pairs, outputs).doTest(0, 0); + FST fst = new FSTTester(random, dir, inputMode, pairs, outputs).doTest(0, 0, false); assertNotNull(fst); assertEquals(22, fst.getNodeCount()); assertEquals(27, fst.getArcCount()); @@ -163,7 +163,7 @@ public class TestFSTs extends LuceneTestCase { for(int idx=0;idx(terms2[idx], outputs.get(idx))); } - final FST fst = new FSTTester(random, dir, inputMode, pairs, outputs).doTest(0, 0); + final FST fst = new FSTTester(random, dir, inputMode, pairs, outputs).doTest(0, 0, false); assertNotNull(fst); assertEquals(22, fst.getNodeCount()); assertEquals(27, fst.getArcCount()); @@ -178,7 +178,7 @@ public class TestFSTs extends LuceneTestCase { final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx)); pairs.add(new FSTTester.InputOutput(terms2[idx], output)); } - final FST fst = new FSTTester(random, dir, inputMode, pairs, outputs).doTest(0, 0); + final FST fst = new FSTTester(random, dir, inputMode, pairs, outputs).doTest(0, 0, false); assertNotNull(fst); assertEquals(24, fst.getNodeCount()); assertEquals(30, fst.getArcCount()); @@ -359,14 +359,14 @@ public class TestFSTs extends LuceneTestCase { public void doTest() throws IOException { // no pruning - doTest(0, 0); + doTest(0, 0, true); if (!(outputs instanceof UpToTwoPositiveIntOutputs)) { // simple pruning - doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0); + doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0, true); // leafy pruning - doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size())); + doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()), true); } } @@ -446,14 +446,17 @@ public class TestFSTs extends LuceneTestCase { } - FST doTest(int prune1, int prune2) throws IOException { + FST doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException { if (VERBOSE) { System.out.println("TEST: prune1=" + prune1 + " prune2=" + prune2); } final Builder builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, prune1, prune2, - prune1==0 && prune2==0, outputs); + prune1==0 && prune2==0, + allowRandomSuffixSharing ? random.nextBoolean() : true, + allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE, + outputs); for(InputOutput pair : pairs) { if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) { @@ -1017,7 +1020,7 @@ public class TestFSTs extends LuceneTestCase { IndexReader r = IndexReader.open(writer, true); writer.close(); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); - Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); + Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, outputs); boolean storeOrd = random.nextBoolean(); if (VERBOSE) { @@ -1145,7 +1148,7 @@ public class TestFSTs extends LuceneTestCase { this.inputMode = inputMode; this.outputs = outputs; - builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, outputs); + builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs); } protected abstract T getOutput(IntsRef input, int ord) throws IOException; @@ -1245,7 +1248,7 @@ public class TestFSTs extends LuceneTestCase { } } - // java -cp build/classes/test:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out + // java -cp build/classes/test:build/classes/java:build/classes/test-framework:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out public static void main(String[] args) throws IOException { int prune = 0; int limit = Integer.MAX_VALUE; @@ -1351,7 +1354,7 @@ public class TestFSTs extends LuceneTestCase { public void testSingleString() throws Exception { final Outputs outputs = NoOutputs.getSingleton(); - final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); + final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, outputs); b.add(new BytesRef("foobar"), outputs.getNoOutput()); final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(b.finish()); assertNull(fstEnum.seekFloor(new BytesRef("foo"))); @@ -1368,7 +1371,7 @@ public class TestFSTs extends LuceneTestCase { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); // Build an FST mapping BytesRef -> Long - final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, outputs); final BytesRef a = new BytesRef("a"); final BytesRef b = new BytesRef("b"); @@ -1413,7 +1416,7 @@ public class TestFSTs extends LuceneTestCase { FST compile(String[] lines) throws IOException { final NoOutputs outputs = NoOutputs.getSingleton(); final Object nothing = outputs.getNoOutput(); - final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); + final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, outputs); int line = 0; final BytesRef term = new BytesRef(); @@ -1488,7 +1491,7 @@ public class TestFSTs extends LuceneTestCase { public void testNonFinalStopNodes() throws Exception { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final Long nothing = outputs.getNoOutput(); - final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); + final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, outputs); final FST fst = new FST(FST.INPUT_TYPE.BYTE1, outputs); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java index ad8b6bd4814..1cc22a4d6d3 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java @@ -244,7 +244,7 @@ public class SynonymMap { ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); // TODO: are we using the best sharing options? org.apache.lucene.util.fst.Builder builder = - new org.apache.lucene.util.fst.Builder(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs); + new org.apache.lucene.util.fst.Builder(FST.INPUT_TYPE.BYTE4, outputs); BytesRef scratch = new BytesRef(64); ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(); diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java index 317090863eb..8b927726230 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java @@ -450,8 +450,7 @@ public class FSTLookup extends Lookup { // Build the automaton. final Outputs outputs = NoOutputs.getSingleton(); final Object empty = outputs.getNoOutput(); - final Builder builder = - new Builder(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs); + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE4, outputs); final IntsRef scratchIntsRef = new IntsRef(10); for (Entry e : entries) { final int termLength = scratchIntsRef.length = e.term.length;