LUCENE-3289: add options to FST Builder to tradeoff RAM/CPU used during build vs how small the resulting FST is

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1145292 13f79535-47bb-0310-9956-ffa450edef68
2011-07-11 18:53:13 +00:00 · 2011-07-11 18:53:13 +00:00 · fbf9f4ccad
parent 94a47f4415
commit fbf9f4ccad
10 changed files with 74 additions and 43 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -532,6 +532,11 @@ Optimizations
  directly if possible and merges separately written files on the fly instead
  of during close. (Simon Willnauer, Robert Muir)

+* LUCENE-3289: When building an FST you can now tune how aggressively
+  the FST should try to share common suffixes.  Typically you can
+  greatly reduce RAM required during building, and CPU consumed, at
+  the cost of a somewhat larger FST.  (Mike McCandless)
+
 ======================= Lucene 3.3.0 =======================

 Changes in backwards compatibility policy
--- a/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java
@ -190,7 +190,7 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase {
        if (indexDivisor > 1) {
          // subsample
          final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
-          final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
+          final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
          final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst);
          BytesRefFSTEnum.InputOutput<Long> result;
          int count = indexDivisor;
--- a/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java
@ -222,9 +222,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
    public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
      this.fieldInfo = fieldInfo;
      fstOutputs = PositiveIntOutputs.getSingleton(true);
-      fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE1,
-                                     0, 0, true,
-                                     fstOutputs);
+      fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, fstOutputs);
      indexStart = out.getFilePointer();
      ////System.out.println("VGW: field=" + fieldInfo.name);

--- a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java
@ -478,9 +478,6 @@ class SimpleTextFieldsReader extends FieldsProducer {
      PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
      final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
      b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1,
-                                                                          0,
-                                                                          0,
-                                                                          true,
                                                                          new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs,
                                                                                                                            new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs)));
      IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
--- a/lucene/src/java/org/apache/lucene/util/fst/Builder.java
+++ b/lucene/src/java/org/apache/lucene/util/fst/Builder.java
@ -62,6 +62,9 @@ public class Builder<T> {
  // terms go through it:
  private final int minSuffixCount2;

+  private final boolean doShareNonSingletonNodes;
+  private final int shareMaxTailLength;
+
  private final IntsRef lastInput = new IntsRef();

  // NOTE: cutting this over to ArrayList instead loses ~6%
@ -72,12 +75,11 @@ public class Builder<T> {

  /**
   * Instantiates an FST/FSA builder without any pruning. A shortcut
-   * to {@link #Builder(FST.INPUT_TYPE, int, int, boolean, Outputs)} with 
+   * to {@link #Builder(FST.INPUT_TYPE, int, int, boolean, boolean, int, Outputs)} with 
   * pruning options turned off.
   */
-  public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs)
-  {
-      this(inputType, 0, 0, true, outputs);
+  public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
+    this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs);
  }

  /**
@ -97,20 +99,34 @@ public class Builder<T> {
   * @param minSuffixCount2
   *    (Note: only Mike McCandless knows what this one is really doing...) 
   * 
-   * @param doMinSuffix 
+   * @param doShareSuffix 
   *    If <code>true</code>, the shared suffixes will be compacted into unique paths.
   *    This requires an additional hash map for lookups in memory. Setting this parameter to
   *    <code>false</code> creates a single path for all input sequences. This will result in a larger
   *    graph, but may require less memory and will speed up construction.  
+   *
+   * @param doShareNonSingletonNodes
+   *    Only used if doShareSuffix is true.  Set this to
+   *    true to ensure FST is fully minimal, at cost of more
+   *    CPU and more RAM during building.
+   *
+   * @param shareMaxTailLength
+   *    Only used if doShareSuffix is true.  Set this to
+   *    Integer.MAX_VALUE to ensure FST is fully minimal, at cost of more
+   *    CPU and more RAM during building.
+   *
   * @param outputs The output type for each input sequence. Applies only if building an FST. For
   *    FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
   *    singleton output object.
   */
-  public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doMinSuffix, Outputs<T> outputs) {
+  public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
+                 boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs) {
    this.minSuffixCount1 = minSuffixCount1;
    this.minSuffixCount2 = minSuffixCount2;
+    this.doShareNonSingletonNodes = doShareNonSingletonNodes;
+    this.shareMaxTailLength = shareMaxTailLength;
    fst = new FST<T>(inputType, outputs);
-    if (doMinSuffix) {
+    if (doShareSuffix) {
      dedupHash = new NodeHash<T>(fst);
    } else {
      dedupHash = null;
@ -143,10 +159,9 @@ public class Builder<T> {
    fst.setAllowArrayArcs(b);
  }

-  private CompiledNode compileNode(UnCompiledNode<T> n) throws IOException {
-
+  private CompiledNode compileNode(UnCompiledNode<T> n, int tailLength) throws IOException {
    final int address;
-    if (dedupHash != null) {
+    if (dedupHash != null && (doShareNonSingletonNodes || n.numArcs <= 1) && tailLength <= shareMaxTailLength) {
      if (n.numArcs == 0) {
        address = fst.addNode(n);
      } else {
@ -221,7 +236,7 @@ public class Builder<T> {
      } else {

        if (minSuffixCount2 != 0) {
-          compileAllTargets(node);
+          compileAllTargets(node, lastInput.length-idx);
        }
        final T nextFinalOutput = node.output;

@ -237,7 +252,7 @@ public class Builder<T> {
          // compile any targets that were previously
          // undecided:
          parent.replaceLast(lastInput.ints[lastInput.offset + idx-1],
-                             compileNode(node),
+                             compileNode(node, 1+lastInput.length-idx),
                             nextFinalOutput,
                             isFinal);
        } else {
@ -428,22 +443,28 @@ public class Builder<T> {
        // empty string got pruned
        return null;
      } else {
-        fst.finish(compileNode(frontier[0]).address);
+        fst.finish(compileNode(frontier[0], lastInput.length).address);
        //System.out.println("compile addr = " + fst.getStartNode());
        return fst;
      }
    } else {
      if (minSuffixCount2 != 0) {
-        compileAllTargets(frontier[0]);
+        compileAllTargets(frontier[0], lastInput.length);
      }
      //System.out.println("NOW: " + frontier[0].numArcs);
-      fst.finish(compileNode(frontier[0]).address);
+      fst.finish(compileNode(frontier[0], lastInput.length).address);
    }

+    /*
+    if (dedupHash != null) {
+      System.out.println("NH: " + dedupHash.count()); 
+    }
+    */
+    
    return fst;
  }

-  private void compileAllTargets(UnCompiledNode<T> node) throws IOException {
+  private void compileAllTargets(UnCompiledNode<T> node, int tailLength) throws IOException {
    for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
      final Arc<T> arc = node.arcs[arcIdx];
      if (!arc.target.isCompiled()) {
@ -453,7 +474,7 @@ public class Builder<T> {
          //System.out.println("seg=" + segment + "        FORCE final arc=" + (char) arc.label);
          arc.isFinal = n.isFinal = true;
        }
-        arc.target = compileNode(n);
+        arc.target = compileNode(n, tailLength-1);
      }
    }
  }
--- a/lucene/src/java/org/apache/lucene/util/fst/FST.java
+++ b/lucene/src/java/org/apache/lucene/util/fst/FST.java
@ -25,8 +25,12 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.CodecUtil;
 import org.apache.lucene.util.fst.Builder.UnCompiledNode;

+// TODO: if FST is pure prefix trie we can do a more compact
+// job, ie, once we are at a 'suffix only', just store the
+// completion labels as a string not as a series of arcs.
+
 // NOTE: while the FST is able to represent a non-final
-// dead-end state (NON_FINAL_END_NODE=0), the layres above
+// dead-end state (NON_FINAL_END_NODE=0), the layers above
 // (FSTEnum, Util) have problems with this!!

 /** Represents an FST using a compact byte[] format.
--- a/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java
+++ b/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java
@ -164,4 +164,8 @@ final class NodeHash<T> {
      }
    }
  }
+
+  public int count() {
+    return count;
+  }
 }
--- a/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java
+++ b/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java
@ -150,7 +150,7 @@ public class TestFSTs extends LuceneTestCase {
        for(IntsRef term : terms2) {
          pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
        }
-        FST<Object> fst = new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest(0, 0);
+        FST<Object> fst = new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
        assertNotNull(fst);
        assertEquals(22, fst.getNodeCount());
        assertEquals(27, fst.getArcCount());
@ -163,7 +163,7 @@ public class TestFSTs extends LuceneTestCase {
        for(int idx=0;idx<terms2.length;idx++) {
          pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], outputs.get(idx)));
        }
-        final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest(0, 0);
+        final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
        assertNotNull(fst);
        assertEquals(22, fst.getNodeCount());
        assertEquals(27, fst.getArcCount());
@ -178,7 +178,7 @@ public class TestFSTs extends LuceneTestCase {
          final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
          pairs.add(new FSTTester.InputOutput<BytesRef>(terms2[idx], output));
        }
-        final FST<BytesRef> fst = new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest(0, 0);
+        final FST<BytesRef> fst = new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
        assertNotNull(fst);
        assertEquals(24, fst.getNodeCount());
        assertEquals(30, fst.getArcCount());
@ -359,14 +359,14 @@ public class TestFSTs extends LuceneTestCase {

    public void doTest() throws IOException {
      // no pruning
-      doTest(0, 0);
+      doTest(0, 0, true);

      if (!(outputs instanceof UpToTwoPositiveIntOutputs)) {
        // simple pruning
-        doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0);
+        doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0, true);
        
        // leafy pruning
-        doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()));
+        doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()), true);
      }
    }

@ -446,14 +446,17 @@ public class TestFSTs extends LuceneTestCase {
    }


-    FST<T> doTest(int prune1, int prune2) throws IOException {
+    FST<T> doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException {
      if (VERBOSE) {
        System.out.println("TEST: prune1=" + prune1 + " prune2=" + prune2);
      }

      final Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
                                                prune1, prune2,
-                                                prune1==0 && prune2==0, outputs);
+                                                prune1==0 && prune2==0,
+                                                allowRandomSuffixSharing ? random.nextBoolean() : true,
+                                                allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
+                                                outputs);

      for(InputOutput<T> pair : pairs) {
        if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) {
@ -1017,7 +1020,7 @@ public class TestFSTs extends LuceneTestCase {
    IndexReader r = IndexReader.open(writer, true);
    writer.close();
    final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
-    Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
+    Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);

    boolean storeOrd = random.nextBoolean();
    if (VERBOSE) {
@ -1145,7 +1148,7 @@ public class TestFSTs extends LuceneTestCase {
      this.inputMode = inputMode;
      this.outputs = outputs;
      
-      builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, outputs);
+      builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs);
    }

    protected abstract T getOutput(IntsRef input, int ord) throws IOException;
@ -1245,7 +1248,7 @@ public class TestFSTs extends LuceneTestCase {
    }
  }

-  // java -cp build/classes/test:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out
+  // java -cp build/classes/test:build/classes/java:build/classes/test-framework:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out
  public static void main(String[] args) throws IOException {
    int prune = 0;
    int limit = Integer.MAX_VALUE;
@ -1351,7 +1354,7 @@ public class TestFSTs extends LuceneTestCase {

  public void testSingleString() throws Exception {
    final Outputs<Object> outputs = NoOutputs.getSingleton();
-    final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
+    final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs);
    b.add(new BytesRef("foobar"), outputs.getNoOutput());
    final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<Object>(b.finish());
    assertNull(fstEnum.seekFloor(new BytesRef("foo")));
@ -1368,7 +1371,7 @@ public class TestFSTs extends LuceneTestCase {
    final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);

    // Build an FST mapping BytesRef -> Long
-    final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
+    final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);

    final BytesRef a = new BytesRef("a");
    final BytesRef b = new BytesRef("b");
@ -1413,7 +1416,7 @@ public class TestFSTs extends LuceneTestCase {
      FST<Object> compile(String[] lines) throws IOException {
        final NoOutputs outputs = NoOutputs.getSingleton();
        final Object nothing = outputs.getNoOutput();
-        final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
+        final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs);

        int line = 0;
        final BytesRef term = new BytesRef();
@ -1488,7 +1491,7 @@ public class TestFSTs extends LuceneTestCase {
  public void testNonFinalStopNodes() throws Exception {
    final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
    final Long nothing = outputs.getNoOutput();
-    final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
+    final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);

    final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs);

--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java
@ -244,7 +244,7 @@ public class SynonymMap {
      ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
      // TODO: are we using the best sharing options?
      org.apache.lucene.util.fst.Builder<BytesRef> builder = 
-        new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs);
+        new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);
      
      BytesRef scratch = new BytesRef(64);
      ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
--- a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java
+++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java
@ -450,8 +450,7 @@ public class FSTLookup extends Lookup {
    // Build the automaton.
    final Outputs<Object> outputs = NoOutputs.getSingleton();
    final Object empty = outputs.getNoOutput();
-    final Builder<Object> builder = 
-      new Builder<Object>(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs);
+    final Builder<Object> builder = new Builder<Object>(FST.INPUT_TYPE.BYTE4, outputs);
    final IntsRef scratchIntsRef = new IntsRef(10);
    for (Entry e : entries) {
      final int termLength = scratchIntsRef.length = e.term.length;