mirror of https://github.com/apache/lucene.git

LUCENE-3289: add options to FST Builder to tradeoff RAM/CPU used during build vs how small the resulting FST is

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1145292 13f79535-47bb-0310-9956-ffa450edef68

commit fbf9f4ccad
parent 94a47f4415
@@ -532,6 +532,11 @@ Optimizations
   directly if possible and merges separately written files on the fly instead
   of during close. (Simon Willnauer, Robert Muir)
 
+* LUCENE-3289: When building an FST you can now tune how aggressively
+  the FST should try to share common suffixes. Typically you can
+  greatly reduce RAM required during building, and CPU consumed, at
+  the cost of a somewhat larger FST. (Mike McCandless)
+
 ======================= Lucene 3.3.0 =======================
 
 Changes in backwards compatibility policy
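
Not part of the patch: a minimal usage sketch assembled only from calls that appear elsewhere in this diff (the new seven-argument Builder constructor, add(BytesRef, output), finish(), getNodeCount()/getArcCount()), showing how a caller might dial suffix sharing down to spend less RAM and CPU during construction in exchange for a somewhat larger FST. The input strings, the tail-length cap of 4, and the class name are illustrative only.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.NoOutputs;
import org.apache.lucene.util.fst.Outputs;

public class FSTBuildTuningSketch {
  public static void main(String[] args) throws Exception {
    final Outputs<Object> outputs = NoOutputs.getSingleton();

    // Keep suffix sharing on, but only share singleton (<= 1 arc) nodes and
    // only consider tails up to 4 labels long: builds faster and with less
    // RAM, at the cost of a somewhat larger (not fully minimal) FST.
    final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1,
                                                  0, 0,   // minSuffixCount1/2: no pruning
                                                  true,   // doShareSuffix
                                                  false,  // doShareNonSingletonNodes
                                                  4,      // shareMaxTailLength
                                                  outputs);

    // Inputs must be added in sorted order.
    b.add(new BytesRef("restore"), outputs.getNoOutput());
    b.add(new BytesRef("store"), outputs.getNoOutput());

    final FST<Object> fst = b.finish();
    System.out.println("nodes=" + fst.getNodeCount() + " arcs=" + fst.getArcCount());
  }
}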
@@ -190,7 +190,7 @@ public class VariableGapTermsIndexReader extends TermsIndexReaderBase {
       if (indexDivisor > 1) {
         // subsample
         final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
-        final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
+        final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
         final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst);
         BytesRefFSTEnum.InputOutput<Long> result;
         int count = indexDivisor;
@@ -222,9 +222,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
     public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
       this.fieldInfo = fieldInfo;
       fstOutputs = PositiveIntOutputs.getSingleton(true);
-      fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE1,
-                                     0, 0, true,
-                                     fstOutputs);
+      fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, fstOutputs);
       indexStart = out.getFilePointer();
       ////System.out.println("VGW: field=" + fieldInfo.name);
 
@@ -478,9 +478,6 @@ class SimpleTextFieldsReader extends FieldsProducer {
       PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
       final Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> b;
       b = new Builder<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>>(FST.INPUT_TYPE.BYTE1,
-                                                                          0,
-                                                                          0,
-                                                                          true,
                                                                           new PairOutputs<Long,PairOutputs.Pair<Long,Long>>(posIntOutputs,
                                                                                                                              new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs)));
       IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
@@ -62,6 +62,9 @@ public class Builder<T> {
   // terms go through it:
   private final int minSuffixCount2;
 
+  private final boolean doShareNonSingletonNodes;
+  private final int shareMaxTailLength;
+
   private final IntsRef lastInput = new IntsRef();
 
   // NOTE: cutting this over to ArrayList instead loses ~6%
@@ -72,12 +75,11 @@ public class Builder<T> {
 
   /**
    * Instantiates an FST/FSA builder without any pruning. A shortcut
-   * to {@link #Builder(FST.INPUT_TYPE, int, int, boolean, Outputs)} with
+   * to {@link #Builder(FST.INPUT_TYPE, int, int, boolean, boolean, int, Outputs)} with
    * pruning options turned off.
    */
-  public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs)
-  {
-    this(inputType, 0, 0, true, outputs);
+  public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
+    this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs);
   }
 
   /**
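
For reference (not from the patch): the two-argument shortcut keeps its old fully-minimal behavior, because it now forwards to the long constructor with doShareNonSingletonNodes=true and shareMaxTailLength=Integer.MAX_VALUE, exactly as the delegation above shows. A small sketch of that equivalence (class and variable names are made up):

import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;

public class ShortcutEquivalenceSketch {
  public static void main(String[] args) {
    final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);

    // The two-argument shortcut...
    final Builder<Long> viaShortcut = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);

    // ...is configured the same as the explicit long form: no pruning,
    // unlimited suffix sharing, fully minimal result.
    final Builder<Long> viaLongForm = new Builder<Long>(FST.INPUT_TYPE.BYTE1,
                                                        0, 0, true,
                                                        true, Integer.MAX_VALUE,
                                                        outputs);
  }
}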
@@ -97,20 +99,34 @@ public class Builder<T> {
    * @param minSuffixCount2
    *    (Note: only Mike McCandless knows what this one is really doing...)
    *
-   * @param doMinSuffix
+   * @param doShareSuffix
    *    If <code>true</code>, the shared suffixes will be compacted into unique paths.
    *    This requires an additional hash map for lookups in memory. Setting this parameter to
    *    <code>false</code> creates a single path for all input sequences. This will result in a larger
    *    graph, but may require less memory and will speed up construction.
    *
+   * @param doShareNonSingletonNodes
+   *    Only used if doShareSuffix is true. Set this to
+   *    true to ensure FST is fully minimal, at cost of more
+   *    CPU and more RAM during building.
+   *
+   * @param shareMaxTailLength
+   *    Only used if doShareSuffix is true. Set this to
+   *    Integer.MAX_VALUE to ensure FST is fully minimal, at cost of more
+   *    CPU and more RAM during building.
+   *
    * @param outputs The output type for each input sequence. Applies only if building an FST. For
    *    FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
    *    singleton output object.
    */
-  public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doMinSuffix, Outputs<T> outputs) {
+  public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
+                 boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs) {
     this.minSuffixCount1 = minSuffixCount1;
     this.minSuffixCount2 = minSuffixCount2;
+    this.doShareNonSingletonNodes = doShareNonSingletonNodes;
+    this.shareMaxTailLength = shareMaxTailLength;
     fst = new FST<T>(inputType, outputs);
-    if (doMinSuffix) {
+    if (doShareSuffix) {
       dedupHash = new NodeHash<T>(fst);
     } else {
       dedupHash = null;
@@ -143,10 +159,9 @@ public class Builder<T> {
     fst.setAllowArrayArcs(b);
   }
 
-  private CompiledNode compileNode(UnCompiledNode<T> n) throws IOException {
+  private CompiledNode compileNode(UnCompiledNode<T> n, int tailLength) throws IOException {
     final int address;
-    if (dedupHash != null) {
+    if (dedupHash != null && (doShareNonSingletonNodes || n.numArcs <= 1) && tailLength <= shareMaxTailLength) {
       if (n.numArcs == 0) {
         address = fst.addNode(n);
       } else {
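
To make the new guard in compileNode easier to read, here is the sharing decision restated as a standalone predicate; this is a paraphrase for illustration, not code from the patch, and the class/method names are invented. A node is only offered to the dedup hash when suffix sharing is enabled at all, when it either has at most one arc or non-singleton sharing was requested, and when the remaining tail length does not exceed shareMaxTailLength; otherwise it is written to the FST directly.

// Sketch of the condition added above; parameter names mirror the Builder fields.
final class ShareDecisionSketch {
  static boolean shouldTryToShare(boolean suffixSharingEnabled,   // dedupHash != null
                                  boolean doShareNonSingletonNodes,
                                  int shareMaxTailLength,
                                  int numArcs,                    // n.numArcs
                                  int tailLength) {
    return suffixSharingEnabled
        && (doShareNonSingletonNodes || numArcs <= 1)
        && tailLength <= shareMaxTailLength;
  }
}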
@@ -221,7 +236,7 @@ public class Builder<T> {
       } else {
 
         if (minSuffixCount2 != 0) {
-          compileAllTargets(node);
+          compileAllTargets(node, lastInput.length-idx);
         }
         final T nextFinalOutput = node.output;
 
@@ -237,7 +252,7 @@ public class Builder<T> {
         // compile any targets that were previously
         // undecided:
         parent.replaceLast(lastInput.ints[lastInput.offset + idx-1],
-                           compileNode(node),
+                           compileNode(node, 1+lastInput.length-idx),
                            nextFinalOutput,
                            isFinal);
       } else {
@@ -428,22 +443,28 @@ public class Builder<T> {
         // empty string got pruned
         return null;
       } else {
-        fst.finish(compileNode(frontier[0]).address);
+        fst.finish(compileNode(frontier[0], lastInput.length).address);
         //System.out.println("compile addr = " + fst.getStartNode());
         return fst;
       }
     } else {
       if (minSuffixCount2 != 0) {
-        compileAllTargets(frontier[0]);
+        compileAllTargets(frontier[0], lastInput.length);
       }
       //System.out.println("NOW: " + frontier[0].numArcs);
-      fst.finish(compileNode(frontier[0]).address);
+      fst.finish(compileNode(frontier[0], lastInput.length).address);
     }
 
+    /*
+    if (dedupHash != null) {
+      System.out.println("NH: " + dedupHash.count());
+    }
+    */
+
     return fst;
   }
 
-  private void compileAllTargets(UnCompiledNode<T> node) throws IOException {
+  private void compileAllTargets(UnCompiledNode<T> node, int tailLength) throws IOException {
     for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
       final Arc<T> arc = node.arcs[arcIdx];
       if (!arc.target.isCompiled()) {
@@ -453,7 +474,7 @@ public class Builder<T> {
           //System.out.println("seg=" + segment + " FORCE final arc=" + (char) arc.label);
           arc.isFinal = n.isFinal = true;
         }
-        arc.target = compileNode(n);
+        arc.target = compileNode(n, tailLength-1);
       }
     }
   }
@@ -25,8 +25,12 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.CodecUtil;
 import org.apache.lucene.util.fst.Builder.UnCompiledNode;
 
+// TODO: if FST is pure prefix trie we can do a more compact
+// job, ie, once we are at a 'suffix only', just store the
+// completion labels as a string not as a series of arcs.
+
 // NOTE: while the FST is able to represent a non-final
-// dead-end state (NON_FINAL_END_NODE=0), the layres above
+// dead-end state (NON_FINAL_END_NODE=0), the layers above
 // (FSTEnum, Util) have problems with this!!
 
 /** Represents an FST using a compact byte[] format.
@@ -164,4 +164,8 @@ final class NodeHash<T> {
       }
     }
   }
+
+  public int count() {
+    return count;
+  }
 }
@@ -150,7 +150,7 @@ public class TestFSTs extends LuceneTestCase {
       for(IntsRef term : terms2) {
         pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
       }
-      FST<Object> fst = new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest(0, 0);
+      FST<Object> fst = new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
       assertNotNull(fst);
       assertEquals(22, fst.getNodeCount());
       assertEquals(27, fst.getArcCount());
@@ -163,7 +163,7 @@ public class TestFSTs extends LuceneTestCase {
       for(int idx=0;idx<terms2.length;idx++) {
         pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], outputs.get(idx)));
       }
-      final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest(0, 0);
+      final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
       assertNotNull(fst);
       assertEquals(22, fst.getNodeCount());
       assertEquals(27, fst.getArcCount());
@@ -178,7 +178,7 @@ public class TestFSTs extends LuceneTestCase {
         final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
         pairs.add(new FSTTester.InputOutput<BytesRef>(terms2[idx], output));
       }
-      final FST<BytesRef> fst = new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest(0, 0);
+      final FST<BytesRef> fst = new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
       assertNotNull(fst);
       assertEquals(24, fst.getNodeCount());
       assertEquals(30, fst.getArcCount());
@@ -359,14 +359,14 @@ public class TestFSTs extends LuceneTestCase {
 
     public void doTest() throws IOException {
       // no pruning
-      doTest(0, 0);
+      doTest(0, 0, true);
 
       if (!(outputs instanceof UpToTwoPositiveIntOutputs)) {
         // simple pruning
-        doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0);
+        doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0, true);
 
         // leafy pruning
-        doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()));
+        doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()), true);
       }
     }
 
@@ -446,14 +446,17 @@ public class TestFSTs extends LuceneTestCase {
       }
     }
 
-    FST<T> doTest(int prune1, int prune2) throws IOException {
+    FST<T> doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException {
       if (VERBOSE) {
         System.out.println("TEST: prune1=" + prune1 + " prune2=" + prune2);
       }
 
      final Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
                                                prune1, prune2,
-                                                prune1==0 && prune2==0, outputs);
+                                                prune1==0 && prune2==0,
+                                                allowRandomSuffixSharing ? random.nextBoolean() : true,
+                                                allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
+                                                outputs);
 
       for(InputOutput<T> pair : pairs) {
         if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) {
@@ -1017,7 +1020,7 @@ public class TestFSTs extends LuceneTestCase {
     IndexReader r = IndexReader.open(writer, true);
     writer.close();
     final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
-    Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
+    Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
 
     boolean storeOrd = random.nextBoolean();
     if (VERBOSE) {
@@ -1145,7 +1148,7 @@ public class TestFSTs extends LuceneTestCase {
       this.inputMode = inputMode;
       this.outputs = outputs;
 
-      builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, outputs);
+      builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs);
     }
 
     protected abstract T getOutput(IntsRef input, int ord) throws IOException;
@@ -1245,7 +1248,7 @@ public class TestFSTs extends LuceneTestCase {
     }
   }
 
-  // java -cp build/classes/test:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out
+  // java -cp build/classes/test:build/classes/java:build/classes/test-framework:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out
   public static void main(String[] args) throws IOException {
     int prune = 0;
     int limit = Integer.MAX_VALUE;
@@ -1351,7 +1354,7 @@ public class TestFSTs extends LuceneTestCase {
 
   public void testSingleString() throws Exception {
     final Outputs<Object> outputs = NoOutputs.getSingleton();
-    final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
+    final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs);
     b.add(new BytesRef("foobar"), outputs.getNoOutput());
     final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<Object>(b.finish());
     assertNull(fstEnum.seekFloor(new BytesRef("foo")));
@@ -1368,7 +1371,7 @@ public class TestFSTs extends LuceneTestCase {
     final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
 
     // Build an FST mapping BytesRef -> Long
-    final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
+    final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
 
     final BytesRef a = new BytesRef("a");
     final BytesRef b = new BytesRef("b");
@@ -1413,7 +1416,7 @@ public class TestFSTs extends LuceneTestCase {
   FST<Object> compile(String[] lines) throws IOException {
     final NoOutputs outputs = NoOutputs.getSingleton();
     final Object nothing = outputs.getNoOutput();
-    final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
+    final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs);
 
     int line = 0;
     final BytesRef term = new BytesRef();
@@ -1488,7 +1491,7 @@ public class TestFSTs extends LuceneTestCase {
   public void testNonFinalStopNodes() throws Exception {
     final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
     final Long nothing = outputs.getNoOutput();
-    final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
+    final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
 
     final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs);
 
@@ -244,7 +244,7 @@ public class SynonymMap {
     ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
     // TODO: are we using the best sharing options?
     org.apache.lucene.util.fst.Builder<BytesRef> builder =
-      new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs);
+      new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);
 
     BytesRef scratch = new BytesRef(64);
     ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
@@ -450,8 +450,7 @@ public class FSTLookup extends Lookup {
     // Build the automaton.
     final Outputs<Object> outputs = NoOutputs.getSingleton();
     final Object empty = outputs.getNoOutput();
-    final Builder<Object> builder =
-      new Builder<Object>(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs);
+    final Builder<Object> builder = new Builder<Object>(FST.INPUT_TYPE.BYTE4, outputs);
     final IntsRef scratchIntsRef = new IntsRef(10);
     for (Entry e : entries) {
       final int termLength = scratchIntsRef.length = e.term.length;