LUCENE-4617: remove fst.pack method

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1420951 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-12-12 20:28:51 +00:00
parent 01fee5dd1f
commit 97508e2a18
9 changed files with 67 additions and 89 deletions

View File

@ -180,6 +180,12 @@ API Changes
* LUCENE-4605: Added DocsEnum.FLAG_NONE which can be passed instead of 0 as
the flag to .docs() and .docsAndPositions(). (Shai Erera)
* LUCENE-4617: Remove FST.pack() method. Previously to make a packed FST,
you had to make a Builder with willPackFST=true (telling it you will later pack it),
create your fst with finish(), and then call pack() to get another FST.
Instead just pass true for doPackFST to Builder and finish() returns a packed FST.
(Robert Muir)
Bug Fixes
* LUCENE-1822: BaseFragListBuilder hard-coded 6 char margin is too naive.

View File

@ -232,7 +232,7 @@ public abstract class BinaryDictionaryWriter {
* Write dictionary in file
* Dictionary format is:
* [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
* @throws IOException
* @throws IOException if an I/O error occurs writing the dictionary files
*/
public void write(String baseDir) throws IOException {
final String baseName = getBaseFileName(baseDir);

View File

@ -162,7 +162,7 @@ public class TokenInfoDictionaryBuilder {
offset = next;
}
final FST<Long> fst = fstBuilder.finish().pack(2, 100000, PackedInts.DEFAULT);
final FST<Long> fst = fstBuilder.finish();
System.out.print(" " + fst.getNodeCount() + " nodes, " + fst.getArcCount() + " arcs, " + fst.sizeInBytes() + " bytes... ");
dictionary.setFST(fst);

View File

@ -269,9 +269,6 @@ public final class MemoryPostingsFormat extends PostingsFormat {
out.writeVLong(sumDocFreq);
out.writeVInt(docCount);
FST<BytesRef> fst = builder.finish();
if (doPackFST) {
fst = fst.pack(3, Math.max(10, fst.getNodeCount()/4), acceptableOverheadRatio);
}
fst.save(out);
//System.out.println("finish field=" + field.name + " fp=" + out.getFilePointer());
}

View File

@ -63,6 +63,10 @@ public class Builder<T> {
private final IntsRef lastInput = new IntsRef();
// for packing
private final boolean doPackFST;
private final float acceptableOverheadRatio;
// NOTE: cutting this over to ArrayList instead loses ~6%
// in build performance on 9.8M Wikipedia terms; so we
// left this as an array:
@ -135,23 +139,22 @@ public class Builder<T> {
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
* singleton output object.
*
* @param willPackFST Pass true if you will pack the FST before saving. This
* causes the FST to create additional data structures internally to enable packing, but
* it means the resulting FST cannot be saved until it
* is packed using {@link FST#pack(int, int, float)}
* @param doPackFST Pass true to create a packed FST.
*
* @param acceptableOverheadRatio How to trade speed for space when building the FST. This option
* is only relevant when willPackFST is true. @see PackedInts#getMutable(int, int, float)
*/
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
FreezeTail<T> freezeTail, boolean willPackFST, float acceptableOverheadRatio) {
FreezeTail<T> freezeTail, boolean doPackFST, float acceptableOverheadRatio) {
this.minSuffixCount1 = minSuffixCount1;
this.minSuffixCount2 = minSuffixCount2;
this.freezeTail = freezeTail;
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
this.shareMaxTailLength = shareMaxTailLength;
fst = new FST<T>(inputType, outputs, willPackFST, acceptableOverheadRatio);
this.doPackFST = doPackFST;
this.acceptableOverheadRatio = acceptableOverheadRatio;
fst = new FST<T>(inputType, outputs, doPackFST, acceptableOverheadRatio);
if (doShareSuffix) {
dedupHash = new NodeHash<T>(fst);
} else {
@ -474,7 +477,11 @@ public class Builder<T> {
//if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " root.output=" + root.output);
fst.finish(compileNode(root, lastInput.length).node);
return fst;
if (doPackFST) {
return fst.pack(3, Math.max(10, fst.getNodeCount()/4), acceptableOverheadRatio);
} else {
return fst;
}
}
private void compileAllTargets(UnCompiledNode<T> node, int tailLength) throws IOException {

View File

@ -1467,7 +1467,7 @@ public final class FST<T> {
* However, this is not a strict implementation of the
* algorithms described in this paper.
*/
public FST<T> pack(int minInCountDeref, int maxDerefNodes, float acceptableOverheadRatio) throws IOException {
FST<T> pack(int minInCountDeref, int maxDerefNodes, float acceptableOverheadRatio) throws IOException {
// TODO: other things to try
// - renumber the nodes to get more next / better locality?

View File

@ -369,61 +369,52 @@ public class TestFSTs extends LuceneTestCase {
if (ord > 0) {
final Random random = new Random(random().nextLong());
for(int rewriteIter=0;rewriteIter<2;rewriteIter++) {
if (rewriteIter == 1) {
if (doRewrite) {
// Verify again, with packed FST:
fst = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000), random.nextFloat());
} else {
break;
}
// Now confirm BytesRefFSTEnum and TermsEnum act the
// same:
final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst);
int num = atLeast(1000);
for(int iter=0;iter<num;iter++) {
final BytesRef randomTerm = new BytesRef(getRandomString(random));
if (VERBOSE) {
System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm);
}
// Now confirm BytesRefFSTEnum and TermsEnum act the
// same:
final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst);
int num = atLeast(1000);
for(int iter=0;iter<num;iter++) {
final BytesRef randomTerm = new BytesRef(getRandomString(random));
if (VERBOSE) {
System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm);
}
final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm);
final InputOutput<Long> fstSeekResult = fstEnum.seekCeil(randomTerm);
final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm);
final InputOutput<Long> fstSeekResult = fstEnum.seekCeil(randomTerm);
if (seekResult == TermsEnum.SeekStatus.END) {
assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult);
} else {
assertSame(termsEnum, fstEnum, storeOrd);
for(int nextIter=0;nextIter<10;nextIter++) {
if (seekResult == TermsEnum.SeekStatus.END) {
assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult);
} else {
assertSame(termsEnum, fstEnum, storeOrd);
for(int nextIter=0;nextIter<10;nextIter++) {
if (VERBOSE) {
System.out.println("TEST: next");
if (storeOrd) {
System.out.println(" ord=" + termsEnum.ord());
}
}
if (termsEnum.next() != null) {
if (VERBOSE) {
System.out.println("TEST: next");
if (storeOrd) {
System.out.println(" ord=" + termsEnum.ord());
}
System.out.println(" term=" + termsEnum.term().utf8ToString());
}
if (termsEnum.next() != null) {
if (VERBOSE) {
System.out.println(" term=" + termsEnum.term().utf8ToString());
}
assertNotNull(fstEnum.next());
assertSame(termsEnum, fstEnum, storeOrd);
} else {
if (VERBOSE) {
System.out.println(" end!");
}
BytesRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
if (nextResult != null) {
System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output));
fail();
}
break;
assertNotNull(fstEnum.next());
assertSame(termsEnum, fstEnum, storeOrd);
} else {
if (VERBOSE) {
System.out.println(" end!");
}
BytesRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
if (nextResult != null) {
System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output));
fail();
}
break;
}
}
}
}
}
}
@ -513,12 +504,6 @@ public class TestFSTs extends LuceneTestCase {
System.out.println("Wrote FST to out.dot");
}
if (doPack) {
System.out.println("Pack...");
fst = fst.pack(4, 100000000, random().nextFloat());
System.out.println("New size " + fst.sizeInBytes() + " bytes");
}
Directory dir = FSDirectory.open(new File(dirOut));
IndexOutput out = dir.createOutput("fst.bin", IOContext.DEFAULT);
fst.save(out);
@ -1102,13 +1087,11 @@ public class TestFSTs extends LuceneTestCase {
Util.toDot(fst, w, false, false);
w.close();
//System.out.println(w.toString());
final String expected;
if (willRewrite) {
expected = "4 -> 3 [label=\"t\" style=\"bold\"";
} else {
expected = "8 -> 6 [label=\"t\" style=\"bold\"";
}
assertTrue(w.toString().indexOf(expected) != -1);
// check for accept state at label t
assertTrue(w.toString().indexOf("[label=\"t\" style=\"bold\"") != -1);
// check for accept state at label n
assertTrue(w.toString().indexOf("[label=\"n\" style=\"bold\"") != -1);
}
// Make sure raw FST can differentiate between final vs

View File

@ -337,21 +337,6 @@ public class FSTTester<T> {
verifyPruned(inputMode, fst, prune1, prune2);
}
if (willRewrite && fst != null) {
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: now rewrite");
}
final FST<T> packed = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000), random.nextFloat());
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: now verify packed FST");
}
if (prune1 == 0 && prune2 == 0) {
verifyUnPruned(inputMode, packed);
} else {
verifyPruned(inputMode, packed, prune1, prune2);
}
}
return fst;
}