mirror of https://github.com/apache/lucene.git
LUCENE-4617: remove fst.pack method
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1420951 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
01fee5dd1f
commit
97508e2a18
|
@ -180,6 +180,12 @@ API Changes
|
|||
* LUCENE-4605: Added DocsEnum.FLAG_NONE which can be passed instead of 0 as
|
||||
the flag to .docs() and .docsAndPositions(). (Shai Erera)
|
||||
|
||||
* LUCENE-4617: Remove FST.pack() method. Previously, to make a packed FST,
|
||||
you had to make a Builder with willPackFST=true (telling it you will later pack it),
|
||||
create your fst with finish(), and then call pack() to get another FST.
|
||||
Instead, just pass true for doPackFST to Builder, and finish() returns a packed FST.
|
||||
(Robert Muir)
|
||||
|
||||
Bug Fixes
|
||||
|
||||
* LUCENE-1822: BaseFragListBuilder hard-coded 6 char margin is too naive.
|
||||
|
|
Binary file not shown.
|
@ -232,7 +232,7 @@ public abstract class BinaryDictionaryWriter {
|
|||
* Write dictionary in file
|
||||
* Dictionary format is:
|
||||
* [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
|
||||
* @throws IOException
|
||||
* @throws IOException if an I/O error occurs writing the dictionary files
|
||||
*/
|
||||
public void write(String baseDir) throws IOException {
|
||||
final String baseName = getBaseFileName(baseDir);
|
||||
|
|
|
@ -162,7 +162,7 @@ public class TokenInfoDictionaryBuilder {
|
|||
offset = next;
|
||||
}
|
||||
|
||||
final FST<Long> fst = fstBuilder.finish().pack(2, 100000, PackedInts.DEFAULT);
|
||||
final FST<Long> fst = fstBuilder.finish();
|
||||
|
||||
System.out.print(" " + fst.getNodeCount() + " nodes, " + fst.getArcCount() + " arcs, " + fst.sizeInBytes() + " bytes... ");
|
||||
dictionary.setFST(fst);
|
||||
|
|
|
@ -269,9 +269,6 @@ public final class MemoryPostingsFormat extends PostingsFormat {
|
|||
out.writeVLong(sumDocFreq);
|
||||
out.writeVInt(docCount);
|
||||
FST<BytesRef> fst = builder.finish();
|
||||
if (doPackFST) {
|
||||
fst = fst.pack(3, Math.max(10, fst.getNodeCount()/4), acceptableOverheadRatio);
|
||||
}
|
||||
fst.save(out);
|
||||
//System.out.println("finish field=" + field.name + " fp=" + out.getFilePointer());
|
||||
}
|
||||
|
|
|
@ -63,6 +63,10 @@ public class Builder<T> {
|
|||
|
||||
private final IntsRef lastInput = new IntsRef();
|
||||
|
||||
// for packing
|
||||
private final boolean doPackFST;
|
||||
private final float acceptableOverheadRatio;
|
||||
|
||||
// NOTE: cutting this over to ArrayList instead loses ~6%
|
||||
// in build performance on 9.8M Wikipedia terms; so we
|
||||
// left this as an array:
|
||||
|
@ -135,23 +139,22 @@ public class Builder<T> {
|
|||
* FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
|
||||
* singleton output object.
|
||||
*
|
||||
* @param willPackFST Pass true if you will pack the FST before saving. This
|
||||
* causes the FST to create additional data structures internally to enable packing, but
|
||||
* it means the resulting FST cannot be saved until it
|
||||
* is packed using {@link FST#pack(int, int, float)}
|
||||
* @param doPackFST Pass true to create a packed FST.
|
||||
*
|
||||
* @param acceptableOverheadRatio How to trade speed for space when building the FST. This option
|
||||
* is only relevant when doPackFST is true. @see PackedInts#getMutable(int, int, float)
|
||||
*/
|
||||
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix,
|
||||
boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs,
|
||||
FreezeTail<T> freezeTail, boolean willPackFST, float acceptableOverheadRatio) {
|
||||
FreezeTail<T> freezeTail, boolean doPackFST, float acceptableOverheadRatio) {
|
||||
this.minSuffixCount1 = minSuffixCount1;
|
||||
this.minSuffixCount2 = minSuffixCount2;
|
||||
this.freezeTail = freezeTail;
|
||||
this.doShareNonSingletonNodes = doShareNonSingletonNodes;
|
||||
this.shareMaxTailLength = shareMaxTailLength;
|
||||
fst = new FST<T>(inputType, outputs, willPackFST, acceptableOverheadRatio);
|
||||
this.doPackFST = doPackFST;
|
||||
this.acceptableOverheadRatio = acceptableOverheadRatio;
|
||||
fst = new FST<T>(inputType, outputs, doPackFST, acceptableOverheadRatio);
|
||||
if (doShareSuffix) {
|
||||
dedupHash = new NodeHash<T>(fst);
|
||||
} else {
|
||||
|
@ -474,7 +477,11 @@ public class Builder<T> {
|
|||
//if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " root.output=" + root.output);
|
||||
fst.finish(compileNode(root, lastInput.length).node);
|
||||
|
||||
return fst;
|
||||
if (doPackFST) {
|
||||
return fst.pack(3, Math.max(10, fst.getNodeCount()/4), acceptableOverheadRatio);
|
||||
} else {
|
||||
return fst;
|
||||
}
|
||||
}
|
||||
|
||||
private void compileAllTargets(UnCompiledNode<T> node, int tailLength) throws IOException {
|
||||
|
|
|
@ -1467,7 +1467,7 @@ public final class FST<T> {
|
|||
* However, this is not a strict implementation of the
|
||||
* algorithms described in this paper.
|
||||
*/
|
||||
public FST<T> pack(int minInCountDeref, int maxDerefNodes, float acceptableOverheadRatio) throws IOException {
|
||||
FST<T> pack(int minInCountDeref, int maxDerefNodes, float acceptableOverheadRatio) throws IOException {
|
||||
|
||||
// TODO: other things to try
|
||||
// - renumber the nodes to get more next / better locality?
|
||||
|
|
|
@ -369,61 +369,52 @@ public class TestFSTs extends LuceneTestCase {
|
|||
|
||||
if (ord > 0) {
|
||||
final Random random = new Random(random().nextLong());
|
||||
for(int rewriteIter=0;rewriteIter<2;rewriteIter++) {
|
||||
if (rewriteIter == 1) {
|
||||
if (doRewrite) {
|
||||
// Verify again, with packed FST:
|
||||
fst = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000), random.nextFloat());
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
// Now confirm BytesRefFSTEnum and TermsEnum act the
|
||||
// same:
|
||||
final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst);
|
||||
int num = atLeast(1000);
|
||||
for(int iter=0;iter<num;iter++) {
|
||||
final BytesRef randomTerm = new BytesRef(getRandomString(random));
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm);
|
||||
}
|
||||
// Now confirm BytesRefFSTEnum and TermsEnum act the
|
||||
// same:
|
||||
final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst);
|
||||
int num = atLeast(1000);
|
||||
for(int iter=0;iter<num;iter++) {
|
||||
final BytesRef randomTerm = new BytesRef(getRandomString(random));
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm);
|
||||
}
|
||||
final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm);
|
||||
final InputOutput<Long> fstSeekResult = fstEnum.seekCeil(randomTerm);
|
||||
|
||||
final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm);
|
||||
final InputOutput<Long> fstSeekResult = fstEnum.seekCeil(randomTerm);
|
||||
|
||||
if (seekResult == TermsEnum.SeekStatus.END) {
|
||||
assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult);
|
||||
} else {
|
||||
assertSame(termsEnum, fstEnum, storeOrd);
|
||||
for(int nextIter=0;nextIter<10;nextIter++) {
|
||||
if (seekResult == TermsEnum.SeekStatus.END) {
|
||||
assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult);
|
||||
} else {
|
||||
assertSame(termsEnum, fstEnum, storeOrd);
|
||||
for(int nextIter=0;nextIter<10;nextIter++) {
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: next");
|
||||
if (storeOrd) {
|
||||
System.out.println(" ord=" + termsEnum.ord());
|
||||
}
|
||||
}
|
||||
if (termsEnum.next() != null) {
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: next");
|
||||
if (storeOrd) {
|
||||
System.out.println(" ord=" + termsEnum.ord());
|
||||
}
|
||||
System.out.println(" term=" + termsEnum.term().utf8ToString());
|
||||
}
|
||||
if (termsEnum.next() != null) {
|
||||
if (VERBOSE) {
|
||||
System.out.println(" term=" + termsEnum.term().utf8ToString());
|
||||
}
|
||||
assertNotNull(fstEnum.next());
|
||||
assertSame(termsEnum, fstEnum, storeOrd);
|
||||
} else {
|
||||
if (VERBOSE) {
|
||||
System.out.println(" end!");
|
||||
}
|
||||
BytesRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
|
||||
if (nextResult != null) {
|
||||
System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output));
|
||||
fail();
|
||||
}
|
||||
break;
|
||||
assertNotNull(fstEnum.next());
|
||||
assertSame(termsEnum, fstEnum, storeOrd);
|
||||
} else {
|
||||
if (VERBOSE) {
|
||||
System.out.println(" end!");
|
||||
}
|
||||
BytesRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
|
||||
if (nextResult != null) {
|
||||
System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output));
|
||||
fail();
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -513,12 +504,6 @@ public class TestFSTs extends LuceneTestCase {
|
|||
System.out.println("Wrote FST to out.dot");
|
||||
}
|
||||
|
||||
if (doPack) {
|
||||
System.out.println("Pack...");
|
||||
fst = fst.pack(4, 100000000, random().nextFloat());
|
||||
System.out.println("New size " + fst.sizeInBytes() + " bytes");
|
||||
}
|
||||
|
||||
Directory dir = FSDirectory.open(new File(dirOut));
|
||||
IndexOutput out = dir.createOutput("fst.bin", IOContext.DEFAULT);
|
||||
fst.save(out);
|
||||
|
@ -1102,13 +1087,11 @@ public class TestFSTs extends LuceneTestCase {
|
|||
Util.toDot(fst, w, false, false);
|
||||
w.close();
|
||||
//System.out.println(w.toString());
|
||||
final String expected;
|
||||
if (willRewrite) {
|
||||
expected = "4 -> 3 [label=\"t\" style=\"bold\"";
|
||||
} else {
|
||||
expected = "8 -> 6 [label=\"t\" style=\"bold\"";
|
||||
}
|
||||
assertTrue(w.toString().indexOf(expected) != -1);
|
||||
|
||||
// check for accept state at label t
|
||||
assertTrue(w.toString().indexOf("[label=\"t\" style=\"bold\"") != -1);
|
||||
// check for accept state at label n
|
||||
assertTrue(w.toString().indexOf("[label=\"n\" style=\"bold\"") != -1);
|
||||
}
|
||||
|
||||
// Make sure raw FST can differentiate between final vs
|
||||
|
|
|
@ -337,21 +337,6 @@ public class FSTTester<T> {
|
|||
verifyPruned(inputMode, fst, prune1, prune2);
|
||||
}
|
||||
|
||||
if (willRewrite && fst != null) {
|
||||
if (LuceneTestCase.VERBOSE) {
|
||||
System.out.println("TEST: now rewrite");
|
||||
}
|
||||
final FST<T> packed = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000), random.nextFloat());
|
||||
if (LuceneTestCase.VERBOSE) {
|
||||
System.out.println("TEST: now verify packed FST");
|
||||
}
|
||||
if (prune1 == 0 && prune2 == 0) {
|
||||
verifyUnPruned(inputMode, packed);
|
||||
} else {
|
||||
verifyPruned(inputMode, packed, prune1, prune2);
|
||||
}
|
||||
}
|
||||
|
||||
return fst;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue