diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 5788a5f3e5b..ab60dae32d3 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -179,6 +179,12 @@ API Changes * LUCENE-4605: Added DocsEnum.FLAG_NONE which can be passed instead of 0 as the flag to .docs() and .docsAndPositions(). (Shai Erera) + +* LUCENE-4617: Remove FST.pack() method. Previously to make a packed FST, + you had to make a Builder with willPackFST=true (telling it you will later pack it), + create your fst with finish(), and then call pack() to get another FST. + Instead just pass true for doPackFST to Builder and finish() returns a packed FST. + (Robert Muir) Bug Fixes diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat index a914db078ae..2b1cedb59e2 100644 Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat differ diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/BinaryDictionaryWriter.java b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/BinaryDictionaryWriter.java index a9625fd36ee..3e5ed728f38 100644 --- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/BinaryDictionaryWriter.java +++ b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/BinaryDictionaryWriter.java @@ -232,7 +232,7 @@ public abstract class BinaryDictionaryWriter { * Write dictionary in file * Dictionary format is: * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...]..... - * @throws IOException + * @throws IOException if an I/O error occurs writing the dictionary files */ public void write(String baseDir) throws IOException { final String baseName = getBaseFileName(baseDir); diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java index 2465c6aa6d8..569908da694 100644 --- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java +++ b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java @@ -162,7 +162,7 @@ public class TokenInfoDictionaryBuilder { offset = next; } - final FST fst = fstBuilder.finish().pack(2, 100000, PackedInts.DEFAULT); + final FST fst = fstBuilder.finish(); System.out.print(" " + fst.getNodeCount() + " nodes, " + fst.getArcCount() + " arcs, " + fst.sizeInBytes() + " bytes... "); dictionary.setFST(fst); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java index 3dab4df65c3..cff0ec0f063 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java @@ -269,9 +269,6 @@ public final class MemoryPostingsFormat extends PostingsFormat { out.writeVLong(sumDocFreq); out.writeVInt(docCount); FST fst = builder.finish(); - if (doPackFST) { - fst = fst.pack(3, Math.max(10, fst.getNodeCount()/4), acceptableOverheadRatio); - } fst.save(out); //System.out.println("finish field=" + field.name + " fp=" + out.getFilePointer()); } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java b/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java index c45d7b12303..d5c1467348c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java @@ -62,6 +62,10 @@ public class Builder { private final int shareMaxTailLength; private final IntsRef lastInput = new IntsRef(); + + // for packing + private final boolean doPackFST; + private final float acceptableOverheadRatio; // NOTE: cutting this over to ArrayList instead loses ~6% // in build performance on 9.8M Wikipedia terms; so we @@ -135,23 +139,22 @@ public class Builder { * FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the * singleton output object. * - * @param willPackFST Pass true if you will pack the FST before saving. This - * causes the FST to create additional data structures internally to enable packing, but - * it means the resulting FST cannot be saved until it - * is packed using {@link FST#pack(int, int, float)} - * + * @param doPackFST Pass true to create a packed FST. + * * @param acceptableOverheadRatio How to trade speed for space when building the FST. This option * is only relevant when willPackFST is true. @see PackedInts#getMutable(int, int, float) */ public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix, boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs outputs, - FreezeTail freezeTail, boolean willPackFST, float acceptableOverheadRatio) { + FreezeTail freezeTail, boolean doPackFST, float acceptableOverheadRatio) { this.minSuffixCount1 = minSuffixCount1; this.minSuffixCount2 = minSuffixCount2; this.freezeTail = freezeTail; this.doShareNonSingletonNodes = doShareNonSingletonNodes; this.shareMaxTailLength = shareMaxTailLength; - fst = new FST(inputType, outputs, willPackFST, acceptableOverheadRatio); + this.doPackFST = doPackFST; + this.acceptableOverheadRatio = acceptableOverheadRatio; + fst = new FST(inputType, outputs, doPackFST, acceptableOverheadRatio); if (doShareSuffix) { dedupHash = new NodeHash(fst); } else { @@ -474,7 +477,11 @@ public class Builder { //if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " root.output=" + root.output); fst.finish(compileNode(root, lastInput.length).node); - return fst; + if (doPackFST) { + return fst.pack(3, Math.max(10, fst.getNodeCount()/4), acceptableOverheadRatio); + } else { + return fst; + } } private void compileAllTargets(UnCompiledNode node, int tailLength) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index 4853f29e5df..24f30681dd9 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -1467,7 +1467,7 @@ public final class FST { * However, this is not a strict implementation of the * algorithms described in this paper. */ - public FST pack(int minInCountDeref, int maxDerefNodes, float acceptableOverheadRatio) throws IOException { + FST pack(int minInCountDeref, int maxDerefNodes, float acceptableOverheadRatio) throws IOException { // TODO: other things to try // - renumber the nodes to get more next / better locality? diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java index 5f17681008c..176dbca3770 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java @@ -369,61 +369,52 @@ public class TestFSTs extends LuceneTestCase { if (ord > 0) { final Random random = new Random(random().nextLong()); - for(int rewriteIter=0;rewriteIter<2;rewriteIter++) { - if (rewriteIter == 1) { - if (doRewrite) { - // Verify again, with packed FST: - fst = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000), random.nextFloat()); - } else { - break; - } + // Now confirm BytesRefFSTEnum and TermsEnum act the + // same: + final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(fst); + int num = atLeast(1000); + for(int iter=0;iter fstEnum = new BytesRefFSTEnum(fst); - int num = atLeast(1000); - for(int iter=0;iter fstSeekResult = fstEnum.seekCeil(randomTerm); - - if (seekResult == TermsEnum.SeekStatus.END) { - assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult); - } else { - assertSame(termsEnum, fstEnum, storeOrd); - for(int nextIter=0;nextIter<10;nextIter++) { + + final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm); + final InputOutput fstSeekResult = fstEnum.seekCeil(randomTerm); + + if (seekResult == TermsEnum.SeekStatus.END) { + assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult); + } else { + assertSame(termsEnum, fstEnum, storeOrd); + for(int nextIter=0;nextIter<10;nextIter++) { + if (VERBOSE) { + System.out.println("TEST: next"); + if (storeOrd) { + System.out.println(" ord=" + termsEnum.ord()); + } + } + if (termsEnum.next() != null) { if (VERBOSE) { - System.out.println("TEST: next"); - if (storeOrd) { - System.out.println(" ord=" + termsEnum.ord()); - } + System.out.println(" term=" + termsEnum.term().utf8ToString()); } - if (termsEnum.next() != null) { - if (VERBOSE) { - System.out.println(" term=" + termsEnum.term().utf8ToString()); - } - assertNotNull(fstEnum.next()); - assertSame(termsEnum, fstEnum, storeOrd); - } else { - if (VERBOSE) { - System.out.println(" end!"); - } - BytesRefFSTEnum.InputOutput nextResult = fstEnum.next(); - if (nextResult != null) { - System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output)); - fail(); - } - break; + assertNotNull(fstEnum.next()); + assertSame(termsEnum, fstEnum, storeOrd); + } else { + if (VERBOSE) { + System.out.println(" end!"); } + BytesRefFSTEnum.InputOutput nextResult = fstEnum.next(); + if (nextResult != null) { + System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output)); + fail(); + } + break; } } } } + } } @@ -513,12 +504,6 @@ public class TestFSTs extends LuceneTestCase { System.out.println("Wrote FST to out.dot"); } - if (doPack) { - System.out.println("Pack..."); - fst = fst.pack(4, 100000000, random().nextFloat()); - System.out.println("New size " + fst.sizeInBytes() + " bytes"); - } - Directory dir = FSDirectory.open(new File(dirOut)); IndexOutput out = dir.createOutput("fst.bin", IOContext.DEFAULT); fst.save(out); @@ -1102,13 +1087,11 @@ public class TestFSTs extends LuceneTestCase { Util.toDot(fst, w, false, false); w.close(); //System.out.println(w.toString()); - final String expected; - if (willRewrite) { - expected = "4 -> 3 [label=\"t\" style=\"bold\""; - } else { - expected = "8 -> 6 [label=\"t\" style=\"bold\""; - } - assertTrue(w.toString().indexOf(expected) != -1); + + // check for accept state at label t + assertTrue(w.toString().indexOf("[label=\"t\" style=\"bold\"") != -1); + // check for accept state at label n + assertTrue(w.toString().indexOf("[label=\"n\" style=\"bold\"") != -1); } // Make sure raw FST can differentiate between final vs diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java b/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java index 4c52956eaff..be21c1a0f47 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java @@ -337,21 +337,6 @@ public class FSTTester { verifyPruned(inputMode, fst, prune1, prune2); } - if (willRewrite && fst != null) { - if (LuceneTestCase.VERBOSE) { - System.out.println("TEST: now rewrite"); - } - final FST packed = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000), random.nextFloat()); - if (LuceneTestCase.VERBOSE) { - System.out.println("TEST: now verify packed FST"); - } - if (prune1 == 0 && prune2 == 0) { - verifyUnPruned(inputMode, packed); - } else { - verifyPruned(inputMode, packed, prune1, prune2); - } - } - return fst; }