diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java index b49e0a0e272..cbfbacb19f9 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java @@ -266,6 +266,7 @@ public final class Util { private final FST fst; private final FST.BytesReader bytesReader; private final int topN; + private final int maxQueueDepth; private final FST.Arc scratchArc = new FST.Arc(); @@ -273,10 +274,11 @@ public final class Util { TreeSet> queue = null; - public TopNSearcher(FST fst, int topN, Comparator comparator) { + public TopNSearcher(FST fst, int topN, int maxQueueDepth, Comparator comparator) { this.fst = fst; this.bytesReader = fst.getBytesReader(0); this.topN = topN; + this.maxQueueDepth = maxQueueDepth; this.comparator = comparator; queue = new TreeSet>(); @@ -290,9 +292,7 @@ public final class Util { T cost = fst.outputs.add(path.cost, path.arc.output); //System.out.println(" addIfCompetitive queue.size()=" + queue.size() + " path=" + path + " + label=" + path.arc.label); - // LUCENE-4481: TODO: re-enable this pruning if we can make this admissible: - /* - if (queue.size() == topN) { + if (queue.size() == maxQueueDepth) { FSTPath bottom = queue.last(); int comp = comparator.compare(cost, bottom.cost); if (comp > 0) { @@ -314,7 +314,6 @@ public final class Util { } else { // Queue isn't full yet, so any path we hit competes: } - */ // copy over the current input to the new input // and add the arc.label to the end @@ -326,12 +325,9 @@ public final class Util { queue.add(newPath); - // LUCENE-4481: TODO: re-enable this pruning if we can make this admissible: - /* - if (queue.size() == topN+1) { + if (queue.size() == maxQueueDepth+1) { queue.pollLast(); } - */ } /** Adds all leaving arcs, including 'finished' arc, if @@ -375,6 +371,7 @@ public final class Util { // TODO: maybe we should make an FST.INPUT_TYPE.BYTE0.5!? // (nibbles) + int rejectCount = 0; // For each top N path: while (results.size() < topN) { @@ -404,13 +401,10 @@ public final class Util { continue; } - // LUCENE-4481: TODO: re-enable this pruning if we can make this admissible: - /* - if (results.size() == topN-1) { + if (results.size() == topN-1 && maxQueueDepth == topN) { // Last path -- don't bother w/ queue anymore: queue = null; } - */ //System.out.println(" path: " + path); @@ -467,6 +461,9 @@ public final class Util { T finalOutput = fst.outputs.add(path.cost, path.arc.output); if (acceptResult(path.input, finalOutput)) { results.add(new MinResult(path.input, finalOutput, comparator)); + } else { + rejectCount++; + assert rejectCount + topN <= maxQueueDepth: "maxQueueDepth (" + maxQueueDepth + ") is too small for topN (" + topN + "): rejected " + rejectCount + " paths"; } break; } else { @@ -519,7 +516,10 @@ public final class Util { * PositiveIntOutputs#getSingleton}). */ public static MinResult[] shortestPaths(FST fst, FST.Arc fromNode, T startOutput, Comparator comparator, int topN, boolean allowEmptyString) throws IOException { - TopNSearcher searcher = new TopNSearcher(fst, topN, comparator); + + // All paths are kept, so we can pass topN for + // maxQueueDepth and the pruning is admissible: + TopNSearcher searcher = new TopNSearcher(fst, topN, topN, comparator); // since this search is initialized with a single start node // it is okay to start with an empty input path here diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java index c5a02335cba..38c3e914467 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java @@ -36,6 +36,8 @@ import org.apache.lucene.search.suggest.Lookup; import org.apache.lucene.search.suggest.fst.Sort; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.InputStreamDataInput; import org.apache.lucene.store.OutputStreamDataOutput; import org.apache.lucene.util.ArrayUtil; @@ -161,6 +163,11 @@ public class AnalyzingSuggester extends Lookup { * SynonymFilter). */ private final int maxGraphExpansions; + /** Highest number of analyzed paths we saw for any single + * input surface form. For analyzers that never create + * graphs this will always be 1. */ + private int maxAnalyzedPathsForOneInput; + /** * Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int) * AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST | @@ -354,6 +361,8 @@ public class AnalyzingSuggester extends Lookup { // don't have to alloc [possibly biggish] // intermediate HashSet in RAM: Set paths = SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions); + maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size()); + for (IntsRef path : paths) { Util.toBytesRef(path, scratch); @@ -469,8 +478,10 @@ public class AnalyzingSuggester extends Lookup { @Override public boolean store(OutputStream output) throws IOException { + DataOutput dataOut = new OutputStreamDataOutput(output); try { - fst.save(new OutputStreamDataOutput(output)); + fst.save(dataOut); + dataOut.writeVInt(maxAnalyzedPathsForOneInput); } finally { IOUtils.close(output); } @@ -479,8 +490,10 @@ public class AnalyzingSuggester extends Lookup { @Override public boolean load(InputStream input) throws IOException { + DataInput dataIn = new InputStreamDataInput(input); try { - this.fst = new FST>(new InputStreamDataInput(input), new PairOutputs(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton())); + this.fst = new FST>(dataIn, new PairOutputs(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton())); + maxAnalyzedPathsForOneInput = dataIn.readVInt(); } finally { IOUtils.close(input); } @@ -529,7 +542,7 @@ public class AnalyzingSuggester extends Lookup { FST.Arc> scratchArc = new FST.Arc>(); - List results = new ArrayList(); + final List results = new ArrayList(); if (exactFirst) { @@ -545,7 +558,7 @@ public class AnalyzingSuggester extends Lookup { // Searcher just to find the single exact only // match, if present: Util.TopNSearcher> searcher; - searcher = new Util.TopNSearcher>(fst, count * maxSurfaceFormsPerAnalyzedForm, weightComparator); + searcher = new Util.TopNSearcher>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator); // NOTE: we could almost get away with only using // the first start node. The only catch is if @@ -591,18 +604,17 @@ public class AnalyzingSuggester extends Lookup { Util.TopNSearcher> searcher; searcher = new Util.TopNSearcher>(fst, - num, + num - results.size(), + num * maxAnalyzedPathsForOneInput, weightComparator) { private final Set seen = new HashSet(); @Override protected boolean acceptResult(IntsRef input, Pair output) { - //System.out.println("ACCEPT? path=" + input); // Dedup: when the input analyzes to a graph we // can get duplicate surface forms: if (seen.contains(output.output2)) { - //System.out.println("SKIP: dup"); return false; } seen.add(output.output2); @@ -615,7 +627,14 @@ public class AnalyzingSuggester extends Lookup { // create duplicate results: spare.grow(output.output2.length); UnicodeUtil.UTF8toUTF16(output.output2, spare); - return CHARSEQUENCE_COMPARATOR.compare(spare, key) != 0; + if (CHARSEQUENCE_COMPARATOR.compare(spare, key) == 0) { + // We found exact match, which means we should + // have already found it in the first search: + assert results.size() == 1; + return false; + } else { + return true; + } } } }; diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java index f4b3d3aeb56..6cb117bae4b 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java @@ -17,6 +17,11 @@ package org.apache.lucene.search.suggest.analyzing; * limitations under the License. */ +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; +import java.io.FileOutputStream; +import java.io.OutputStream; import java.io.IOException; import java.io.Reader; import java.io.StringReader; @@ -824,6 +829,29 @@ public class AnalyzingSuggesterTest extends LuceneTestCase { assertEquals(4, results.get(1).value); assertEquals("a b", results.get(2).key); assertEquals(3, results.get(2).value); + + // Try again after save/load: + File tmpDir = _TestUtil.getTempDir("AnalyzingSuggesterTest"); + tmpDir.mkdir(); + + File path = new File(tmpDir, "suggester"); + + OutputStream os = new FileOutputStream(path); + suggester.store(os); + os.close(); + + InputStream is = new FileInputStream(path); + suggester.load(is); + is.close(); + + results = suggester.lookup("a", false, 3); + assertEquals(3, results.size()); + assertEquals("a", results.get(0).key); + assertEquals(5, results.get(0).value); + assertEquals("a c", results.get(1).key); + assertEquals(4, results.get(1).value); + assertEquals("a b", results.get(2).key); + assertEquals(3, results.get(2).value); } public void testDupSurfaceFormsMissingResults() throws Exception { @@ -863,6 +891,27 @@ public class AnalyzingSuggesterTest extends LuceneTestCase { assertEquals(6, results.get(0).value); assertEquals("nellie", results.get(1).key); assertEquals(5, results.get(1).value); + + // Try again after save/load: + File tmpDir = _TestUtil.getTempDir("AnalyzingSuggesterTest"); + tmpDir.mkdir(); + + File path = new File(tmpDir, "suggester"); + + OutputStream os = new FileOutputStream(path); + suggester.store(os); + os.close(); + + InputStream is = new FileInputStream(path); + suggester.load(is); + is.close(); + + results = suggester.lookup("nellie", false, 2); + assertEquals(2, results.size()); + assertEquals("hambone", results.get(0).key); + assertEquals(6, results.get(0).value); + assertEquals("nellie", results.get(1).key); + assertEquals(5, results.get(1).value); } public void testDupSurfaceFormsMissingResults2() throws Exception { @@ -912,5 +961,26 @@ public class AnalyzingSuggesterTest extends LuceneTestCase { assertEquals(6, results.get(0).value); assertEquals("b", results.get(1).key); assertEquals(5, results.get(1).value); + + // Try again after save/load: + File tmpDir = _TestUtil.getTempDir("AnalyzingSuggesterTest"); + tmpDir.mkdir(); + + File path = new File(tmpDir, "suggester"); + + OutputStream os = new FileOutputStream(path); + suggester.store(os); + os.close(); + + InputStream is = new FileInputStream(path); + suggester.load(is); + is.close(); + + results = suggester.lookup("a", false, 2); + assertEquals(2, results.size()); + assertEquals("a", results.get(0).key); + assertEquals(6, results.get(0).value); + assertEquals("b", results.get(1).key); + assertEquals(5, results.get(1).value); } }