diff --git a/lucene/common-build.xml b/lucene/common-build.xml
index 1f1b5c6cd74..089e0825cae 100644
--- a/lucene/common-build.xml
+++ b/lucene/common-build.xml
@@ -819,11 +819,10 @@
-
+
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
index 82f417d6e02..32898a8094b 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
@@ -857,6 +857,7 @@ public final class Util {
    */
   public static <T> Arc<T> readCeilArc(int label, FST<T> fst, Arc<T> follow,
       Arc<T> arc, BytesReader in) throws IOException {
+    // TODO: maybe this is useful in the FST class - we could simplify some other code like FSTEnum?
     if (label == FST.END_LABEL) {
       if (follow.isFinal()) {
         if (follow.target <= 0) {
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
index 79e36c27bb1..06e21e976a6 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
@@ -679,24 +679,40 @@
     }
   };
 
+  /**
+   * Returns a new {@link PathIntersector}.
+   */
   protected PathIntersector getPathIntersector(Automaton automaton, FST<Pair<Long,BytesRef>> fst) {
     return new PathIntersector(automaton, fst);
   }
 
+  /**
+   * This class is used to obtain the prefix paths in the automaton that also intersect the FST.
+   */
   protected static class PathIntersector {
     protected List<Path<Pair<Long,BytesRef>>> intersect;
     protected final Automaton automaton;
     protected final FST<Pair<Long,BytesRef>> fst;
+
+    /**
+     * Creates a new {@link PathIntersector}.
+     */
     public PathIntersector(Automaton automaton, FST<Pair<Long,BytesRef>> fst) {
       this.automaton = automaton;
       this.fst = fst;
     }
 
+    /**
+     * Returns the prefix paths for an exact-first top N search.
+     */
     public List<Path<Pair<Long,BytesRef>>> intersectExact() throws IOException {
-      return intersect = FSTUtil.intersectPrefixPathsExact(automaton, fst);
+      return intersect = FSTUtil.intersectPrefixPaths(automaton, fst);
     }
 
+    /**
+     * Returns the prefix paths for a top N search.
+     */
     public List<Path<Pair<Long,BytesRef>>> intersectAll() throws IOException {
-      return intersect == null ? intersect = FSTUtil.intersectPrefixPathsExact(automaton, fst) : intersect;
+      return intersect == null ? intersect = FSTUtil.intersectPrefixPaths(automaton, fst) : intersect;
     }
   }
 }
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java
index f8332c635c6..9fa52d31179 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java
@@ -64,63 +64,12 @@ public class FSTUtil {
     }
   }
 
-  /** Enumerates all paths in the automaton that also
-   *  intersect the FST, accumulating the FST end node and
-   *  output for each path. */
-  public static <T> List<Path<T>> intersectPrefixPathsExact(Automaton a, FST<T> fst) throws IOException {
-    final List<Path<T>> queue = new ArrayList<Path<T>>();
-    final List<Path<T>> endNodes = new ArrayList<Path<T>>();
-
-    queue.add(new Path<T>(a.getInitialState(),
-                          fst.getFirstArc(new FST.Arc<T>()),
-                          fst.outputs.getNoOutput(),
-                          new IntsRef()));
-
-    final FST.Arc<T> scratchArc = new FST.Arc<T>();
-    final FST.BytesReader fstReader = fst.getBytesReader(0);
-
-    //System.out.println("fst/a intersect");
-
-    while (queue.size() != 0) {
-      final Path<T> path = queue.remove(queue.size()-1);
-      //System.out.println("  cycle path=" + path);
-      if (path.state.isAccept()) {
-        endNodes.add(path);
-      }
-
-      IntsRef currentInput = path.input;
-      for(Transition t : path.state.getTransitions()) {
-        // TODO: we can fix this if necessary:
-        if (t.getMin() != t.getMax()) {
-          throw new IllegalStateException("can only handle Transitions that match one character");
-        }
-
-        //System.out.println("    t=" + (char) t.getMin());
-
-        final FST.Arc<T> nextArc = fst.findTargetArc(t.getMin(), path.fstNode, scratchArc, fstReader);
-        if (nextArc != null) {
-          //System.out.println("      fst matches");
-          // Path continues:
-          IntsRef newInput = new IntsRef(currentInput.length + 1);
-          newInput.copyInts(currentInput);
-          newInput.ints[currentInput.length] = t.getMin();
-          newInput.length = currentInput.length + 1;
-
-          queue.add(new Path<T>(t.getDest(),
-                                new FST.Arc<T>().copyFrom(nextArc),
-                                fst.outputs.add(path.output, nextArc.output),
-                                newInput));
-        }
-      }
-    }
-
-    return endNodes;
-  }
-
   /**
-   * nocommit javadoc
+   * Enumerates all minimal prefix paths in the automaton that also intersect the FST,
+   * accumulating the FST end node and output for each path.
    */
-  public static <T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst) throws IOException {
+  public static <T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst)
+      throws IOException {
     assert a.isDeterministic();
     final List<Path<T>> queue = new ArrayList<Path<T>>();
     final List<Path<T>> endNodes = new ArrayList<Path<T>>();
@@ -135,14 +84,16 @@ public class FSTUtil {
       final Path<T> path = queue.remove(queue.size() - 1);
       if (path.state.isAccept()) {
         endNodes.add(path);
+        // we can stop here if we accept this path,
+        // since we accept all further paths too
         continue;
       }
-//      System.out.println(UnicodeUtil.newString(path.input.ints, path.input.offset, path.input.length));
-
+
       IntsRef currentInput = path.input;
       for (Transition t : path.state.getTransitions()) {
-
-        if (t.getMin() == t.getMax()) {
+        final int min = t.getMin();
+        final int max = t.getMax();
+        if (min == max) {
           final FST.Arc<T> nextArc = fst.findTargetArc(t.getMin(),
               path.fstNode, scratchArc, fstReader);
           if (nextArc != null) {
@@ -150,32 +101,26 @@ public class FSTUtil {
             newInput.copyInts(currentInput);
             newInput.ints[currentInput.length] = t.getMin();
             newInput.length = currentInput.length + 1;
-//            if (t.getDest().isAccept()) {
-//              System.out.println(UnicodeUtil.newString(newInput.ints, newInput.offset, newInput.length));
-//            }
             queue.add(new Path<T>(t.getDest(), new FST.Arc<T>()
                 .copyFrom(nextArc), fst.outputs
                 .add(path.output, nextArc.output), newInput));
           }
         } else {
-          // TODO: 
+          // TODO:
           // if we accept the entire range possible in the FST (ie. 0 to 256)
           // we can simply use the prefix as the accepted state instead of
           // looking up all the
           // ranges and terminate early here?
-          FST.Arc<T> nextArc = Util.readCeilArc(t.getMin(), fst, path.fstNode,
+          FST.Arc<T> nextArc = Util.readCeilArc(min, fst, path.fstNode,
               scratchArc, fstReader);
-          while (nextArc != null && nextArc.label <= t.getMax()) {
-            assert nextArc.label <= t.getMax();
-            assert nextArc.label >= t.getMin() : nextArc.label + " "
-                + t.getMin();
+          while (nextArc != null && nextArc.label <= max) {
+            assert nextArc.label <= max;
+            assert nextArc.label >= min : nextArc.label + " "
+                + min;
             final IntsRef newInput = new IntsRef(currentInput.length + 1);
             newInput.copyInts(currentInput);
             newInput.ints[currentInput.length] = nextArc.label;
             newInput.length = currentInput.length + 1;
-//            if (t.getDest().isAccept()) {
-//              System.out.println(UnicodeUtil.newString(newInput.ints, newInput.offset, newInput.length));
-//            }
             queue.add(new Path<T>(t.getDest(), new FST.Arc<T>()
                 .copyFrom(nextArc), fst.outputs
                 .add(path.output, nextArc.output), newInput));
@@ -188,13 +133,7 @@ public class FSTUtil {
           }
         }
       }
     }
-    //System.out.println();
-
-    for (Path<T> path2 : endNodes) {
-      if ("poales".equals(UnicodeUtil.newString(path2.input.ints, path2.input.offset, path2.input.length)))
-        System.out.println(UnicodeUtil.newString(path2.input.ints, path2.input.offset, path2.input.length));
-    }
-    return endNodes;
-  }
+    return endNodes;
+  }
 }
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
index 5973caacbad..64c92984f6d 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
@@ -1,23 +1,4 @@
 package org.apache.lucene.search.suggest.analyzing;
-
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester.PathIntersector;
-import org.apache.lucene.search.suggest.analyzing.FSTUtil.Path;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.IntsRef;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.BasicAutomata;
-import org.apache.lucene.util.automaton.BasicOperations;
-import org.apache.lucene.util.automaton.LevenshteinAutomata;
-import org.apache.lucene.util.automaton.SpecialOperations;
-import org.apache.lucene.util.fst.FST;
-import org.apache.lucene.util.fst.PairOutputs.Pair;
-
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -34,24 +15,117 @@ import org.apache.lucene.util.fst.PairOutputs.Pair;
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Set;
 
-public class FuzzySuggester extends AnalyzingSuggester {
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.search.suggest.analyzing.FSTUtil.Path;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.LevenshteinAutomata;
+import org.apache.lucene.util.automaton.SpecialOperations;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.PairOutputs.Pair;
+
+/**
+ * Implements a fuzzy {@link AnalyzingSuggester}. The similarity measurement is
+ * based on the Damerau-Levenshtein (optimal string alignment) algorithm, though
+ * you can explicitly choose classic Levenshtein by passing <code>false</code>
+ * to the <code>transpositions</code> parameter.
+ * <p>
+ * At most, this query will match terms up to
+ * {@value org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}
+ * edits. Higher distances (especially with transpositions enabled) are not
+ * supported.
+ * <p>
+ * Note: complex query analyzers can have a significant impact on lookup
+ * performance. It is recommended not to use query analyzers that drop or inject
+ * terms (such as synonym filters), so that the prefix intersection stays cheap
+ * and lookups stay fast. At index time, complex analyzers can safely be used.
+ */
+public final class FuzzySuggester extends AnalyzingSuggester {
   private final int maxEdits;
   private final boolean transpositions;
   private final int minPrefix;
 
+  /**
+   * The default minimum shared (non-fuzzy) prefix. Set to 2.
+   */
+  public static final int DEFAULT_MIN_PREFIX = 2;
+
+  /**
+   * The default maximum number of edits for fuzzy suggestions. Set to 1.
+   */
+  public static final int DEFAULT_MAX_EDITS = 1;
+
+  /**
+   * Creates a {@link FuzzySuggester} instance initialized with default values.
+   * Calls
+   * {@link FuzzySuggester#FuzzySuggester(Analyzer, Analyzer, int, int, int, int, boolean, int)}
+   * FuzzySuggester(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1,
+   * DEFAULT_MAX_EDITS, true, DEFAULT_MIN_PREFIX)
+   *
+   * @param analyzer
+   *          the analyzer used for this suggester
+   */
   public FuzzySuggester(Analyzer analyzer) {
     this(analyzer, analyzer);
   }
 
+  /**
+   * Creates a {@link FuzzySuggester} instance with an index and a query analyzer,
+   * initialized with default values. Calls
+   * {@link FuzzySuggester#FuzzySuggester(Analyzer, Analyzer, int, int, int, int, boolean, int)}
+   * FuzzySuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1,
+   * DEFAULT_MAX_EDITS, true, DEFAULT_MIN_PREFIX)
+   *
+   * @param indexAnalyzer
+   *          Analyzer that will be used for analyzing suggestions while building the index.
+   * @param queryAnalyzer
+   *          Analyzer that will be used for analyzing query text during lookup
+   */
   public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
-    this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, 1, true, 1);
+    this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, true, DEFAULT_MIN_PREFIX);
   }
 
-  // nocommit: probably want an option to like, require the first character or something :)
+  /**
+   * Creates a {@link FuzzySuggester} instance.
+   *
+   * @param indexAnalyzer Analyzer that will be used for
+   *        analyzing suggestions while building the index.
+   * @param queryAnalyzer Analyzer that will be used for
+   *        analyzing query text during lookup
+   * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
+   * @param maxSurfaceFormsPerAnalyzedForm Maximum number of
+   *        surface forms to keep for a single analyzed form.
+   *        When there are too many surface forms we discard the
+   *        lowest weighted ones.
+   * @param maxGraphExpansions Maximum number of graph paths
+   *        to expand from the analyzed form. Set this to -1 for
+   *        no limit.
+   *
+   * @param maxEdits must be >= 0 and <= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}.
+   * @param transpositions <code>true</code> if transpositions should be treated as a primitive
+   *        edit operation. If this is false, comparisons will implement the classic
+   *        Levenshtein algorithm.
+   * @param minPrefix length of the common (non-fuzzy) prefix
+   */
   public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
       int maxEdits, boolean transpositions, int minPrefix) {
     super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);
+    if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
+      throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
+    }
+    if (minPrefix < 0) {
+      throw new IllegalArgumentException("minPrefix must not be < 0");
+    }
     this.maxEdits = maxEdits;
     this.transpositions = transpositions;
     this.minPrefix = minPrefix;
@@ -66,8 +140,7 @@ public class FuzzySuggester extends AnalyzingSuggester {
   }
 
   final Automaton toLevenshteinAutomata(Automaton automaton) {
-    // nocommit: how slow can this be :)
-    Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
+    final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
     Automaton subs[] = new Automaton[ref.size()];
     int upto = 0;
     for (IntsRef path : ref) {
@@ -92,7 +165,7 @@ public class FuzzySuggester extends AnalyzingSuggester {
       return subs[0];
     } else {
       Automaton a = BasicOperations.union(Arrays.asList(subs));
-      // nocommit: we could call toLevenshteinAutomata() before det?
+      // TODO: we could call toLevenshteinAutomata() before det?
       // this only happens if you have multiple paths anyway (e.g. synonyms)
       BasicOperations.determinize(a);
       return a;
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java
index 26a26ad69da..49a5443be7e 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java
@@ -48,7 +48,7 @@ import org.junit.Ignore;
 /**
  * Benchmarks tests for implementations of {@link Lookup} interface.
  */
-//@Ignore("COMMENT ME TO RUN BENCHMARKS!")
+@Ignore("COMMENT ME TO RUN BENCHMARKS!")
 public class LookupBenchmarkTest extends LuceneTestCase {
   @SuppressWarnings("unchecked")
   private final List<Class<? extends Lookup>> benchmarkClasses = Arrays.asList(
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
index 8b30ae5c113..53c5ab7dac7 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
@@ -57,6 +57,27 @@ import org.apache.lucene.util.fst.Util;
 
 public class FuzzySuggesterTest extends LuceneTestCase {
 
+  public void testRandomEdits() throws IOException {
+    List<TermFreq> keys = new ArrayList<TermFreq>();
+    int numTerms = atLeast(100);
+    for (int i = 0; i < numTerms; i++) {
+      keys.add(new TermFreq("boo" + _TestUtil.randomSimpleString(random()), 1 + random().nextInt(100)));
+    }
+    keys.add(new TermFreq("foo bar boo far", 12));
+    FuzzySuggester suggester = new FuzzySuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
+    suggester.build(new TermFreqArrayIterator(keys));
+    int numIters = atLeast(10);
+    for (int i = 0; i < numIters; i++) {
+      String addRandomEdit = addRandomEdit("foo bar boo", 2);
+      List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2);
+      assertEquals(addRandomEdit, 1, results.size());
+      assertEquals("foo bar boo far", results.get(0).key.toString());
+      assertEquals(12, results.get(0).value, 0.01F);
+    }
+  }
+
   /** this is basically the WFST test ported to KeywordAnalyzer. so it acts the same */
   public void testKeyword() throws Exception {
     TermFreq keys[] = new TermFreq[] {
@@ -96,12 +117,6 @@ public class FuzzySuggesterTest extends LuceneTestCase {
     assertEquals("barbara", results.get(1).key.toString());
     assertEquals(6, results.get(1).value, 0.01F);
 
-    String addRandomEdit = addRandomEdit("barbara", 1);
-    results = suggester.lookup(_TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2);
-    assertEquals(addRandomEdit, 1, results.size());
-    assertEquals("barbara", results.get(0).key.toString());
-    assertEquals(6, results.get(0).value, 0.01F);
-
     // top N of 2, but only foo is available
     results = suggester.lookup(_TestUtil.stringToCharSequence("f", random()), false, 2);
     assertEquals(1, results.size());
@@ -134,7 +149,6 @@ public class FuzzySuggesterTest extends LuceneTestCase {
     assertEquals(6, results.get(2).value, 0.01F);
   }
 
-  // TODO: more tests
   /**
    * basic "standardanalyzer" test with stopword removal
   */
@@ -786,7 +800,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
     assertEquals(50, results.get(1).value);
   }
 
-  public String addRandomEdit(String string, int prefixLenght) {
+  private static String addRandomEdit(String string, int prefixLenght) {
     char[] charArray = string.toCharArray();
     StringBuilder builder = new StringBuilder();
     for (int i = 0; i < charArray.length; i++) {
@@ -822,22 +836,4 @@ public class FuzzySuggesterTest extends LuceneTestCase {
     }
     return builder.toString();
   }
-
-  public Automaton getAutomaton(String string) {
-    IntsRef path = new IntsRef();
-    Util.toUTF32(string, path);
-    if (path.length <= 1) {
-      return BasicAutomata.makeString(path.ints, path.offset, path.length);
-    } else {
-      Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, 1);
-      int ints[] = new int[path.length-1-1];
-      System.arraycopy(path.ints, path.offset+1, ints, 0, ints.length);
-      LevenshteinAutomata lev = new LevenshteinAutomata(ints, 256, true);
-      Automaton levAutomaton = lev.toAutomaton(1);
-      Automaton suffix = BasicAutomata.makeString(path.ints, path.length-1, 1);
-      Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton, suffix, BasicAutomata.makeAnyString()));
-      combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
-      return combined;
-    }
-  }
 }
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java
index 1a6ed59b0c0..86083287f4c 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java
@@ -35,8 +35,7 @@ public class TestRuleAssertionsRequired implements TestRule {
           String msg = "Test class requires enabled assertions, enable globally (-ea)" +
              " or for Solr/Lucene subpackages only: " + description.getClassName();
           System.err.println(msg);
-          // nocommit put back:
-          //throw new Exception(msg);
+          throw new Exception(msg);
         } catch (AssertionError e) {
           // Ok, enabled.
         }
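
For context (not part of the patch), here is a minimal usage sketch of the new FuzzySuggester. It mirrors the calls exercised in FuzzySuggesterTest above: build the suggester from weighted keys, then look up a query that is within DEFAULT_MAX_EDITS of an indexed key. The TermFreq and TermFreqArrayIterator helpers are the suggest-module test utilities used in that test, KeywordAnalyzer is assumed to come from the analyzers-common module, and the class name, key strings and weights below are made up for illustration.

    import java.util.List;

    import org.apache.lucene.analysis.core.KeywordAnalyzer;
    import org.apache.lucene.search.suggest.Lookup.LookupResult;
    import org.apache.lucene.search.suggest.TermFreq;
    import org.apache.lucene.search.suggest.TermFreqArrayIterator;
    import org.apache.lucene.search.suggest.analyzing.FuzzySuggester;

    public class FuzzySuggesterUsage {
      public static void main(String[] args) throws Exception {
        // Index a couple of weighted suggestions (the weight doubles as the score).
        TermFreq[] keys = new TermFreq[] {
            new TermFreq("foo bar boo far", 12),
            new TermFreq("foobar", 50)
        };
        FuzzySuggester suggester = new FuzzySuggester(new KeywordAnalyzer());
        suggester.build(new TermFreqArrayIterator(keys));

        // "foo bar boo fa" is one deletion away from the indexed "foo bar boo far",
        // i.e. within DEFAULT_MAX_EDITS (1), so the fuzzy lookup should still find it.
        List<LookupResult> results = suggester.lookup("foo bar boo fa", false, 2);
        for (LookupResult result : results) {
          System.out.println(result.key + " -> " + result.value);
        }
      }
    }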