path2 : endNodes) {
- if ("poales".equals(UnicodeUtil.newString(path2.input.ints, path2.input.offset, path2.input.length)))
- System.out.println(UnicodeUtil.newString(path2.input.ints, path2.input.offset, path2.input.length));
- }
- return endNodes;
- }
+ return endNodes;
+ }
}
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
index 5973caacbad..64c92984f6d 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
@@ -1,23 +1,4 @@
package org.apache.lucene.search.suggest.analyzing;
-
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester.PathIntersector;
-import org.apache.lucene.search.suggest.analyzing.FSTUtil.Path;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.IntsRef;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.BasicAutomata;
-import org.apache.lucene.util.automaton.BasicOperations;
-import org.apache.lucene.util.automaton.LevenshteinAutomata;
-import org.apache.lucene.util.automaton.SpecialOperations;
-import org.apache.lucene.util.fst.FST;
-import org.apache.lucene.util.fst.PairOutputs.Pair;
-
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -34,24 +15,117 @@ import org.apache.lucene.util.fst.PairOutputs.Pair;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Set;
-public class FuzzySuggester extends AnalyzingSuggester {
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.search.suggest.analyzing.FSTUtil.Path;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.LevenshteinAutomata;
+import org.apache.lucene.util.automaton.SpecialOperations;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.PairOutputs.Pair;
+
+/**
+ * Implements a fuzzy {@link AnalyzingSuggester}. The similarity measurement is
+ * based on the Damerau-Levenshtein (optimal string alignment) algorithm, though
+ * you can explicitly choose classic Levenshtein by passing <code>false</code> to
+ * the <code>transpositions</code> parameter.
+ *
+ * At most, this query will match terms up to
+ * {@value org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}
+ * edits. Higher distances (especially with transpositions enabled) are not
+ * supported.
+ *
+ * Note: complex query analyzers can have a significant impact on lookup
+ * performance. To keep the complexity of the prefix intersection low, and thus
+ * lookups fast, it is recommended not to use query analyzers that drop or inject
+ * terms (such as synonym filters). At index time, complex analyzers can safely
+ * be used.
+ *
+ */
+public final class FuzzySuggester extends AnalyzingSuggester {
private final int maxEdits;
private final boolean transpositions;
private final int minPrefix;
+ /**
+ * The default minimum length of the shared (non-fuzzy) prefix. Set to 2.
+ */
+ public static final int DEFAULT_MIN_PREFIX = 2;
+
+ /**
+ * The default maximum number of edits for fuzzy suggestions. Set to 1.
+ */
+ public static final int DEFAULT_MAX_EDITS = 1;
+
+ /**
+ * Creates a {@link FuzzySuggester} instance initialized with default values.
+ * Calls
+ * {@link FuzzySuggester#FuzzySuggester(Analyzer, Analyzer, int, int, int, int, boolean, int)}
+ * FuzzySuggester(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1,
+ * DEFAULT_MAX_EDITS, true, DEFAULT_MIN_PREFIX)
+ *
+ * @param analyzer
+ * the analyzer used for this suggester
+ */
public FuzzySuggester(Analyzer analyzer) {
this(analyzer, analyzer);
}
+ /**
+ * Creates a {@link FuzzySuggester} instance with an index and a query analyzer, initialized with default values.
+ * Calls
+ * {@link FuzzySuggester#FuzzySuggester(Analyzer, Analyzer, int, int, int, int, boolean, int)}
+ * FuzzySuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1,
+ * DEFAULT_MAX_EDITS, true, DEFAULT_MIN_PREFIX)
+ *
+ * @param indexAnalyzer
+ * Analyzer that will be used for analyzing suggestions while building the index.
+ * @param queryAnalyzer
+ * Analyzer that will be used for analyzing query text during lookup
+ */
public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
- this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, 1, true, 1);
+ this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, true, DEFAULT_MIN_PREFIX);
}
- // nocommit: probably want an option to like, require the first character or something :)
+ /**
+ * Creates a {@link FuzzySuggester} instance.
+ *
+ * @param indexAnalyzer Analyzer that will be used for
+ * analyzing suggestions while building the index.
+ * @param queryAnalyzer Analyzer that will be used for
+ * analyzing query text during lookup
+ * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
+ * @param maxSurfaceFormsPerAnalyzedForm Maximum number of
+ * surface forms to keep for a single analyzed form.
+ * When there are too many surface forms we discard the
+ * lowest weighted ones.
+ * @param maxGraphExpansions Maximum number of graph paths
+ * to expand from the analyzed form. Set this to -1 for
+ * no limit.
+ *
+ * @param maxEdits must be >= 0 and <= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}.
+ * @param transpositions <code>true</code> if transpositions should be treated as a primitive
+ * edit operation. If this is <code>false</code>, comparisons will implement the classic
+ * Levenshtein algorithm.
+ * @param minPrefix length of common (non-fuzzy) prefix
+ *
+ */
public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer,
int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, int maxEdits, boolean transpositions, int minPrefix) {
super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);
+ if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
+ throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
+ }
+ if (minPrefix < 0) {
+ throw new IllegalArgumentException("minPrefix must not be < 0");
+ }
this.maxEdits = maxEdits;
this.transpositions = transpositions;
this.minPrefix = minPrefix;
@@ -66,8 +140,7 @@ public class FuzzySuggester extends AnalyzingSuggester {
}
final Automaton toLevenshteinAutomata(Automaton automaton) {
- // nocommit: how slow can this be :)
- Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
+ final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
Automaton subs[] = new Automaton[ref.size()];
int upto = 0;
for (IntsRef path : ref) {
@@ -92,7 +165,7 @@ public class FuzzySuggester extends AnalyzingSuggester {
return subs[0];
} else {
Automaton a = BasicOperations.union(Arrays.asList(subs));
- // nocommit: we could call toLevenshteinAutomata() before det?
+ // TODO: we could call toLevenshteinAutomata() before det?
// this only happens if you have multiple paths anyway (e.g. synonyms)
BasicOperations.determinize(a);
return a;
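
For reviewers, a minimal usage sketch of the API documented above (not part of this patch). The analyzer choice is arbitrary, and TermFreq / TermFreqArrayIterator are the test-only helpers that FuzzySuggesterTest relies on below; their package names are assumed to follow the suggest test sources.

import java.util.Arrays;
import java.util.List;

import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
// Test-only helpers (package assumed), mirrored from FuzzySuggesterTest usage:
import org.apache.lucene.search.suggest.TermFreq;
import org.apache.lucene.search.suggest.TermFreqArrayIterator;
import org.apache.lucene.search.suggest.analyzing.FuzzySuggester;

public class FuzzySuggesterUsageSketch {
  public static void main(String[] args) throws Exception {
    // Default constructor: maxEdits = DEFAULT_MAX_EDITS (1),
    // transpositions = true, minPrefix = DEFAULT_MIN_PREFIX (2).
    FuzzySuggester suggester = new FuzzySuggester(new KeywordAnalyzer());

    suggester.build(new TermFreqArrayIterator(Arrays.asList(
        new TermFreq("barbara", 6),
        new TermFreq("barbados", 4))));

    // "barbra" is within one edit of "barbara" and shares the two-character
    // non-fuzzy prefix, so "barbara" is still suggested.
    List<LookupResult> results = suggester.lookup("barbra", false, 2);
    for (LookupResult result : results) {
      System.out.println(result.key + " => " + result.value);
    }
  }
}
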
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java
index 26a26ad69da..49a5443be7e 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java
@@ -48,7 +48,7 @@ import org.junit.Ignore;
/**
* Benchmarks tests for implementations of {@link Lookup} interface.
*/
-//@Ignore("COMMENT ME TO RUN BENCHMARKS!")
+@Ignore("COMMENT ME TO RUN BENCHMARKS!")
public class LookupBenchmarkTest extends LuceneTestCase {
@SuppressWarnings("unchecked")
private final List<Class<? extends Lookup>> benchmarkClasses = Arrays.asList(
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
index 8b30ae5c113..53c5ab7dac7 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
@@ -57,6 +57,27 @@ import org.apache.lucene.util.fst.Util;
public class FuzzySuggesterTest extends LuceneTestCase {
+ public void testRandomEdits() throws IOException {
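+    // Build a suggester over random "boo"-prefixed keys plus one fixed key, then
+    // verify that a copy of "foo bar boo" with a random edit applied after its
+    // first two characters still returns exactly "foo bar boo far".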
+ List<TermFreq> keys = new ArrayList<TermFreq>();
+ int numTerms = atLeast(100);
+ for (int i = 0; i < numTerms; i++) {
+ keys.add(new TermFreq("boo" + _TestUtil.randomSimpleString(random()), 1 + random().nextInt(100)));
+ }
+ keys.add(new TermFreq("foo bar boo far", 12));
+ FuzzySuggester suggester = new FuzzySuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
+ suggester.build(new TermFreqArrayIterator(keys));
+ int numIters = atLeast(10);
+ for (int i = 0; i < numIters; i++) {
+ String addRandomEdit = addRandomEdit("foo bar boo", 2);
+ List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2);
+ assertEquals(addRandomEdit, 1, results.size());
+ assertEquals("foo bar boo far", results.get(0).key.toString());
+ assertEquals(12, results.get(0).value, 0.01F);
+ }
+
+
+ }
+
/** this is basically the WFST test ported to KeywordAnalyzer. so it acts the same */
public void testKeyword() throws Exception {
TermFreq keys[] = new TermFreq[] {
@@ -96,12 +117,6 @@ public class FuzzySuggesterTest extends LuceneTestCase {
assertEquals("barbara", results.get(1).key.toString());
assertEquals(6, results.get(1).value, 0.01F);
- String addRandomEdit = addRandomEdit("barbara", 1);
- results = suggester.lookup(_TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2);
- assertEquals(addRandomEdit, 1, results.size());
- assertEquals("barbara", results.get(0).key.toString());
- assertEquals(6, results.get(0).value, 0.01F);
-
// top N of 2, but only foo is available
results = suggester.lookup(_TestUtil.stringToCharSequence("f", random()), false, 2);
assertEquals(1, results.size());
@@ -134,7 +149,6 @@ public class FuzzySuggesterTest extends LuceneTestCase {
assertEquals(6, results.get(2).value, 0.01F);
}
- // TODO: more tests
/**
* basic "standardanalyzer" test with stopword removal
*/
@@ -786,7 +800,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
assertEquals(50, results.get(1).value);
}
- public String addRandomEdit(String string, int prefixLenght) {
+ private static String addRandomEdit(String string, int prefixLenght) {
char[] charArray = string.toCharArray();
StringBuilder builder = new StringBuilder();
for (int i = 0; i < charArray.length; i++) {
@@ -822,22 +836,4 @@ public class FuzzySuggesterTest extends LuceneTestCase {
}
return builder.toString();
}
-
- public Automaton getAutomaton(String string) {
- IntsRef path = new IntsRef();
- Util.toUTF32(string, path);
- if (path.length <= 1) {
- return BasicAutomata.makeString(path.ints, path.offset, path.length);
- } else {
- Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, 1);
- int ints[] = new int[path.length-1-1];
- System.arraycopy(path.ints, path.offset+1, ints, 0, ints.length);
- LevenshteinAutomata lev = new LevenshteinAutomata(ints, 256, true);
- Automaton levAutomaton = lev.toAutomaton(1);
- Automaton suffix = BasicAutomata.makeString(path.ints, path.length-1, 1);
- Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton, suffix, BasicAutomata.makeAnyString()));
- combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
- return combined;
- }
- }
}
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java
index 1a6ed59b0c0..86083287f4c 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java
@@ -35,8 +35,7 @@ public class TestRuleAssertionsRequired implements TestRule {
String msg = "Test class requires enabled assertions, enable globally (-ea)" +
" or for Solr/Lucene subpackages only: " + description.getClassName();
System.err.println(msg);
- // nocommit put back:
- //throw new Exception(msg);
+ throw new Exception(msg);
} catch (AssertionError e) {
// Ok, enabled.
}