diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java
index bb0728acb9c..92384c450f1 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java
@@ -36,7 +36,7 @@ public class LevenshteinAutomata {
   final int word[];
   /* the automata alphabet. */
   final int alphabet[];
-  /* the maximum symbol in the alphabet (e.g. 256 for UTF-8 or 10FFFF for UTF-32) */
+  /* the maximum symbol in the alphabet (e.g. 255 for UTF-8 or 10FFFF for UTF-32) */
   final int alphaMax;
 
   /* the ranges outside of alphabet */
@@ -53,18 +53,24 @@ public class LevenshteinAutomata {
   public LevenshteinAutomata(String input, boolean withTranspositions) {
     this(codePoints(input), Character.MAX_CODE_POINT, withTranspositions);
   }
-  
+
   /**
-   * Expert: Don't use this!
+   * Expert: specify a custom maximum possible symbol
+   * (alphaMax); default is Character.MAX_CODE_POINT.
    */
   public LevenshteinAutomata(int[] word, int alphaMax, boolean withTranspositions) {
     this.word = word;
     this.alphaMax = alphaMax;
-    
+
     // calculate the alphabet
     SortedSet<Integer> set = new TreeSet<Integer>();
-    for (int i = 0; i < word.length; i++)
-      set.add(word[i]);
+    for (int i = 0; i < word.length; i++) {
+      int v = word[i];
+      if (v > alphaMax) {
+        throw new IllegalArgumentException("alphaMax exceeded by symbol " + v + " in word");
+      }
+      set.add(v);
+    }
     alphabet = new int[set.size()];
     Iterator<Integer> iterator = set.iterator();
     for (int i = 0; i < alphabet.length; i++)
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
index 94a2b47b8df..2169c43f282 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
@@ -72,7 +72,6 @@ public final class FuzzySuggester extends AnalyzingSuggester {
   private final boolean transpositions;
   private final int nonFuzzyPrefix;
   private final int minFuzzyLength;
-  private final boolean allowSepEdit;
 
   /**
    * The default minimum length of the key passed to {@link
@@ -91,11 +90,6 @@ public final class FuzzySuggester extends AnalyzingSuggester {
    */
   public static final int DEFAULT_MAX_EDITS = 1;
 
-  /**
-   * We allow token separator to be deleted/inserted, by default.
-   */
-  public static final boolean DEFAULT_ALLOW_SEP_EDIT = true;
-
   /**
    * Creates a {@link FuzzySuggester} instance initialized with default values.
    *
@@ -115,7 +109,7 @@ public final class FuzzySuggester extends AnalyzingSuggester {
    */
   public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
     this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, true,
-         DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_ALLOW_SEP_EDIT);
+         DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH);
   }
 
   /**
@@ -139,12 +133,11 @@ public final class FuzzySuggester extends AnalyzingSuggester {
    *        Levenshtein algorithm.
    * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX}
    * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
-   * @param allowSepEdit if true, the token separater is allowed to be an edit (so words may be split/joined) (see default {@link #DEFAULT_ALLOW_SEP_EDIT})
    */
   public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer,
                         int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
                         int maxEdits, boolean transpositions, int nonFuzzyPrefix,
-                        int minFuzzyLength, boolean allowSepEdit) {
+                        int minFuzzyLength) {
     super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);
     if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
       throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
@@ -160,7 +153,6 @@ public final class FuzzySuggester extends AnalyzingSuggester {
     this.transpositions = transpositions;
     this.nonFuzzyPrefix = nonFuzzyPrefix;
     this.minFuzzyLength = minFuzzyLength;
-    this.allowSepEdit = allowSepEdit;
   }
 
   @Override
@@ -206,7 +198,7 @@ public final class FuzzySuggester extends AnalyzingSuggester {
     //  to allow the trailing dedup bytes to be
     //  edited... but then 0 byte is "in general" allowed
     //  on input (but not in UTF8).
-    LevenshteinAutomata lev = new LevenshteinAutomata(ints, allowSepEdit ? 255 : 254, transpositions);
+    LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);
     Automaton levAutomaton = lev.toAutomaton(maxEdits);
     Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
     combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
index 3a9dde83476..70de5d25c91 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
@@ -184,7 +184,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
     int options = 0;
 
     Analyzer a = new MockAnalyzer(random());
-    FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, 1, true, 1, 3, true);
+    FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, 1, true, 1, 3);
     suggester.build(new TermFreqArrayIterator(keys));
     // TODO: would be nice if "ab " would allow the test to
     // pass, and more generally if the analyzer can know
@@ -387,7 +387,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
 
   public void testExactFirst() throws Exception {
     Analyzer a = getUnusualAnalyzer();
-    FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3, true);
+    FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3);
     suggester.build(new TermFreqArrayIterator(new TermFreq[] {
           new TermFreq("x y", 1),
           new TermFreq("x y z", 3),
@@ -426,7 +426,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
 
   public void testNonExactFirst() throws Exception {
     Analyzer a = getUnusualAnalyzer();
-    FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3, true);
+    FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3);
     suggester.build(new TermFreqArrayIterator(new TermFreq[] {
           new TermFreq("x y", 1),
@@ -645,7 +645,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
       Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);
       FuzzySuggester suggester = new FuzzySuggester(a, a,
-                                                    preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3, true);
+                                                    preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3);
       suggester.build(new TermFreqArrayIterator(keys));
 
       for (String prefix : allPrefixes) {
@@ -775,10 +775,9 @@ public class FuzzySuggesterTest extends LuceneTestCase {
     }
   }
 
-
   public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception {
     Analyzer a = new MockAnalyzer(random());
-    FuzzySuggester suggester = new FuzzySuggester(a, a, 0, 2, -1, 1, true, 1, 3, true);
+    FuzzySuggester suggester = new FuzzySuggester(a, a, 0, 2, -1, 1, true, 1, 3);
 
     List<TermFreq> keys = Arrays.asList(new TermFreq[] {
         new TermFreq("a", 40),
@@ -796,6 +795,26 @@ public class FuzzySuggesterTest extends LuceneTestCase {
     assertEquals("a ", results.get(1).key);
     assertEquals(50, results.get(1).value);
   }
+
+  public void testEditSeps() throws Exception {
+    Analyzer a = new MockAnalyzer(random());
+    FuzzySuggester suggester = new FuzzySuggester(a, a, FuzzySuggester.PRESERVE_SEP, 2, -1, 2, true, 1, 3);
+
+    List<TermFreq> keys = Arrays.asList(new TermFreq[] {
+        new TermFreq("foo bar", 40),
+        new TermFreq("foo bar baz", 50),
+        new TermFreq("barbaz", 60),
+        new TermFreq("barbazfoo", 10),
+      });
+
+    Collections.shuffle(keys, random());
+    suggester.build(new TermFreqArrayIterator(keys));
+
+    assertEquals("[foo bar baz/50, foo bar/40]", suggester.lookup("foobar", false, 5).toString());
+    assertEquals("[foo bar baz/50]", suggester.lookup("foobarbaz", false, 5).toString());
+    assertEquals("[barbaz/60, barbazfoo/10]", suggester.lookup("bar baz", false, 5).toString());
+    assertEquals("[barbazfoo/10]", suggester.lookup("bar baz foo", false, 5).toString());
+  }
 
   private static String addRandomEdit(String string, int prefixLength) {
     char[] input = string.toCharArray();
@@ -891,7 +910,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
       boolean transpositions = random().nextBoolean();
       // TODO: test graph analyzers
      // TODO: test exactFirst / preserveSep permutations
-      FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, maxEdits, transpositions, prefixLen, 3, true);
+      FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, maxEdits, transpositions, prefixLen, 3);
      if (VERBOSE) {
        System.out.println("TEST: maxEdits=" + maxEdits + " prefixLen=" + prefixLen + " transpositions=" + transpositions + " num=" + NUM);
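
Usage sketch (not part of the patch): how the expert LevenshteinAutomata constructor behaves after this change. The demo class name and the literal inputs are made up for illustration; the constructor, toAutomaton, and the new IllegalArgumentException are taken from the diff above. FuzzySuggester now always builds its Levenshtein automaton with alphaMax = 255, keeping the separator-edit behavior that the removed allowSepEdit flag used to toggle, and any symbol above alphaMax is rejected up front instead of producing an automaton over an alphabet that cannot represent it.

    import org.apache.lucene.util.automaton.Automaton;
    import org.apache.lucene.util.automaton.LevenshteinAutomata;

    // Hypothetical demo class, for illustration only.
    public class LevenshteinAlphaMaxDemo {
      public static void main(String[] args) {
        // Code points of the term; after this change every symbol must be <= alphaMax.
        int[] word = new int[] {'f', 'o', 'o'};
        // 255 = UTF-8 byte alphabet, the value FuzzySuggester now always passes.
        LevenshteinAutomata lev = new LevenshteinAutomata(word, 255, true);
        Automaton within1Edit = lev.toAutomaton(1); // accepts all terms within 1 edit of "foo"

        // A symbol larger than alphaMax now fails fast:
        try {
          new LevenshteinAutomata(new int[] {0x10FFFF}, 255, true);
        } catch (IllegalArgumentException expected) {
          System.out.println(expected.getMessage()); // "alphaMax exceeded by symbol ..."
        }
      }
    }

Correspondingly, the public FuzzySuggester constructor drops its trailing allowSepEdit argument (nine arguments instead of ten), as the updated test call sites show, and the new testEditSeps exercises the always-on separator edits (e.g. "foobar" matching "foo bar").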