diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java index d03bcbc69ba..d7a137b8dc2 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java @@ -139,6 +139,9 @@ public class Transition implements Cloneable { static void appendCharString(int c, StringBuilder b) { if (c >= 0x21 && c <= 0x7e && c != '\\' && c != '"') b.appendCodePoint(c); else { + b.append("\\\\U" + Integer.toHexString(c)); + // nocommit + /* b.append("\\\\U"); String s = Integer.toHexString(c); if (c < 0x10) b.append("0000000").append(s); @@ -149,6 +152,7 @@ public class Transition implements Cloneable { else if (c < 0x1000000) b.append("00").append(s); else if (c < 0x10000000) b.append("0").append(s); else b.append(s); + */ } } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java index e063f978207..8fcabf7e8e1 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java @@ -15,7 +15,10 @@ package org.apache.lucene.search.suggest.analyzing; * See the License for the specific language governing permissions and * limitations under the License. */ +import java.io.FileOutputStream; import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Writer; import java.util.Arrays; import java.util.List; import java.util.Set; @@ -60,10 +63,14 @@ public final class FuzzySuggester extends AnalyzingSuggester { private final int maxEdits; private final boolean transpositions; private final int minPrefix; + + // nocommit separate param for "min length before we + // enable fuzzy"? eg type "nusglasses" into google... /** * The default minimum shared (non-fuzzy) prefix. Set to 2 */ + // nocommit should we do 1...? public static final int DEFAULT_MIN_PREFIX = 2; /** @@ -156,7 +163,14 @@ public final class FuzzySuggester extends AnalyzingSuggester { Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, minPrefix); int ints[] = new int[path.length-minPrefix]; System.arraycopy(path.ints, path.offset+minPrefix, ints, 0, ints.length); - LevenshteinAutomata lev = new LevenshteinAutomata(ints, 256, transpositions); + // nocommit i think we should pass 254 max? ie + // exclude 0xff ... this way we can't 'edit away' + // the sep? or ... maybe we want to allow that to + // be edited away? + // nocommit also the 0 byte ... we use that as + // trailer ... we probably shouldn't allow that byte + // to be edited (we could add alphaMin?) + LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions); Automaton levAutomaton = lev.toAutomaton(maxEdits); Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton)); combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already @@ -164,6 +178,11 @@ public final class FuzzySuggester extends AnalyzingSuggester { upto++; } } + + // nocommit maybe we should reduce the LevN? the added + // arcs add cost during intersect (extra FST arc + // lookups...). could be net win... + if (subs.length == 0) { return BasicAutomata.makeEmpty(); // matches nothing } else if (subs.length == 1) { @@ -186,8 +205,20 @@ public final class FuzzySuggester extends AnalyzingSuggester { @Override public List>> intersectAll() throws IOException { - return FSTUtil.intersectPrefixPaths(toLevenshteinAutomata(automaton),fst); + // nocommit we don't "penalize" for edits + // ... shouldn't we? ie, ed=0 completions should have + // higher rank than ed=1, at the same "weight"? maybe + // we can punt on this for starters ... or maybe we + // can re-run each prefix path through lev0, lev1, + // lev2 to figure out the number of edits? + Automaton levA = toLevenshteinAutomata(automaton); + /* + Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); + w.write(levA.toDot()); + w.close(); + System.out.println("Wrote LevA to out.dot"); + */ + return FSTUtil.intersectPrefixPaths(levA, fst); } - } } diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java index 65cd84d48f9..31e19880e34 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java @@ -700,6 +700,9 @@ public class FuzzySuggesterTest extends LuceneTestCase { System.out.println(" analyzed: " + analyzedKey); } TokenStreamToAutomaton tokenStreamToAutomaton = suggester.getTokenStreamToAutomaton(); + + // nocommit this is putting fox in charge of hen + // house! ie maybe we have a bug in suggester.toLevA ... Automaton automaton = suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey)); assertTrue(automaton.isDeterministic()); // TODO: could be faster... but its slowCompletor for a reason @@ -834,4 +837,261 @@ public class FuzzySuggesterTest extends LuceneTestCase { return builder.toString(); } + + private String randomSimpleString(int maxLen) { + final int len = _TestUtil.nextInt(random(), 1, maxLen); + final char[] chars = new char[len]; + for(int j=0;j answers = new ArrayList(); + final Set seen = new HashSet(); + for(int i=0;i() { + @Override + public int compare(TermFreq a, TermFreq b) { + return a.term.compareTo(b.term); + } + }); + if (VERBOSE) { + System.out.println("\nTEST: targets"); + for(TermFreq tf : answers) { + System.out.println(" " + tf.term.utf8ToString() + " freq=" + tf.v); + } + } + + Analyzer a = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false); + int maxEdits = random().nextBoolean() ? 1 : 2; + int prefixLen = random().nextInt(4); + boolean transpositions = random().nextBoolean(); + // TODO: test graph analyzers + // TODO: test exactFirst / preserveSep permutations + FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, maxEdits, transpositions, prefixLen); + + if (VERBOSE) { + System.out.println("TEST: maxEdits=" + maxEdits + " prefixLen=" + prefixLen + " transpositions=" + transpositions + " num=" + NUM); + } + + Collections.shuffle(answers, random()); + suggest.build(new TermFreqArrayIterator(answers.toArray(new TermFreq[answers.size()]))); + + final int ITERS = atLeast(100); + for(int iter=0;iter actual = suggest.lookup(frag, false, NUM); + if (VERBOSE) { + System.out.println(" actual: " + actual.size()); + for(LookupResult c : actual) { + System.out.println(" " + c); + } + } + + // nocommit must fix lookup to tie break properly!!: + Collections.sort(actual, new CompareByCostThenAlpha()); + + final int limit = Math.min(expected.size(), actual.size()); + for(int ans=0;ans slowFuzzyMatch(int prefixLen, int maxEdits, boolean allowTransposition, List answers, String frag) { + final List results = new ArrayList(); + final int fragLen = frag.length(); + for(TermFreq tf : answers) { + //System.out.println(" check s=" + tf.term.utf8ToString()); + boolean prefixMatches = true; + for(int i=0;i 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) { + d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost); + } + } + } + + return d[n][m]; + } + + private static IntsRef toIntsRef(String s) { + IntsRef ref = new IntsRef(s.length()); // worst case + int utf16Len = s.length(); + for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) { + cp = ref.ints[ref.length++] = Character.codePointAt(s, i); + } + return ref; + } }