diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java
index d03bcbc69ba..d7a137b8dc2 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java
@@ -139,6 +139,9 @@ public class Transition implements Cloneable {
static void appendCharString(int c, StringBuilder b) {
if (c >= 0x21 && c <= 0x7e && c != '\\' && c != '"') b.appendCodePoint(c);
else {
+ b.append("\\\\U" + Integer.toHexString(c));
+ // nocommit
+ /*
b.append("\\\\U");
String s = Integer.toHexString(c);
if (c < 0x10) b.append("0000000").append(s);
@@ -149,6 +152,7 @@ public class Transition implements Cloneable {
else if (c < 0x1000000) b.append("00").append(s);
else if (c < 0x10000000) b.append("0").append(s);
else b.append(s);
+ */
}
}
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
index e063f978207..8fcabf7e8e1 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
@@ -15,7 +15,10 @@ package org.apache.lucene.search.suggest.analyzing;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+import java.io.FileOutputStream;
import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
@@ -60,10 +63,14 @@ public final class FuzzySuggester extends AnalyzingSuggester {
private final int maxEdits;
private final boolean transpositions;
private final int minPrefix;
+
+ // nocommit separate param for "min length before we
+ // enable fuzzy"? eg type "nusglasses" into google...
/**
* The default minimum shared (non-fuzzy) prefix. Set to 2
*/
+ // nocommit should we do 1...?
public static final int DEFAULT_MIN_PREFIX = 2;
/**
@@ -156,7 +163,14 @@ public final class FuzzySuggester extends AnalyzingSuggester {
Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, minPrefix);
int ints[] = new int[path.length-minPrefix];
System.arraycopy(path.ints, path.offset+minPrefix, ints, 0, ints.length);
- LevenshteinAutomata lev = new LevenshteinAutomata(ints, 256, transpositions);
+ // nocommit i think we should pass 254 max? ie
+ // exclude 0xff ... this way we can't 'edit away'
+ // the sep? or ... maybe we want to allow that to
+ // be edited away?
+ // nocommit also the 0 byte ... we use that as
+ // trailer ... we probably shouldn't allow that byte
+ // to be edited (we could add alphaMin?)
+ LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);
Automaton levAutomaton = lev.toAutomaton(maxEdits);
Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
@@ -164,6 +178,11 @@ public final class FuzzySuggester extends AnalyzingSuggester {
upto++;
}
}
+
+ // nocommit maybe we should reduce the LevN? the added
+ // arcs add cost during intersect (extra FST arc
+ // lookups...). could be net win...
+
if (subs.length == 0) {
return BasicAutomata.makeEmpty(); // matches nothing
} else if (subs.length == 1) {
@@ -186,8 +205,20 @@ public final class FuzzySuggester extends AnalyzingSuggester {
@Override
  public List<FSTUtil.Path<Pair<Long,BytesRef>>> intersectAll() throws IOException {
- return FSTUtil.intersectPrefixPaths(toLevenshteinAutomata(automaton),fst);
+ // nocommit we don't "penalize" for edits
+ // ... shouldn't we? ie, ed=0 completions should have
+ // higher rank than ed=1, at the same "weight"? maybe
+ // we can punt on this for starters ... or maybe we
+ // can re-run each prefix path through lev0, lev1,
+ // lev2 to figure out the number of edits?
+ Automaton levA = toLevenshteinAutomata(automaton);
+ /*
+ Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
+ w.write(levA.toDot());
+ w.close();
+ System.out.println("Wrote LevA to out.dot");
+ */
+ return FSTUtil.intersectPrefixPaths(levA, fst);
}
-
}
}
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
index 65cd84d48f9..31e19880e34 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
@@ -700,6 +700,9 @@ public class FuzzySuggesterTest extends LuceneTestCase {
System.out.println(" analyzed: " + analyzedKey);
}
TokenStreamToAutomaton tokenStreamToAutomaton = suggester.getTokenStreamToAutomaton();
+
+ // nocommit this is putting fox in charge of hen
+ // house! ie maybe we have a bug in suggester.toLevA ...
Automaton automaton = suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey));
assertTrue(automaton.isDeterministic());
// TODO: could be faster... but its slowCompletor for a reason
@@ -834,4 +837,261 @@ public class FuzzySuggesterTest extends LuceneTestCase {
return builder.toString();
}
+
+ private String randomSimpleString(int maxLen) {
+ final int len = _TestUtil.nextInt(random(), 1, maxLen);
+ final char[] chars = new char[len];
+ for(int j=0;j answers = new ArrayList();
+    final Set<String> seen = new HashSet<String>();
+ for(int i=0;i() {
+ @Override
+ public int compare(TermFreq a, TermFreq b) {
+ return a.term.compareTo(b.term);
+ }
+ });
+ if (VERBOSE) {
+ System.out.println("\nTEST: targets");
+ for(TermFreq tf : answers) {
+ System.out.println(" " + tf.term.utf8ToString() + " freq=" + tf.v);
+ }
+ }
+
+ Analyzer a = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
+ int maxEdits = random().nextBoolean() ? 1 : 2;
+ int prefixLen = random().nextInt(4);
+ boolean transpositions = random().nextBoolean();
+ // TODO: test graph analyzers
+ // TODO: test exactFirst / preserveSep permutations
+ FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, maxEdits, transpositions, prefixLen);
+
+ if (VERBOSE) {
+ System.out.println("TEST: maxEdits=" + maxEdits + " prefixLen=" + prefixLen + " transpositions=" + transpositions + " num=" + NUM);
+ }
+
+ Collections.shuffle(answers, random());
+ suggest.build(new TermFreqArrayIterator(answers.toArray(new TermFreq[answers.size()])));
+
+ final int ITERS = atLeast(100);
+ for(int iter=0;iter actual = suggest.lookup(frag, false, NUM);
+ if (VERBOSE) {
+ System.out.println(" actual: " + actual.size());
+ for(LookupResult c : actual) {
+ System.out.println(" " + c);
+ }
+ }
+
+ // nocommit must fix lookup to tie break properly!!:
+ Collections.sort(actual, new CompareByCostThenAlpha());
+
+ final int limit = Math.min(expected.size(), actual.size());
+ for(int ans=0;ans slowFuzzyMatch(int prefixLen, int maxEdits, boolean allowTransposition, List answers, String frag) {
+    final List<LookupResult> results = new ArrayList<LookupResult>();
+ final int fragLen = frag.length();
+ for(TermFreq tf : answers) {
+ //System.out.println(" check s=" + tf.term.utf8ToString());
+ boolean prefixMatches = true;
+ for(int i=0;i 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) {
+ d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost);
+ }
+ }
+ }
+
+ return d[n][m];
+ }
+
+ private static IntsRef toIntsRef(String s) {
+ IntsRef ref = new IntsRef(s.length()); // worst case
+ int utf16Len = s.length();
+ for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) {
+ cp = ref.ints[ref.length++] = Character.codePointAt(s, i);
+ }
+ return ref;
+ }
}