LUCENE-3846: remove allowSepEdit (it doesn't work); add test

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3846@1403413 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-10-29 16:43:03 +00:00
parent bd8ef39a3b
commit c9a38e3236
3 changed files with 41 additions and 24 deletions

View File

@ -36,7 +36,7 @@ public class LevenshteinAutomata {
final int word[];
/* the automata alphabet. */
final int alphabet[];
/* the maximum symbol in the alphabet (e.g. 256 for UTF-8 or 10FFFF for UTF-32) */
/* the maximum symbol in the alphabet (e.g. 255 for UTF-8 or 10FFFF for UTF-32) */
final int alphaMax;
/* the ranges outside of alphabet */
@ -53,18 +53,24 @@ public class LevenshteinAutomata {
public LevenshteinAutomata(String input, boolean withTranspositions) {
  // Delegate to the int[]-based constructor, bounding the alphabet by
  // Character.MAX_CODE_POINT. The input is passed through codePoints(...),
  // which presumably expands the String into its code points.
  this(
      codePoints(input),
      Character.MAX_CODE_POINT,
      withTranspositions);
}
/**
* Expert: Don't use this!
* Expert: specify a custom maximum possible symbol
* (alphaMax); default is Character.MAX_CODE_POINT.
*/
public LevenshteinAutomata(int[] word, int alphaMax, boolean withTranspositions) {
this.word = word;
this.alphaMax = alphaMax;
// calculate the alphabet
SortedSet<Integer> set = new TreeSet<Integer>();
for (int i = 0; i < word.length; i++)
set.add(word[i]);
for (int i = 0; i < word.length; i++) {
int v = word[i];
if (v > alphaMax) {
throw new IllegalArgumentException("alphaMax exceeded by symbol " + v + " in word");
}
set.add(v);
}
alphabet = new int[set.size()];
Iterator<Integer> iterator = set.iterator();
for (int i = 0; i < alphabet.length; i++)

View File

@ -72,7 +72,6 @@ public final class FuzzySuggester extends AnalyzingSuggester {
private final boolean transpositions;
private final int nonFuzzyPrefix;
private final int minFuzzyLength;
private final boolean allowSepEdit;
/**
* The default minimum length of the key passed to {@link
@ -91,11 +90,6 @@ public final class FuzzySuggester extends AnalyzingSuggester {
*/
public static final int DEFAULT_MAX_EDITS = 1;
/**
* We allow token separator to be deleted/inserted, by default.
*/
public static final boolean DEFAULT_ALLOW_SEP_EDIT = true;
/**
* Creates a {@link FuzzySuggester} instance initialized with default values.
*
@ -115,7 +109,7 @@ public final class FuzzySuggester extends AnalyzingSuggester {
*/
public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, true,
DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_ALLOW_SEP_EDIT);
DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH);
}
/**
@ -139,12 +133,11 @@ public final class FuzzySuggester extends AnalyzingSuggester {
* Levenshtein algorithm.
* @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX})
* @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
* @param allowSepEdit if true, the token separater is allowed to be an edit (so words may be split/joined) (see default {@link #DEFAULT_ALLOW_SEP_EDIT})
*/
public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer,
int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
int maxEdits, boolean transpositions, int nonFuzzyPrefix,
int minFuzzyLength, boolean allowSepEdit) {
int minFuzzyLength) {
super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);
if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
@ -160,7 +153,6 @@ public final class FuzzySuggester extends AnalyzingSuggester {
this.transpositions = transpositions;
this.nonFuzzyPrefix = nonFuzzyPrefix;
this.minFuzzyLength = minFuzzyLength;
this.allowSepEdit = allowSepEdit;
}
@Override
@ -206,7 +198,7 @@ public final class FuzzySuggester extends AnalyzingSuggester {
// to allow the trailing dedup bytes to be
// edited... but then 0 byte is "in general" allowed
// on input (but not in UTF8).
LevenshteinAutomata lev = new LevenshteinAutomata(ints, allowSepEdit ? 255 : 254, transpositions);
LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);
Automaton levAutomaton = lev.toAutomaton(maxEdits);
Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already

View File

@ -184,7 +184,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
int options = 0;
Analyzer a = new MockAnalyzer(random());
FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, 1, true, 1, 3, true);
FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, 1, true, 1, 3);
suggester.build(new TermFreqArrayIterator(keys));
// TODO: would be nice if "ab " would allow the test to
// pass, and more generally if the analyzer can know
@ -387,7 +387,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
public void testExactFirst() throws Exception {
Analyzer a = getUnusualAnalyzer();
FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3, true);
FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3);
suggester.build(new TermFreqArrayIterator(new TermFreq[] {
new TermFreq("x y", 1),
new TermFreq("x y z", 3),
@ -426,7 +426,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
public void testNonExactFirst() throws Exception {
Analyzer a = getUnusualAnalyzer();
FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3, true);
FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3);
suggester.build(new TermFreqArrayIterator(new TermFreq[] {
new TermFreq("x y", 1),
@ -645,7 +645,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);
FuzzySuggester suggester = new FuzzySuggester(a, a,
preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3, true);
preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3);
suggester.build(new TermFreqArrayIterator(keys));
for (String prefix : allPrefixes) {
@ -775,10 +775,9 @@ public class FuzzySuggesterTest extends LuceneTestCase {
}
}
public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception {
Analyzer a = new MockAnalyzer(random());
FuzzySuggester suggester = new FuzzySuggester(a, a, 0, 2, -1, 1, true, 1, 3, true);
FuzzySuggester suggester = new FuzzySuggester(a, a, 0, 2, -1, 1, true, 1, 3);
List<TermFreq> keys = Arrays.asList(new TermFreq[] {
new TermFreq("a", 40),
@ -796,6 +795,26 @@ public class FuzzySuggesterTest extends LuceneTestCase {
assertEquals("a ", results.get(1).key);
assertEquals(50, results.get(1).value);
}
/**
 * Checks lookups whose key differs from the indexed suggestions only by the
 * presence or absence of a space between words (e.g. "foobar" vs. "foo bar"),
 * with PRESERVE_SEP enabled and up to two edits allowed.
 */
public void testEditSeps() throws Exception {
  Analyzer analyzer = new MockAnalyzer(random());
  FuzzySuggester fuzzy = new FuzzySuggester(
      analyzer,
      analyzer,
      FuzzySuggester.PRESERVE_SEP,
      2,      // maxSurfaceFormsPerAnalyzedForm
      -1,     // maxGraphExpansions
      2,      // maxEdits
      true,   // transpositions
      1,      // nonFuzzyPrefix
      3);     // minFuzzyLength

  TermFreq[] entries = new TermFreq[] {
    new TermFreq("foo bar", 40),
    new TermFreq("foo bar baz", 50),
    new TermFreq("barbaz", 60),
    new TermFreq("barbazfoo", 10),
  };
  List<TermFreq> keys = Arrays.asList(entries);

  // Build order must not matter, so feed the suggester a shuffled list.
  Collections.shuffle(keys, random());
  fuzzy.build(new TermFreqArrayIterator(keys));

  // Each row is { lookup key, expected result list rendered via toString() }.
  String[][] expectations = {
    {"foobar", "[foo bar baz/50, foo bar/40]"},
    {"foobarbaz", "[foo bar baz/50]"},
    {"bar baz", "[barbaz/60, barbazfoo/10]"},
    {"bar baz foo", "[barbazfoo/10]"},
  };
  for (String[] expectation : expectations) {
    String query = expectation[0];
    String expected = expectation[1];
    assertEquals(expected, fuzzy.lookup(query, false, 5).toString());
  }
}
private static String addRandomEdit(String string, int prefixLength) {
char[] input = string.toCharArray();
@ -891,7 +910,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
boolean transpositions = random().nextBoolean();
// TODO: test graph analyzers
// TODO: test exactFirst / preserveSep permutations
FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, maxEdits, transpositions, prefixLen, 3, true);
FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, maxEdits, transpositions, prefixLen, 3);
if (VERBOSE) {
System.out.println("TEST: maxEdits=" + maxEdits + " prefixLen=" + prefixLen + " transpositions=" + transpositions + " num=" + NUM);