LUCENE-3846: remove allowSepEdit (it doesn't work); add test

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3846@1403413 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-10-29 16:43:03 +00:00
parent bd8ef39a3b
commit c9a38e3236
3 changed files with 41 additions and 24 deletions

View File

@ -36,7 +36,7 @@ public class LevenshteinAutomata {
final int word[]; final int word[];
/* the automata alphabet. */ /* the automata alphabet. */
final int alphabet[]; final int alphabet[];
/* the maximum symbol in the alphabet (e.g. 256 for UTF-8 or 10FFFF for UTF-32) */ /* the maximum symbol in the alphabet (e.g. 255 for UTF-8 or 10FFFF for UTF-32) */
final int alphaMax; final int alphaMax;
/* the ranges outside of alphabet */ /* the ranges outside of alphabet */
@ -55,7 +55,8 @@ public class LevenshteinAutomata {
} }
/** /**
* Expert: Don't use this! * Expert: specify a custom maximum possible symbol
* (alphaMax); default is Character.MAX_CODE_POINT.
*/ */
public LevenshteinAutomata(int[] word, int alphaMax, boolean withTranspositions) { public LevenshteinAutomata(int[] word, int alphaMax, boolean withTranspositions) {
this.word = word; this.word = word;
@ -63,8 +64,13 @@ public class LevenshteinAutomata {
// calculate the alphabet // calculate the alphabet
SortedSet<Integer> set = new TreeSet<Integer>(); SortedSet<Integer> set = new TreeSet<Integer>();
for (int i = 0; i < word.length; i++) for (int i = 0; i < word.length; i++) {
set.add(word[i]); int v = word[i];
if (v > alphaMax) {
throw new IllegalArgumentException("alphaMax exceeded by symbol " + v + " in word");
}
set.add(v);
}
alphabet = new int[set.size()]; alphabet = new int[set.size()];
Iterator<Integer> iterator = set.iterator(); Iterator<Integer> iterator = set.iterator();
for (int i = 0; i < alphabet.length; i++) for (int i = 0; i < alphabet.length; i++)

View File

@ -72,7 +72,6 @@ public final class FuzzySuggester extends AnalyzingSuggester {
private final boolean transpositions; private final boolean transpositions;
private final int nonFuzzyPrefix; private final int nonFuzzyPrefix;
private final int minFuzzyLength; private final int minFuzzyLength;
private final boolean allowSepEdit;
/** /**
* The default minimum length of the key passed to {@link * The default minimum length of the key passed to {@link
@ -91,11 +90,6 @@ public final class FuzzySuggester extends AnalyzingSuggester {
*/ */
public static final int DEFAULT_MAX_EDITS = 1; public static final int DEFAULT_MAX_EDITS = 1;
/**
* We allow token separator to be deleted/inserted, by default.
*/
public static final boolean DEFAULT_ALLOW_SEP_EDIT = true;
/** /**
* Creates a {@link FuzzySuggester} instance initialized with default values. * Creates a {@link FuzzySuggester} instance initialized with default values.
* *
@ -115,7 +109,7 @@ public final class FuzzySuggester extends AnalyzingSuggester {
*/ */
public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) { public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, true, this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, true,
DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_ALLOW_SEP_EDIT); DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH);
} }
/** /**
@ -139,12 +133,11 @@ public final class FuzzySuggester extends AnalyzingSuggester {
* Levenshtein algorithm. * Levenshtein algorithm.
* @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX} * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX}
* @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH}) * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
* @param allowSepEdit if true, the token separater is allowed to be an edit (so words may be split/joined) (see default {@link #DEFAULT_ALLOW_SEP_EDIT})
*/ */
public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer,
int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
int maxEdits, boolean transpositions, int nonFuzzyPrefix, int maxEdits, boolean transpositions, int nonFuzzyPrefix,
int minFuzzyLength, boolean allowSepEdit) { int minFuzzyLength) {
super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions); super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);
if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE); throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
@ -160,7 +153,6 @@ public final class FuzzySuggester extends AnalyzingSuggester {
this.transpositions = transpositions; this.transpositions = transpositions;
this.nonFuzzyPrefix = nonFuzzyPrefix; this.nonFuzzyPrefix = nonFuzzyPrefix;
this.minFuzzyLength = minFuzzyLength; this.minFuzzyLength = minFuzzyLength;
this.allowSepEdit = allowSepEdit;
} }
@Override @Override
@ -206,7 +198,7 @@ public final class FuzzySuggester extends AnalyzingSuggester {
// to allow the trailing dedup bytes to be // to allow the trailing dedup bytes to be
// edited... but then 0 byte is "in general" allowed // edited... but then 0 byte is "in general" allowed
// on input (but not in UTF8). // on input (but not in UTF8).
LevenshteinAutomata lev = new LevenshteinAutomata(ints, allowSepEdit ? 255 : 254, transpositions); LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);
Automaton levAutomaton = lev.toAutomaton(maxEdits); Automaton levAutomaton = lev.toAutomaton(maxEdits);
Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton)); Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already

View File

@ -184,7 +184,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
int options = 0; int options = 0;
Analyzer a = new MockAnalyzer(random()); Analyzer a = new MockAnalyzer(random());
FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, 1, true, 1, 3, true); FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, 1, true, 1, 3);
suggester.build(new TermFreqArrayIterator(keys)); suggester.build(new TermFreqArrayIterator(keys));
// TODO: would be nice if "ab " would allow the test to // TODO: would be nice if "ab " would allow the test to
// pass, and more generally if the analyzer can know // pass, and more generally if the analyzer can know
@ -387,7 +387,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
public void testExactFirst() throws Exception { public void testExactFirst() throws Exception {
Analyzer a = getUnusualAnalyzer(); Analyzer a = getUnusualAnalyzer();
FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3, true); FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3);
suggester.build(new TermFreqArrayIterator(new TermFreq[] { suggester.build(new TermFreqArrayIterator(new TermFreq[] {
new TermFreq("x y", 1), new TermFreq("x y", 1),
new TermFreq("x y z", 3), new TermFreq("x y z", 3),
@ -426,7 +426,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
public void testNonExactFirst() throws Exception { public void testNonExactFirst() throws Exception {
Analyzer a = getUnusualAnalyzer(); Analyzer a = getUnusualAnalyzer();
FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3, true); FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3);
suggester.build(new TermFreqArrayIterator(new TermFreq[] { suggester.build(new TermFreqArrayIterator(new TermFreq[] {
new TermFreq("x y", 1), new TermFreq("x y", 1),
@ -645,7 +645,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles); Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);
FuzzySuggester suggester = new FuzzySuggester(a, a, FuzzySuggester suggester = new FuzzySuggester(a, a,
preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3, true); preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3);
suggester.build(new TermFreqArrayIterator(keys)); suggester.build(new TermFreqArrayIterator(keys));
for (String prefix : allPrefixes) { for (String prefix : allPrefixes) {
@ -775,10 +775,9 @@ public class FuzzySuggesterTest extends LuceneTestCase {
} }
} }
public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception { public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception {
Analyzer a = new MockAnalyzer(random()); Analyzer a = new MockAnalyzer(random());
FuzzySuggester suggester = new FuzzySuggester(a, a, 0, 2, -1, 1, true, 1, 3, true); FuzzySuggester suggester = new FuzzySuggester(a, a, 0, 2, -1, 1, true, 1, 3);
List<TermFreq> keys = Arrays.asList(new TermFreq[] { List<TermFreq> keys = Arrays.asList(new TermFreq[] {
new TermFreq("a", 40), new TermFreq("a", 40),
@ -797,6 +796,26 @@ public class FuzzySuggesterTest extends LuceneTestCase {
assertEquals(50, results.get(1).value); assertEquals(50, results.get(1).value);
} }
/**
 * Checks that the token separator may itself be part of an edit: lookup keys
 * written without a space are expected to return the multi-word entries, and
 * lookup keys written with spaces are expected to return the run-together
 * entries.
 */
public void testEditSeps() throws Exception {
  Analyzer analyzer = new MockAnalyzer(random());
  // Positional arguments after the analyzers: options=PRESERVE_SEP,
  // maxSurfaceFormsPerAnalyzedForm=2, maxGraphExpansions=-1, maxEdits=2,
  // transpositions=true, nonFuzzyPrefix=1, minFuzzyLength=3.
  FuzzySuggester suggester =
      new FuzzySuggester(analyzer, analyzer, FuzzySuggester.PRESERVE_SEP, 2, -1, 2, true, 1, 3);

  List<TermFreq> entries = Arrays.asList(
      new TermFreq("foo bar", 40),
      new TermFreq("foo bar baz", 50),
      new TermFreq("barbaz", 60),
      new TermFreq("barbazfoo", 10));
  // Shuffle so the expected results cannot depend on insertion order.
  Collections.shuffle(entries, random());
  suggester.build(new TermFreqArrayIterator(entries));

  // Each row is { lookup key, expected toString() of the lookup results }.
  String[][] expectations = {
      {"foobar", "[foo bar baz/50, foo bar/40]"},
      {"foobarbaz", "[foo bar baz/50]"},
      {"bar baz", "[barbaz/60, barbazfoo/10]"},
      {"bar baz foo", "[barbazfoo/10]"},
  };
  for (String[] expectation : expectations) {
    String lookupKey = expectation[0];
    String expectedResults = expectation[1];
    assertEquals(expectedResults, suggester.lookup(lookupKey, false, 5).toString());
  }
}
private static String addRandomEdit(String string, int prefixLength) { private static String addRandomEdit(String string, int prefixLength) {
char[] input = string.toCharArray(); char[] input = string.toCharArray();
StringBuilder builder = new StringBuilder(); StringBuilder builder = new StringBuilder();
@ -891,7 +910,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
boolean transpositions = random().nextBoolean(); boolean transpositions = random().nextBoolean();
// TODO: test graph analyzers // TODO: test graph analyzers
// TODO: test exactFirst / preserveSep permutations // TODO: test exactFirst / preserveSep permutations
FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, maxEdits, transpositions, prefixLen, 3, true); FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, maxEdits, transpositions, prefixLen, 3);
if (VERBOSE) { if (VERBOSE) {
System.out.println("TEST: maxEdits=" + maxEdits + " prefixLen=" + prefixLen + " transpositions=" + transpositions + " num=" + NUM); System.out.println("TEST: maxEdits=" + maxEdits + " prefixLen=" + prefixLen + " transpositions=" + transpositions + " num=" + NUM);