mirror of https://github.com/apache/lucene.git
LUCENE-3846: remove allowSepEdit (it doesn't work); add test
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3846@1403413 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
bd8ef39a3b
commit
c9a38e3236
|
@ -36,7 +36,7 @@ public class LevenshteinAutomata {
|
|||
final int word[];
|
||||
/* the automata alphabet. */
|
||||
final int alphabet[];
|
||||
/* the maximum symbol in the alphabet (e.g. 256 for UTF-8 or 10FFFF for UTF-32) */
|
||||
/* the maximum symbol in the alphabet (e.g. 255 for UTF-8 or 10FFFF for UTF-32) */
|
||||
final int alphaMax;
|
||||
|
||||
/* the ranges outside of alphabet */
|
||||
|
@ -53,18 +53,24 @@ public class LevenshteinAutomata {
|
|||
public LevenshteinAutomata(String input, boolean withTranspositions) {
|
||||
this(codePoints(input), Character.MAX_CODE_POINT, withTranspositions);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Expert: Don't use this!
|
||||
* Expert: specify a custom maximum possible symbol
|
||||
* (alphaMax); default is Character.MAX_CODE_POINT.
|
||||
*/
|
||||
public LevenshteinAutomata(int[] word, int alphaMax, boolean withTranspositions) {
|
||||
this.word = word;
|
||||
this.alphaMax = alphaMax;
|
||||
|
||||
|
||||
// calculate the alphabet
|
||||
SortedSet<Integer> set = new TreeSet<Integer>();
|
||||
for (int i = 0; i < word.length; i++)
|
||||
set.add(word[i]);
|
||||
for (int i = 0; i < word.length; i++) {
|
||||
int v = word[i];
|
||||
if (v > alphaMax) {
|
||||
throw new IllegalArgumentException("alphaMax exceeded by symbol " + v + " in word");
|
||||
}
|
||||
set.add(v);
|
||||
}
|
||||
alphabet = new int[set.size()];
|
||||
Iterator<Integer> iterator = set.iterator();
|
||||
for (int i = 0; i < alphabet.length; i++)
|
||||
|
|
|
@ -72,7 +72,6 @@ public final class FuzzySuggester extends AnalyzingSuggester {
|
|||
private final boolean transpositions;
|
||||
private final int nonFuzzyPrefix;
|
||||
private final int minFuzzyLength;
|
||||
private final boolean allowSepEdit;
|
||||
|
||||
/**
|
||||
* The default minimum length of the key passed to {@link
|
||||
|
@ -91,11 +90,6 @@ public final class FuzzySuggester extends AnalyzingSuggester {
|
|||
*/
|
||||
public static final int DEFAULT_MAX_EDITS = 1;
|
||||
|
||||
/**
|
||||
* We allow token separator to be deleted/inserted, by default.
|
||||
*/
|
||||
public static final boolean DEFAULT_ALLOW_SEP_EDIT = true;
|
||||
|
||||
/**
|
||||
* Creates a {@link FuzzySuggester} instance initialized with default values.
|
||||
*
|
||||
|
@ -115,7 +109,7 @@ public final class FuzzySuggester extends AnalyzingSuggester {
|
|||
*/
|
||||
public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
|
||||
this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, true,
|
||||
DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_ALLOW_SEP_EDIT);
|
||||
DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -139,12 +133,11 @@ public final class FuzzySuggester extends AnalyzingSuggester {
|
|||
* Levenshtein algorithm.
|
||||
* @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX})
|
||||
* @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
|
||||
* @param allowSepEdit if true, the token separator is allowed to be an edit (so words may be split/joined) (see default {@link #DEFAULT_ALLOW_SEP_EDIT})
|
||||
*/
|
||||
public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer,
|
||||
int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
|
||||
int maxEdits, boolean transpositions, int nonFuzzyPrefix,
|
||||
int minFuzzyLength, boolean allowSepEdit) {
|
||||
int minFuzzyLength) {
|
||||
super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);
|
||||
if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
|
||||
throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
|
||||
|
@ -160,7 +153,6 @@ public final class FuzzySuggester extends AnalyzingSuggester {
|
|||
this.transpositions = transpositions;
|
||||
this.nonFuzzyPrefix = nonFuzzyPrefix;
|
||||
this.minFuzzyLength = minFuzzyLength;
|
||||
this.allowSepEdit = allowSepEdit;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -206,7 +198,7 @@ public final class FuzzySuggester extends AnalyzingSuggester {
|
|||
// to allow the trailing dedup bytes to be
|
||||
// edited... but then 0 byte is "in general" allowed
|
||||
// on input (but not in UTF8).
|
||||
LevenshteinAutomata lev = new LevenshteinAutomata(ints, allowSepEdit ? 255 : 254, transpositions);
|
||||
LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);
|
||||
Automaton levAutomaton = lev.toAutomaton(maxEdits);
|
||||
Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
|
||||
combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
|
||||
|
|
|
@ -184,7 +184,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
|
|||
int options = 0;
|
||||
|
||||
Analyzer a = new MockAnalyzer(random());
|
||||
FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, 1, true, 1, 3, true);
|
||||
FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, 1, true, 1, 3);
|
||||
suggester.build(new TermFreqArrayIterator(keys));
|
||||
// TODO: would be nice if "ab " would allow the test to
|
||||
// pass, and more generally if the analyzer can know
|
||||
|
@ -387,7 +387,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
|
|||
public void testExactFirst() throws Exception {
|
||||
|
||||
Analyzer a = getUnusualAnalyzer();
|
||||
FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3, true);
|
||||
FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3);
|
||||
suggester.build(new TermFreqArrayIterator(new TermFreq[] {
|
||||
new TermFreq("x y", 1),
|
||||
new TermFreq("x y z", 3),
|
||||
|
@ -426,7 +426,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
|
|||
public void testNonExactFirst() throws Exception {
|
||||
|
||||
Analyzer a = getUnusualAnalyzer();
|
||||
FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3, true);
|
||||
FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3);
|
||||
|
||||
suggester.build(new TermFreqArrayIterator(new TermFreq[] {
|
||||
new TermFreq("x y", 1),
|
||||
|
@ -645,7 +645,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
|
|||
|
||||
Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);
|
||||
FuzzySuggester suggester = new FuzzySuggester(a, a,
|
||||
preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3, true);
|
||||
preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3);
|
||||
suggester.build(new TermFreqArrayIterator(keys));
|
||||
|
||||
for (String prefix : allPrefixes) {
|
||||
|
@ -775,10 +775,9 @@ public class FuzzySuggesterTest extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception {
|
||||
Analyzer a = new MockAnalyzer(random());
|
||||
FuzzySuggester suggester = new FuzzySuggester(a, a, 0, 2, -1, 1, true, 1, 3, true);
|
||||
FuzzySuggester suggester = new FuzzySuggester(a, a, 0, 2, -1, 1, true, 1, 3);
|
||||
|
||||
List<TermFreq> keys = Arrays.asList(new TermFreq[] {
|
||||
new TermFreq("a", 40),
|
||||
|
@ -796,6 +795,26 @@ public class FuzzySuggesterTest extends LuceneTestCase {
|
|||
assertEquals("a ", results.get(1).key);
|
||||
assertEquals(50, results.get(1).value);
|
||||
}
|
||||
|
||||
public void testEditSeps() throws Exception {
|
||||
Analyzer a = new MockAnalyzer(random());
|
||||
FuzzySuggester suggester = new FuzzySuggester(a, a, FuzzySuggester.PRESERVE_SEP, 2, -1, 2, true, 1, 3);
|
||||
|
||||
List<TermFreq> keys = Arrays.asList(new TermFreq[] {
|
||||
new TermFreq("foo bar", 40),
|
||||
new TermFreq("foo bar baz", 50),
|
||||
new TermFreq("barbaz", 60),
|
||||
new TermFreq("barbazfoo", 10),
|
||||
});
|
||||
|
||||
Collections.shuffle(keys, random());
|
||||
suggester.build(new TermFreqArrayIterator(keys));
|
||||
|
||||
assertEquals("[foo bar baz/50, foo bar/40]", suggester.lookup("foobar", false, 5).toString());
|
||||
assertEquals("[foo bar baz/50]", suggester.lookup("foobarbaz", false, 5).toString());
|
||||
assertEquals("[barbaz/60, barbazfoo/10]", suggester.lookup("bar baz", false, 5).toString());
|
||||
assertEquals("[barbazfoo/10]", suggester.lookup("bar baz foo", false, 5).toString());
|
||||
}
|
||||
|
||||
private static String addRandomEdit(String string, int prefixLength) {
|
||||
char[] input = string.toCharArray();
|
||||
|
@ -891,7 +910,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
|
|||
boolean transpositions = random().nextBoolean();
|
||||
// TODO: test graph analyzers
|
||||
// TODO: test exactFirst / preserveSep permutations
|
||||
FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, maxEdits, transpositions, prefixLen, 3, true);
|
||||
FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, maxEdits, transpositions, prefixLen, 3);
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: maxEdits=" + maxEdits + " prefixLen=" + prefixLen + " transpositions=" + transpositions + " num=" + NUM);
|
||||
|
|
Loading…
Reference in New Issue