mirror of https://github.com/apache/lucene.git
LUCENE-3846: remove allowSepEdit (it doesn't work); add test
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3846@1403413 13f79535-47bb-0310-9956-ffa450edef68
parent bd8ef39a3b
commit c9a38e3236
LevenshteinAutomata.java
@@ -36,7 +36,7 @@ public class LevenshteinAutomata {
   final int word[];
   /* the automata alphabet. */
   final int alphabet[];
-  /* the maximum symbol in the alphabet (e.g. 256 for UTF-8 or 10FFFF for UTF-32) */
+  /* the maximum symbol in the alphabet (e.g. 255 for UTF-8 or 10FFFF for UTF-32) */
   final int alphaMax;

   /* the ranges outside of alphabet */
@@ -55,7 +55,8 @@ public class LevenshteinAutomata {
   }

   /**
-   * Expert: Don't use this!
+   * Expert: specify a custom maximum possible symbol
+   * (alphaMax); default is Character.MAX_CODE_POINT.
    */
   public LevenshteinAutomata(int[] word, int alphaMax, boolean withTranspositions) {
     this.word = word;
@@ -63,8 +64,13 @@ public class LevenshteinAutomata {

     // calculate the alphabet
     SortedSet<Integer> set = new TreeSet<Integer>();
-    for (int i = 0; i < word.length; i++)
-      set.add(word[i]);
+    for (int i = 0; i < word.length; i++) {
+      int v = word[i];
+      if (v > alphaMax) {
+        throw new IllegalArgumentException("alphaMax exceeded by symbol " + v + " in word");
+      }
+      set.add(v);
+    }
     alphabet = new int[set.size()];
     Iterator<Integer> iterator = set.iterator();
     for (int i = 0; i < alphabet.length; i++)
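For illustration only (not part of this patch), the sketch below shows the effect of the new alphaMax check, assuming LevenshteinAutomata sits in org.apache.lucene.util.automaton as in the 4.x tree; the class name AlphaMaxCheckSketch and the sample symbols are made up.

import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;

public class AlphaMaxCheckSketch {
  public static void main(String[] args) {
    // All symbols of "foo" are <= 255, so the constructor accepts them and the
    // edit-distance-1 automaton can be built as before.
    LevenshteinAutomata lev = new LevenshteinAutomata(new int[] {'f', 'o', 'o'}, 255, true);
    Automaton within1 = lev.toAutomaton(1);

    // A symbol above alphaMax is now rejected up front by the check added in this patch.
    try {
      new LevenshteinAutomata(new int[] {0x10FFFF}, 255, true);
    } catch (IllegalArgumentException expected) {
      System.out.println(expected.getMessage()); // alphaMax exceeded by symbol 1114111 in word
    }
  }
}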
FuzzySuggester.java
@@ -72,7 +72,6 @@ public final class FuzzySuggester extends AnalyzingSuggester {
   private final boolean transpositions;
   private final int nonFuzzyPrefix;
   private final int minFuzzyLength;
-  private final boolean allowSepEdit;

   /**
    * The default minimum length of the key passed to {@link
@@ -91,11 +90,6 @@ public final class FuzzySuggester extends AnalyzingSuggester {
    */
   public static final int DEFAULT_MAX_EDITS = 1;

-  /**
-   * We allow token separator to be deleted/inserted, by default.
-   */
-  public static final boolean DEFAULT_ALLOW_SEP_EDIT = true;
-
   /**
    * Creates a {@link FuzzySuggester} instance initialized with default values.
    *
@@ -115,7 +109,7 @@ public final class FuzzySuggester extends AnalyzingSuggester {
    */
   public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
     this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, true,
-         DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_ALLOW_SEP_EDIT);
+         DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH);
   }

   /**
@@ -139,12 +133,11 @@ public final class FuzzySuggester extends AnalyzingSuggester {
    * Levenshtein algorithm.
    * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX}
    * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
-   * @param allowSepEdit if true, the token separater is allowed to be an edit (so words may be split/joined) (see default {@link #DEFAULT_ALLOW_SEP_EDIT})
    */
   public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer,
                         int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
                         int maxEdits, boolean transpositions, int nonFuzzyPrefix,
-                        int minFuzzyLength, boolean allowSepEdit) {
+                        int minFuzzyLength) {
     super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);
     if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
       throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
@@ -160,7 +153,6 @@ public final class FuzzySuggester extends AnalyzingSuggester {
     this.transpositions = transpositions;
     this.nonFuzzyPrefix = nonFuzzyPrefix;
     this.minFuzzyLength = minFuzzyLength;
-    this.allowSepEdit = allowSepEdit;
   }

   @Override
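For illustration only (not part of this patch), a minimal sketch of a caller using the updated expert constructor, which now ends at minFuzzyLength; the wrapper class name is made up, the package org.apache.lucene.search.suggest.analyzing is assumed, and the argument values simply mirror the defaults used elsewhere in this diff.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.suggest.analyzing.FuzzySuggester;

public class FuzzySuggesterConstructionSketch {
  // Builds a suggester via the nine-argument constructor; the trailing
  // allowSepEdit flag no longer exists.
  static FuzzySuggester build(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
    return new FuzzySuggester(indexAnalyzer, queryAnalyzer,
        FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP, // options
        256,                                      // maxSurfaceFormsPerAnalyzedForm
        -1,                                       // maxGraphExpansions, as in the default constructor
        FuzzySuggester.DEFAULT_MAX_EDITS,         // maxEdits
        true,                                     // transpositions
        FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX,  // nonFuzzyPrefix
        FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH); // minFuzzyLength
  }
}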
@@ -206,7 +198,7 @@ public final class FuzzySuggester extends AnalyzingSuggester {
     // to allow the trailing dedup bytes to be
     // edited... but then 0 byte is "in general" allowed
     // on input (but not in UTF8).
-    LevenshteinAutomata lev = new LevenshteinAutomata(ints, allowSepEdit ? 255 : 254, transpositions);
+    LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);
     Automaton levAutomaton = lev.toAutomaton(maxEdits);
     Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
     combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
FuzzySuggesterTest.java
@@ -184,7 +184,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
     int options = 0;

     Analyzer a = new MockAnalyzer(random());
-    FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, 1, true, 1, 3, true);
+    FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, 1, true, 1, 3);
     suggester.build(new TermFreqArrayIterator(keys));
     // TODO: would be nice if "ab " would allow the test to
     // pass, and more generally if the analyzer can know
@@ -387,7 +387,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
   public void testExactFirst() throws Exception {

     Analyzer a = getUnusualAnalyzer();
-    FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3, true);
+    FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3);
     suggester.build(new TermFreqArrayIterator(new TermFreq[] {
           new TermFreq("x y", 1),
           new TermFreq("x y z", 3),
@@ -426,7 +426,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
   public void testNonExactFirst() throws Exception {

     Analyzer a = getUnusualAnalyzer();
-    FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3, true);
+    FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3);

     suggester.build(new TermFreqArrayIterator(new TermFreq[] {
           new TermFreq("x y", 1),
@@ -645,7 +645,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {

       Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);
       FuzzySuggester suggester = new FuzzySuggester(a, a,
-                                                    preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3, true);
+                                                    preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3);
       suggester.build(new TermFreqArrayIterator(keys));

       for (String prefix : allPrefixes) {
@@ -775,10 +775,9 @@ public class FuzzySuggesterTest extends LuceneTestCase {
     }
   }

-
   public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception {
     Analyzer a = new MockAnalyzer(random());
-    FuzzySuggester suggester = new FuzzySuggester(a, a, 0, 2, -1, 1, true, 1, 3, true);
+    FuzzySuggester suggester = new FuzzySuggester(a, a, 0, 2, -1, 1, true, 1, 3);

     List<TermFreq> keys = Arrays.asList(new TermFreq[] {
         new TermFreq("a", 40),
@@ -797,6 +796,26 @@ public class FuzzySuggesterTest extends LuceneTestCase {
     assertEquals(50, results.get(1).value);
   }

+  public void testEditSeps() throws Exception {
+    Analyzer a = new MockAnalyzer(random());
+    FuzzySuggester suggester = new FuzzySuggester(a, a, FuzzySuggester.PRESERVE_SEP, 2, -1, 2, true, 1, 3);
+
+    List<TermFreq> keys = Arrays.asList(new TermFreq[] {
+        new TermFreq("foo bar", 40),
+        new TermFreq("foo bar baz", 50),
+        new TermFreq("barbaz", 60),
+        new TermFreq("barbazfoo", 10),
+      });
+
+    Collections.shuffle(keys, random());
+    suggester.build(new TermFreqArrayIterator(keys));
+
+    assertEquals("[foo bar baz/50, foo bar/40]", suggester.lookup("foobar", false, 5).toString());
+    assertEquals("[foo bar baz/50]", suggester.lookup("foobarbaz", false, 5).toString());
+    assertEquals("[barbaz/60, barbazfoo/10]", suggester.lookup("bar baz", false, 5).toString());
+    assertEquals("[barbazfoo/10]", suggester.lookup("bar baz foo", false, 5).toString());
+  }
+
   private static String addRandomEdit(String string, int prefixLength) {
     char[] input = string.toCharArray();
     StringBuilder builder = new StringBuilder();
@@ -891,7 +910,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
     boolean transpositions = random().nextBoolean();
     // TODO: test graph analyzers
     // TODO: test exactFirst / preserveSep permutations
-    FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, maxEdits, transpositions, prefixLen, 3, true);
+    FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, maxEdits, transpositions, prefixLen, 3);

     if (VERBOSE) {
       System.out.println("TEST: maxEdits=" + maxEdits + " prefixLen=" + prefixLen + " transpositions=" + transpositions + " num=" + NUM);