From 83c7b92925d4c0c1e5dde4513d825228a974db54 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Mon, 29 Oct 2012 14:50:20 +0000 Subject: [PATCH] LUCENE-3846: resolve nocommits git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3846@1403334 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/util/automaton/Transition.java | 4 - .../suggest/analyzing/AnalyzingSuggester.java | 4 +- .../suggest/analyzing/FuzzySuggester.java | 132 ++++++++++-------- .../search/suggest/LookupBenchmarkTest.java | 2 +- .../suggest/analyzing/FuzzySuggesterTest.java | 38 +++-- 5 files changed, 104 insertions(+), 76 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java index d7a137b8dc2..d03bcbc69ba 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java @@ -139,9 +139,6 @@ public class Transition implements Cloneable { static void appendCharString(int c, StringBuilder b) { if (c >= 0x21 && c <= 0x7e && c != '\\' && c != '"') b.appendCodePoint(c); else { - b.append("\\\\U" + Integer.toHexString(c)); - // nocommit - /* b.append("\\\\U"); String s = Integer.toHexString(c); if (c < 0x10) b.append("0000000").append(s); @@ -152,7 +149,6 @@ public class Transition implements Cloneable { else if (c < 0x1000000) b.append("00").append(s); else if (c < 0x10000000) b.append("0").append(s); else b.append(s); - */ } } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java index 3497cb98a18..ba359b56619 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java @@ -619,8 +619,8 @@ public class AnalyzingSuggester extends Lookup { UnicodeUtil.UTF8toUTF16(completion.output.output2, spare); LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1)); - // nocommit for fuzzy case would be nice to return - // how many edits were required...: + // TODO: for fuzzy case would be nice to return + // how many edits were required //System.out.println(" result=" + result); results.add(result); diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java index 24ddcf2eb5b..94a2b47b8df 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java @@ -40,17 +40,26 @@ import org.apache.lucene.util.fst.PairOutputs.Pair; * Implements a fuzzy {@link AnalyzingSuggester}. The similarity measurement is * based on the Damerau-Levenshtein (optimal string alignment) algorithm, though * you can explicitly choose classic Levenshtein by passing false - * to the transpositions parameter. + * for the transpositions parameter. *

* At most, this query will match terms up to * {@value org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} - * edits. Higher distances (especially with transpositions enabled), are not - * supported. Note that the fuzzy distance is byte-by-byte - * as returned by the {@link TokenStream}'s {@link + * edits. Higher distances are not supported. Note that the + * fuzzy distance is measured in "byte space" on the bytes + * returned by the {@link TokenStream}'s {@link * TermToBytesRefAttribute}, usually UTF8. By default - * the first 2 (@link #DEFAULT_MIN_PREFIX) bytes must match, - * and by default we allow up to 1 (@link + * the analyzed bytes must be at least 3 {@link + * #DEFAULT_MIN_FUZZY_LENGTH} bytes before any edits are + * considered. Furthermore, the first 1 {@link + * #DEFAULT_NON_FUZZY_PREFIX} byte is not allowed to be + * edited. We allow up to 1 (@link * #DEFAULT_MAX_EDITS} edit. + * + *

+ * NOTE: This suggester does not boost suggestions that + * required no edits over suggestions that did require + * edits. This is a known limitation. + * *

* Note: complex query analyzers can have a significant impact on the lookup * performance. It's recommended to not use analyzers that drop or inject terms @@ -61,31 +70,36 @@ import org.apache.lucene.util.fst.PairOutputs.Pair; public final class FuzzySuggester extends AnalyzingSuggester { private final int maxEdits; private final boolean transpositions; - private final int minPrefix; + private final int nonFuzzyPrefix; + private final int minFuzzyLength; + private final boolean allowSepEdit; - // nocommit separate param for "min length before we - // enable fuzzy"? eg type "nusglasses" into google... - /** - * The default minimum shared (non-fuzzy) prefix. Set to 2 + * The default minimum length of the key passed to {@link + * #lookup} before any edits are allowed. */ - // nocommit should we do 1...? - public static final int DEFAULT_MIN_PREFIX = 2; + public static final int DEFAULT_MIN_FUZZY_LENGTH = 3; + + /** + * The default prefix length where edits are not allowed. + */ + public static final int DEFAULT_NON_FUZZY_PREFIX = 1; /** - * The default maximum number of edits for fuzzy suggestions. Set to 1 + * The default maximum number of edits for fuzzy + * suggestions. */ public static final int DEFAULT_MAX_EDITS = 1; + + /** + * We allow token separator to be deleted/inserted, by default. + */ + public static final boolean DEFAULT_ALLOW_SEP_EDIT = true; /** * Creates a {@link FuzzySuggester} instance initialized with default values. - * Calls - * {@link FuzzySuggester#FuzzySuggester(Analyzer, Analyzer, int, int, int, int, boolean, int)} - * FuzzySuggester(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, - * DEFAULT_MAX_EDITS, true, DEFAULT_MIN_PREFIX) * - * @param analyzer - * the analyzer used for this suggester + * @param analyzer the analyzer used for this suggester */ public FuzzySuggester(Analyzer analyzer) { this(analyzer, analyzer); @@ -93,10 +107,6 @@ public final class FuzzySuggester extends AnalyzingSuggester { /** * Creates a {@link FuzzySuggester} instance with an index & a query analyzer initialized with default values. - * Calls - * {@link FuzzySuggester#FuzzySuggester(Analyzer, Analyzer, int, int, int, int, boolean, int)} - * FuzzySuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, - * DEFAULT_MAX_EDITS, true, DEFAULT_MIN_PREFIX) * * @param indexAnalyzer * Analyzer that will be used for analyzing suggestions while building the index. @@ -104,7 +114,8 @@ public final class FuzzySuggester extends AnalyzingSuggester { * Analyzer that will be used for analyzing query text during lookup */ public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) { - this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, true, DEFAULT_MIN_PREFIX); + this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, true, + DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_ALLOW_SEP_EDIT); } /** @@ -122,26 +133,34 @@ public final class FuzzySuggester extends AnalyzingSuggester { * @param maxGraphExpansions Maximum number of graph paths * to expand from the analyzed form. Set this to -1 for * no limit. - * - * @param maxEdits must be >= 0 and <= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}. + * @param maxEdits must be >= 0 and <= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} . * @param transpositions true if transpositions should be treated as a primitive * edit operation. If this is false, comparisons will implement the classic * Levenshtein algorithm. - * @param minPrefix length of common (non-fuzzy) prefix - * + * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX} + * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH}) + * @param allowSepEdit if true, the token separater is allowed to be an edit (so words may be split/joined) (see default {@link #DEFAULT_ALLOW_SEP_EDIT}) */ public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, - int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, int maxEdits, boolean transpositions, int minPrefix) { + int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, + int maxEdits, boolean transpositions, int nonFuzzyPrefix, + int minFuzzyLength, boolean allowSepEdit) { super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions); if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE); } - if (minPrefix < 0) { - throw new IllegalArgumentException("minPrefix must not be < 0"); + if (nonFuzzyPrefix < 0) { + throw new IllegalArgumentException("nonFuzzyPrefix must not be >= 0 (got " + nonFuzzyPrefix + ")"); } + if (minFuzzyLength < 0) { + throw new IllegalArgumentException("minFuzzyLength must not be >= 0 (got " + minFuzzyLength + ")"); + } + this.maxEdits = maxEdits; this.transpositions = transpositions; - this.minPrefix = minPrefix; + this.nonFuzzyPrefix = nonFuzzyPrefix; + this.minFuzzyLength = minFuzzyLength; + this.allowSepEdit = allowSepEdit; } @Override @@ -149,12 +168,17 @@ public final class FuzzySuggester extends AnalyzingSuggester { Automaton lookupAutomaton, FST> fst) throws IOException { - // nocommit we don't "penalize" for edits - // ... shouldn't we? ie, ed=0 completions should have - // higher rank than ed=1, at the same "weight"? maybe - // we can punt on this for starters ... or maybe we - // can re-run each prefix path through lev0, lev1, - // lev2 to figure out the number of edits? + + // TODO: right now there's no penalty for fuzzy/edits, + // ie a completion whose prefix matched exactly what the + // user typed gets no boost over completions that + // required an edit, which get no boost over completions + // requiring two edits. I suspect a multiplicative + // factor is appropriate (eg, say a fuzzy match must be at + // least 2X better weight than the non-fuzzy match to + // "compete") ... in which case I think the wFST needs + // to be log weights or something ... + Automaton levA = toLevenshteinAutomata(lookupAutomaton); /* Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); @@ -170,21 +194,19 @@ public final class FuzzySuggester extends AnalyzingSuggester { Automaton subs[] = new Automaton[ref.size()]; int upto = 0; for (IntsRef path : ref) { - if (path.length <= minPrefix) { + if (path.length <= nonFuzzyPrefix || path.length < minFuzzyLength) { subs[upto] = BasicAutomata.makeString(path.ints, path.offset, path.length); upto++; } else { - Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, minPrefix); - int ints[] = new int[path.length-minPrefix]; - System.arraycopy(path.ints, path.offset+minPrefix, ints, 0, ints.length); - // nocommit i think we should pass 254 max? ie - // exclude 0xff ... this way we can't 'edit away' - // the sep? or ... maybe we want to allow that to - // be edited away? - // nocommit also the 0 byte ... we use that as - // trailer ... we probably shouldn't allow that byte - // to be edited (we could add alphaMin?) - LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions); + Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, nonFuzzyPrefix); + int ints[] = new int[path.length-nonFuzzyPrefix]; + System.arraycopy(path.ints, path.offset+nonFuzzyPrefix, ints, 0, ints.length); + // TODO: maybe add alphaMin to LevenshteinAutomata, + // and pass 1 instead of 0? We probably don't want + // to allow the trailing dedup bytes to be + // edited... but then 0 byte is "in general" allowed + // on input (but not in UTF8). + LevenshteinAutomata lev = new LevenshteinAutomata(ints, allowSepEdit ? 255 : 254, transpositions); Automaton levAutomaton = lev.toAutomaton(maxEdits); Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton)); combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already @@ -193,10 +215,6 @@ public final class FuzzySuggester extends AnalyzingSuggester { } } - // nocommit maybe we should reduce the LevN? the added - // arcs add cost during intersect (extra FST arc - // lookups...). could be net win... - if (subs.length == 0) { return BasicAutomata.makeEmpty(); // matches nothing } else if (subs.length == 1) { @@ -206,6 +224,10 @@ public final class FuzzySuggester extends AnalyzingSuggester { // TODO: we could call toLevenshteinAutomata() before det? // this only happens if you have multiple paths anyway (e.g. synonyms) BasicOperations.determinize(a); + + // Does not seem to help (and hurt maybe a bit: 6-9 + // prefix went from 19 to 18 kQPS): + // a.reduce(); return a; } } diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java index 49a5443be7e..4a50e8d0d0e 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java @@ -65,7 +65,7 @@ public class LookupBenchmarkTest extends LuceneTestCase { private final static int warmup = 5; private final int num = 7; - private final boolean onlyMorePopular = true; + private final boolean onlyMorePopular = false; private final static Random random = new Random(0xdeadbeef); diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java index bc4d72d579a..3a9dde83476 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java @@ -64,7 +64,7 @@ public class FuzzySuggesterTest extends LuceneTestCase { suggester.build(new TermFreqArrayIterator(keys)); int numIters = atLeast(10); for (int i = 0; i < numIters; i++) { - String addRandomEdit = addRandomEdit("foo bar boo", 2); + String addRandomEdit = addRandomEdit("foo bar boo", FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX); List results = suggester.lookup(_TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2); assertEquals(addRandomEdit, 1, results.size()); assertEquals("foo bar boo far", results.get(0).key.toString()); @@ -184,7 +184,7 @@ public class FuzzySuggesterTest extends LuceneTestCase { int options = 0; Analyzer a = new MockAnalyzer(random()); - FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, 1, true, 1); + FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, 1, true, 1, 3, true); suggester.build(new TermFreqArrayIterator(keys)); // TODO: would be nice if "ab " would allow the test to // pass, and more generally if the analyzer can know @@ -387,7 +387,7 @@ public class FuzzySuggesterTest extends LuceneTestCase { public void testExactFirst() throws Exception { Analyzer a = getUnusualAnalyzer(); - FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1); + FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3, true); suggester.build(new TermFreqArrayIterator(new TermFreq[] { new TermFreq("x y", 1), new TermFreq("x y z", 3), @@ -426,7 +426,7 @@ public class FuzzySuggesterTest extends LuceneTestCase { public void testNonExactFirst() throws Exception { Analyzer a = getUnusualAnalyzer(); - FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1); + FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3, true); suggester.build(new TermFreqArrayIterator(new TermFreq[] { new TermFreq("x y", 1), @@ -645,7 +645,7 @@ public class FuzzySuggesterTest extends LuceneTestCase { Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles); FuzzySuggester suggester = new FuzzySuggester(a, a, - preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1); + preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3, true); suggester.build(new TermFreqArrayIterator(keys)); for (String prefix : allPrefixes) { @@ -703,8 +703,10 @@ public class FuzzySuggesterTest extends LuceneTestCase { } TokenStreamToAutomaton tokenStreamToAutomaton = suggester.getTokenStreamToAutomaton(); - // nocommit this is putting fox in charge of hen - // house! ie maybe we have a bug in suggester.toLevA ... + // NOTE: not great that we ask the suggester to give + // us the "answer key" (ie maybe we have a bug in + // suggester.toLevA ...) ... but testRandom2() fixes + // this: Automaton automaton = suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey)); assertTrue(automaton.isDeterministic()); // TODO: could be faster... but its slowCompletor for a reason @@ -776,7 +778,7 @@ public class FuzzySuggesterTest extends LuceneTestCase { public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception { Analyzer a = new MockAnalyzer(random()); - FuzzySuggester suggester = new FuzzySuggester(a, a, 0, 2, -1, 1, true, 1); + FuzzySuggester suggester = new FuzzySuggester(a, a, 0, 2, -1, 1, true, 1, 3, true); List keys = Arrays.asList(new TermFreq[] { new TermFreq("a", 40), @@ -800,7 +802,18 @@ public class FuzzySuggesterTest extends LuceneTestCase { StringBuilder builder = new StringBuilder(); for (int i = 0; i < input.length; i++) { if (i >= prefixLength && random().nextBoolean() && i < input.length-1) { - switch(random().nextInt(3)) { + switch(random().nextInt(4)) { + case 3: + if (i < input.length-1) { + // Transpose input[i] and input[1+i]: + builder.append(input[i+1]); + builder.append(input[i]); + for(int j=i+2;j answers = new ArrayList(); final Set seen = new HashSet(); for(int i=0;i