diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 1f0f329f7e1..88040a7ecd8 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -49,6 +49,10 @@ New Features for better search performance. (Han Jiang, Adrien Grand, Robert Muir, Mike McCandless) +* LUCENE-3846: New FuzzySuggester, like AnalyzingSuggester except it + also finds completions allowing for fuzzy edits in the input string. + (Robert Muir, Simon Willnauer, Mike McCandless) + API Changes * LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries diff --git a/lucene/common-build.xml b/lucene/common-build.xml index d1fd0f080b4..b1d4899170e 100644 --- a/lucene/common-build.xml +++ b/lucene/common-build.xml @@ -833,7 +833,7 @@ - + diff --git a/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java b/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java index 7a011c038ac..2b53641a046 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Writer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; @@ -88,6 +89,7 @@ public class TokenStreamToAutomaton { final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class); final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class); final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class); + final BytesRef term = termBytesAtt.getBytesRef(); in.reset(); diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java index 128c7d975e0..0a793a638cc 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java @@ -240,6 +240,20 @@ final public class BasicAutomata { a.deterministic = true; return a; } + + public static Automaton makeString(int[] word, int offset, int length) { + Automaton a = new Automaton(); + a.setDeterministic(true); + State s = new State(); + a.initial = s; + for (int i = offset; i < offset+length; i++) { + State s2 = new State(); + s.addTransition(new Transition(word[i], s2)); + s = s2; + } + s.accept = true; + return a; + } /** * Returns a new (deterministic and minimal) automaton that accepts the union diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java index 96211428861..92384c450f1 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java @@ -33,12 +33,13 @@ public class LevenshteinAutomata { /** @lucene.internal */ public static final int MAXIMUM_SUPPORTED_DISTANCE = 2; /* input word */ - final String input; final int word[]; /* the automata alphabet. */ final int alphabet[]; + /* the maximum symbol in the alphabet (e.g. 
255 for UTF-8 or 10FFFF for UTF-32) */ + final int alphaMax; - /* the unicode ranges outside of alphabet */ + /* the ranges outside of alphabet */ final int rangeLower[]; final int rangeUpper[]; int numRanges = 0; @@ -50,17 +51,26 @@ public class LevenshteinAutomata { * Optionally count transpositions as a primitive edit. */ public LevenshteinAutomata(String input, boolean withTranspositions) { - this.input = input; - int length = Character.codePointCount(input, 0, input.length()); - word = new int[length]; - for (int i = 0, j = 0, cp = 0; i < input.length(); i += Character.charCount(cp)) { - word[j++] = cp = input.codePointAt(i); - } - + this(codePoints(input), Character.MAX_CODE_POINT, withTranspositions); + } + + /** + * Expert: specify a custom maximum possible symbol + * (alphaMax); default is Character.MAX_CODE_POINT. + */ + public LevenshteinAutomata(int[] word, int alphaMax, boolean withTranspositions) { + this.word = word; + this.alphaMax = alphaMax; + // calculate the alphabet SortedSet set = new TreeSet(); - for (int i = 0; i < word.length; i++) - set.add(word[i]); + for (int i = 0; i < word.length; i++) { + int v = word[i]; + if (v > alphaMax) { + throw new IllegalArgumentException("alphaMax exceeded by symbol " + v + " in word"); + } + set.add(v); + } alphabet = new int[set.size()]; Iterator iterator = set.iterator(); for (int i = 0; i < alphabet.length; i++) @@ -81,9 +91,9 @@ public class LevenshteinAutomata { lower = higher + 1; } /* add the final endpoint */ - if (lower <= Character.MAX_CODE_POINT) { + if (lower <= alphaMax) { rangeLower[numRanges] = lower; - rangeUpper[numRanges] = Character.MAX_CODE_POINT; + rangeUpper[numRanges] = alphaMax; numRanges++; } @@ -94,6 +104,15 @@ public class LevenshteinAutomata { }; } + private static int[] codePoints(String input) { + int length = Character.codePointCount(input, 0, input.length()); + int word[] = new int[length]; + for (int i = 0, j = 0, cp = 0; i < input.length(); i += Character.charCount(cp)) { + word[j++] = cp = input.codePointAt(i); + } + return word; + } + /** * Compute a DFA that accepts all strings within an edit distance of n. *

@@ -106,8 +125,9 @@ public class LevenshteinAutomata { *

*/ public Automaton toAutomaton(int n) { - if (n == 0) - return BasicAutomata.makeString(input); + if (n == 0) { + return BasicAutomata.makeString(word, 0, word.length); + } if (n >= descriptions.length) return null; diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java index cbfbacb19f9..4326b4dab74 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java @@ -22,6 +22,8 @@ import java.util.*; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.fst.FST.Arc; +import org.apache.lucene.util.fst.FST.BytesReader; /** Static helper methods. * @@ -304,7 +306,10 @@ public final class Util { path.input.ints[path.input.length++] = path.arc.label; final int cmp = bottom.input.compareTo(path.input); path.input.length--; + + // We should never see dups: assert cmp != 0; + if (cmp < 0) { // Doesn't compete return; @@ -846,4 +851,93 @@ public final class Util { w.close(); } */ + + /** + * Reads the first arc greater or equal that the given label into the provided + * arc in place and returns it iff found, otherwise return null. + * + * @param label the label to ceil on + * @param fst the fst to operate on + * @param follow the arc to follow reading the label from + * @param arc the arc to read into in place + * @param in the fst's {@link BytesReader} + */ + public static Arc readCeilArc(int label, FST fst, Arc follow, + Arc arc, BytesReader in) throws IOException { + // TODO maybe this is a useful in the FST class - we could simplify some other code like FSTEnum? + if (label == FST.END_LABEL) { + if (follow.isFinal()) { + if (follow.target <= 0) { + arc.flags = FST.BIT_LAST_ARC; + } else { + arc.flags = 0; + // NOTE: nextArc is a node (not an address!) in this case: + arc.nextArc = follow.target; + arc.node = follow.target; + } + arc.output = follow.nextFinalOutput; + arc.label = FST.END_LABEL; + return arc; + } else { + return null; + } + } + + if (!FST.targetHasArcs(follow)) { + return null; + } + fst.readFirstTargetArc(follow, arc, in); + if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) { + // Arcs are fixed array -- use binary search to find + // the target. + + int low = arc.arcIdx; + int high = arc.numArcs - 1; + int mid = 0; + // System.out.println("do arc array low=" + low + " high=" + high + + // " targetLabel=" + targetLabel); + while (low <= high) { + mid = (low + high) >>> 1; + in.pos = arc.posArcsStart; + in.skip(arc.bytesPerArc * mid + 1); + final int midLabel = fst.readLabel(in); + final int cmp = midLabel - label; + // System.out.println(" cycle low=" + low + " high=" + high + " mid=" + + // mid + " midLabel=" + midLabel + " cmp=" + cmp); + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + arc.arcIdx = mid-1; + return fst.readNextRealArc(arc, in); + } + } + if (low == arc.numArcs) { + // DEAD END! + return null; + } + + arc.arcIdx = (low > high ? high : low); + return fst.readNextRealArc(arc, in); + } + + // Linear scan + fst.readFirstRealTargetArc(follow.target, arc, in); + + while (true) { + // System.out.println(" non-bs cycle"); + // TODO: we should fix this code to not have to create + // object for the output of every arc we scan... 
only + // for the matching arc, if found + if (arc.label >= label) { + // System.out.println(" found!"); + return arc; + } else if (arc.isLast()) { + return null; + } else { + fst.readNextRealArc(arc, in); + } + } + } } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java index f803dd79cdb..28a77085365 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java @@ -31,6 +31,7 @@ import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStreamToAutomaton; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.suggest.Lookup; import org.apache.lucene.search.suggest.fst.Sort; @@ -310,7 +311,7 @@ public class AnalyzingSuggester extends Lookup { } } - private TokenStreamToAutomaton getTokenStreamToAutomaton() { + TokenStreamToAutomaton getTokenStreamToAutomaton() { if (preserveSep) { return new EscapingTokenStreamToAutomaton(); } else { @@ -332,6 +333,7 @@ public class AnalyzingSuggester extends Lookup { BytesRef scratch = new BytesRef(); TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton(); + // analyzed sequence + 0(byte) + weight(int) + surface + analyzedLength(short) boolean success = false; byte buffer[] = new byte[8]; @@ -339,29 +341,8 @@ public class AnalyzingSuggester extends Lookup { ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); BytesRef surfaceForm; while ((surfaceForm = iterator.next()) != null) { - - // Analyze surface form: - TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString())); - - // Create corresponding automaton: labels are bytes - // from each analyzed token, with byte 0 used as - // separator between tokens: - Automaton automaton = ts2a.toAutomaton(ts); - ts.end(); - ts.close(); - - replaceSep(automaton); - - assert SpecialOperations.isFinite(automaton); - - // Get all paths from the automaton (there can be - // more than one path, eg if the analyzer created a - // graph using SynFilter or WDF): - - // TODO: we could walk & add simultaneously, so we - // don't have to alloc [possibly biggish] - // intermediate HashSet in RAM: - Set paths = SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions); + Set paths = toFiniteStrings(surfaceForm, ts2a); + maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size()); for (IntsRef path : paths) { @@ -510,27 +491,10 @@ public class AnalyzingSuggester extends Lookup { } //System.out.println("lookup key=" + key + " num=" + num); - + final BytesRef utf8Key = new BytesRef(key); try { - // TODO: is there a Reader from a CharSequence? - // Turn tokenstream into automaton: - TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString())); - Automaton automaton = getTokenStreamToAutomaton().toAutomaton(ts); - ts.end(); - ts.close(); - - // TODO: we could use the end offset to "guess" - // whether the final token was a partial token; this - // would only be a heuristic ... but maybe an OK one. - // This way we could eg differentiate "net" from "net ", - // which we can't today... 
- - replaceSep(automaton); - - // TODO: we can optimize this somewhat by determinizing - // while we convert - BasicOperations.determinize(automaton); + Automaton lookupAutomaton = toLookupAutomaton(key); final CharsRef spare = new CharsRef(); @@ -538,8 +502,7 @@ public class AnalyzingSuggester extends Lookup { // Intersect automaton w/ suggest wFST and get all // prefix starting nodes & their outputs: - final List>> prefixPaths; - prefixPaths = FSTUtil.intersectPrefixPaths(automaton, fst); + //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst); //System.out.println(" prefixPaths: " + prefixPaths.size()); @@ -549,6 +512,8 @@ public class AnalyzingSuggester extends Lookup { final List results = new ArrayList(); + List>> prefixPaths = FSTUtil.intersectPrefixPaths(lookupAutomaton, fst); + if (exactFirst) { int count = 0; @@ -593,9 +558,9 @@ public class AnalyzingSuggester extends Lookup { // nodes we have and the // maxSurfaceFormsPerAnalyzedForm: for(MinResult> completion : completions) { - spare.grow(completion.output.output2.length); - UnicodeUtil.UTF8toUTF16(completion.output.output2, spare); - if (CHARSEQUENCE_COMPARATOR.compare(spare, key) == 0) { + if (utf8Key.bytesEquals(completion.output.output2)) { + spare.grow(completion.output.output2.length); + UnicodeUtil.UTF8toUTF16(completion.output.output2, spare); results.add(new LookupResult(spare.toString(), decodeWeight(completion.output.output1))); break; } @@ -630,9 +595,7 @@ public class AnalyzingSuggester extends Lookup { // In exactFirst mode, don't accept any paths // matching the surface form since that will // create duplicate results: - spare.grow(output.output2.length); - UnicodeUtil.UTF8toUTF16(output.output2, spare); - if (CHARSEQUENCE_COMPARATOR.compare(spare, key) == 0) { + if (utf8Key.bytesEquals(output.output2)) { // We found exact match, which means we should // have already found it in the first search: assert results.size() == 1; @@ -644,6 +607,8 @@ public class AnalyzingSuggester extends Lookup { } }; + prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst); + for (FSTUtil.Path> path : prefixPaths) { searcher.addStartPaths(path.fstNode, path.output, true, path.input); } @@ -654,6 +619,10 @@ public class AnalyzingSuggester extends Lookup { spare.grow(completion.output.output2.length); UnicodeUtil.UTF8toUTF16(completion.output.output2, spare); LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1)); + + // TODO: for fuzzy case would be nice to return + // how many edits were required + //System.out.println(" result=" + result); results.add(result); @@ -670,6 +639,63 @@ public class AnalyzingSuggester extends Lookup { } } + /** Returns all prefix paths to initialize the search. 
*/ + protected List>> getFullPrefixPaths(List>> prefixPaths, + Automaton lookupAutomaton, + FST> fst) + throws IOException { + return prefixPaths; + } + + final Set toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException { + // Analyze surface form: + TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString())); + + // Create corresponding automaton: labels are bytes + // from each analyzed token, with byte 0 used as + // separator between tokens: + Automaton automaton = ts2a.toAutomaton(ts); + ts.end(); + ts.close(); + + replaceSep(automaton); + + assert SpecialOperations.isFinite(automaton); + + // Get all paths from the automaton (there can be + // more than one path, eg if the analyzer created a + // graph using SynFilter or WDF): + + // TODO: we could walk & add simultaneously, so we + // don't have to alloc [possibly biggish] + // intermediate HashSet in RAM: + return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions); + } + + final Automaton toLookupAutomaton(final CharSequence key) throws IOException { + // TODO: is there a Reader from a CharSequence? + // Turn tokenstream into automaton: + TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString())); + Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts); + ts.end(); + ts.close(); + + // TODO: we could use the end offset to "guess" + // whether the final token was a partial token; this + // would only be a heuristic ... but maybe an OK one. + // This way we could eg differentiate "net" from "net ", + // which we can't today... + + replaceSep(automaton); + + // TODO: we can optimize this somewhat by determinizing + // while we convert + BasicOperations.determinize(automaton); + return automaton; + } + + + /** * Returns the weight associated with an input string, * or null if it does not exist. diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java index c22da8f2369..686ae3b59e8 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java @@ -26,6 +26,7 @@ import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.State; import org.apache.lucene.util.automaton.Transition; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.Util; // TODO: move to core? nobody else uses it yet though... @@ -62,57 +63,78 @@ public class FSTUtil { } } - /** Enumerates all paths in the automaton that also - * intersect the FST, accumulating the FST end node and - * output for each path. */ - public static List> intersectPrefixPaths(Automaton a, FST fst) throws IOException { + /** + * Enumerates all minimal prefix paths in the automaton that also intersect the FST, + * accumulating the FST end node and output for each path. 
+ */ + public static List> intersectPrefixPaths(Automaton a, FST fst) + throws IOException { + assert a.isDeterministic(); final List> queue = new ArrayList>(); final List> endNodes = new ArrayList>(); - - queue.add(new Path(a.getInitialState(), - fst.getFirstArc(new FST.Arc()), - fst.outputs.getNoOutput(), - new IntsRef())); - + queue.add(new Path(a.getInitialState(), fst + .getFirstArc(new FST.Arc()), fst.outputs.getNoOutput(), + new IntsRef())); + final FST.Arc scratchArc = new FST.Arc(); final FST.BytesReader fstReader = fst.getBytesReader(0); - - //System.out.println("fst/a intersect"); - + while (queue.size() != 0) { - final Path path = queue.remove(queue.size()-1); - //System.out.println(" cycle path=" + path); + final Path path = queue.remove(queue.size() - 1); if (path.state.isAccept()) { endNodes.add(path); + // we can stop here if we accept this path, + // we accept all further paths too + continue; } - + IntsRef currentInput = path.input; - for(Transition t : path.state.getTransitions()) { - - // TODO: we can fix this if necessary: - if (t.getMin() != t.getMax()) { - throw new IllegalStateException("can only handle Transitions that match one character"); - } - - //System.out.println(" t=" + (char) t.getMin()); - - final FST.Arc nextArc = fst.findTargetArc(t.getMin(), path.fstNode, scratchArc, fstReader); - if (nextArc != null) { - //System.out.println(" fst matches"); - // Path continues: - IntsRef newInput = new IntsRef(currentInput.length + 1); - newInput.copyInts(currentInput); - newInput.ints[currentInput.length] = t.getMin(); - newInput.length = currentInput.length + 1; - - queue.add(new Path(t.getDest(), - new FST.Arc().copyFrom(nextArc), - fst.outputs.add(path.output, nextArc.output), - newInput)); + for (Transition t : path.state.getTransitions()) { + final int min = t.getMin(); + final int max = t.getMax(); + if (min == max) { + final FST.Arc nextArc = fst.findTargetArc(t.getMin(), + path.fstNode, scratchArc, fstReader); + if (nextArc != null) { + final IntsRef newInput = new IntsRef(currentInput.length + 1); + newInput.copyInts(currentInput); + newInput.ints[currentInput.length] = t.getMin(); + newInput.length = currentInput.length + 1; + queue.add(new Path(t.getDest(), new FST.Arc() + .copyFrom(nextArc), fst.outputs + .add(path.output, nextArc.output), newInput)); + } + } else { + // TODO: if this transition's TO state is accepting, and + // it accepts the entire range possible in the FST (ie. 0 to 255), + // we can simply use the prefix as the accepted state instead of + // looking up all the ranges and terminate early + // here. This just shifts the work from one queue + // (this one) to another (the completion search + // done in AnalyzingSuggester). + FST.Arc nextArc = Util.readCeilArc(min, fst, path.fstNode, + scratchArc, fstReader); + while (nextArc != null && nextArc.label <= max) { + assert nextArc.label <= max; + assert nextArc.label >= min : nextArc.label + " " + + min; + final IntsRef newInput = new IntsRef(currentInput.length + 1); + newInput.copyInts(currentInput); + newInput.ints[currentInput.length] = nextArc.label; + newInput.length = currentInput.length + 1; + queue.add(new Path(t.getDest(), new FST.Arc() + .copyFrom(nextArc), fst.outputs + .add(path.output, nextArc.output), newInput)); + final int label = nextArc.label; // used in assert + nextArc = nextArc.isLast() ? 
null : fst.readNextRealArc(nextArc, + fstReader); + assert nextArc == null || label < nextArc.label : "last: " + label + + " next: " + nextArc.label; + } } } } - return endNodes; } + } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java new file mode 100644 index 00000000000..2169c43f282 --- /dev/null +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java @@ -0,0 +1,226 @@ +package org.apache.lucene.search.suggest.analyzing; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.util.Arrays; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadocs +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.BasicAutomata; +import org.apache.lucene.util.automaton.BasicOperations; +import org.apache.lucene.util.automaton.LevenshteinAutomata; +import org.apache.lucene.util.automaton.SpecialOperations; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.PairOutputs.Pair; + +/** + * Implements a fuzzy {@link AnalyzingSuggester}. The similarity measurement is + * based on the Damerau-Levenshtein (optimal string alignment) algorithm, though + * you can explicitly choose classic Levenshtein by passing false + * for the transpositions parameter. + *

+ * At most, this query will match terms up to
+ * {@value org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}
+ * edits. Higher distances are not supported. Note that the
+ * fuzzy distance is measured in "byte space" on the bytes
+ * returned by the {@link TokenStream}'s {@link
+ * TermToBytesRefAttribute}, usually UTF8. By default
+ * the analyzed bytes must be at least 3 {@link
+ * #DEFAULT_MIN_FUZZY_LENGTH} bytes before any edits are
+ * considered. Furthermore, the first 1 {@link
+ * #DEFAULT_NON_FUZZY_PREFIX} byte is not allowed to be
+ * edited. We allow up to 1 {@link
+ * #DEFAULT_MAX_EDITS} edit.
+ *
+ *

+ * NOTE: This suggester does not boost suggestions that + * required no edits over suggestions that did require + * edits. This is a known limitation. + * + *

+ * Note: complex query analyzers can have a significant impact on lookup
+ * performance. To keep the complexity of the prefix intersection low (and
+ * lookup fast), it is recommended not to use query analyzers that drop or
+ * inject terms, such as synonym filters. At index time, complex analyzers
+ * can safely be used.
+ *

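+ * <p>
+ * A minimal usage sketch (illustrative only; the analyzer variable and the
+ * sample entries are hypothetical and not part of this patch):
+ * <pre>
+ *   FuzzySuggester suggester = new FuzzySuggester(analyzer);
+ *   suggester.build(new TermFreqArrayIterator(new TermFreq[] {
+ *       new TermFreq("lucene in action", 20),
+ *       new TermFreq("lucene for dummies", 10)}));
+ *   // "lucnee" is one transposition away from the "lucene" prefix, so
+ *   // both entries can still be suggested:
+ *   List&lt;LookupResult&gt; results = suggester.lookup("lucnee", false, 5);
+ * </pre>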
+ */ +public final class FuzzySuggester extends AnalyzingSuggester { + private final int maxEdits; + private final boolean transpositions; + private final int nonFuzzyPrefix; + private final int minFuzzyLength; + + /** + * The default minimum length of the key passed to {@link + * #lookup} before any edits are allowed. + */ + public static final int DEFAULT_MIN_FUZZY_LENGTH = 3; + + /** + * The default prefix length where edits are not allowed. + */ + public static final int DEFAULT_NON_FUZZY_PREFIX = 1; + + /** + * The default maximum number of edits for fuzzy + * suggestions. + */ + public static final int DEFAULT_MAX_EDITS = 1; + + /** + * Creates a {@link FuzzySuggester} instance initialized with default values. + * + * @param analyzer the analyzer used for this suggester + */ + public FuzzySuggester(Analyzer analyzer) { + this(analyzer, analyzer); + } + + /** + * Creates a {@link FuzzySuggester} instance with an index & a query analyzer initialized with default values. + * + * @param indexAnalyzer + * Analyzer that will be used for analyzing suggestions while building the index. + * @param queryAnalyzer + * Analyzer that will be used for analyzing query text during lookup + */ + public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) { + this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, true, + DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH); + } + + /** + * Creates a {@link FuzzySuggester} instance. + * + * @param indexAnalyzer Analyzer that will be used for + * analyzing suggestions while building the index. + * @param queryAnalyzer Analyzer that will be used for + * analyzing query text during lookup + * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP} + * @param maxSurfaceFormsPerAnalyzedForm Maximum number of + * surface forms to keep for a single analyzed form. + * When there are too many surface forms we discard the + * lowest weighted ones. + * @param maxGraphExpansions Maximum number of graph paths + * to expand from the analyzed form. Set this to -1 for + * no limit. + * @param maxEdits must be >= 0 and <= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} . + * @param transpositions true if transpositions should be treated as a primitive + * edit operation. If this is false, comparisons will implement the classic + * Levenshtein algorithm. 
+ * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX})
+ * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
+ */
+ public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer,
+     int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
+     int maxEdits, boolean transpositions, int nonFuzzyPrefix,
+     int minFuzzyLength) {
+   super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);
+   if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
+     throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
+   }
+   if (nonFuzzyPrefix < 0) {
+     throw new IllegalArgumentException("nonFuzzyPrefix must not be < 0 (got " + nonFuzzyPrefix + ")");
+   }
+   if (minFuzzyLength < 0) {
+     throw new IllegalArgumentException("minFuzzyLength must not be < 0 (got " + minFuzzyLength + ")");
+   }
+
+   this.maxEdits = maxEdits;
+   this.transpositions = transpositions;
+   this.nonFuzzyPrefix = nonFuzzyPrefix;
+   this.minFuzzyLength = minFuzzyLength;
+ }
+
+ @Override
+ protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
+     Automaton lookupAutomaton,
+     FST<Pair<Long,BytesRef>> fst)
+     throws IOException {
+
+   // TODO: right now there's no penalty for fuzzy/edits,
+   // ie a completion whose prefix matched exactly what the
+   // user typed gets no boost over completions that
+   // required an edit, which get no boost over completions
+   // requiring two edits. I suspect a multiplicative
+   // factor is appropriate (eg, say a fuzzy match must be at
+   // least 2X better weight than the non-fuzzy match to
+   // "compete") ... in which case I think the wFST needs
+   // to be log weights or something ...
+
+   Automaton levA = toLevenshteinAutomata(lookupAutomaton);
+   /*
+     Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
+     w.write(levA.toDot());
+     w.close();
+     System.out.println("Wrote LevA to out.dot");
+   */
+   return FSTUtil.intersectPrefixPaths(levA, fst);
+ }
+
+ Automaton toLevenshteinAutomata(Automaton automaton) {
+   final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
+   Automaton subs[] = new Automaton[ref.size()];
+   int upto = 0;
+   for (IntsRef path : ref) {
+     if (path.length <= nonFuzzyPrefix || path.length < minFuzzyLength) {
+       subs[upto] = BasicAutomata.makeString(path.ints, path.offset, path.length);
+       upto++;
+     } else {
+       Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, nonFuzzyPrefix);
+       int ints[] = new int[path.length-nonFuzzyPrefix];
+       System.arraycopy(path.ints, path.offset+nonFuzzyPrefix, ints, 0, ints.length);
+       // TODO: maybe add alphaMin to LevenshteinAutomata,
+       // and pass 1 instead of 0? We probably don't want
+       // to allow the trailing dedup bytes to be
+       // edited... but then 0 byte is "in general" allowed
+       // on input (but not in UTF8).
+ LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions); + Automaton levAutomaton = lev.toAutomaton(maxEdits); + Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton)); + combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already + subs[upto] = combined; + upto++; + } + } + + if (subs.length == 0) { + return BasicAutomata.makeEmpty(); // matches nothing + } else if (subs.length == 1) { + return subs[0]; + } else { + Automaton a = BasicOperations.union(Arrays.asList(subs)); + // TODO: we could call toLevenshteinAutomata() before det? + // this only happens if you have multiple paths anyway (e.g. synonyms) + BasicOperations.determinize(a); + + // Does not seem to help (and hurt maybe a bit: 6-9 + // prefix went from 19 to 18 kQPS): + // a.reduce(); + return a; + } + } +} diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java index 6dbb9600777..4a50e8d0d0e 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java @@ -36,6 +36,7 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.search.suggest.Lookup; // javadocs import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester; +import org.apache.lucene.search.suggest.analyzing.FuzzySuggester; import org.apache.lucene.search.suggest.fst.FSTCompletionLookup; import org.apache.lucene.search.suggest.fst.WFSTCompletionLookup; import org.apache.lucene.search.suggest.jaspell.JaspellLookup; @@ -51,17 +52,20 @@ import org.junit.Ignore; public class LookupBenchmarkTest extends LuceneTestCase { @SuppressWarnings("unchecked") private final List> benchmarkClasses = Arrays.asList( + FuzzySuggester.class, + AnalyzingSuggester.class, JaspellLookup.class, TSTLookup.class, FSTCompletionLookup.class, - WFSTCompletionLookup.class, - AnalyzingSuggester.class); + WFSTCompletionLookup.class + + ); private final static int rounds = 15; private final static int warmup = 5; private final int num = 7; - private final boolean onlyMorePopular = true; + private final boolean onlyMorePopular = false; private final static Random random = new Random(0xdeadbeef); @@ -212,8 +216,9 @@ public class LookupBenchmarkTest extends LuceneTestCase { final List input = new ArrayList(benchmarkInput.size()); for (TermFreq tf : benchmarkInput) { String s = tf.term.utf8ToString(); - input.add(s.substring(0, Math.min(s.length(), - minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1)))); + String sub = s.substring(0, Math.min(s.length(), + minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1))); + input.add(sub); } BenchmarkResult result = measure(new Callable() { @@ -250,7 +255,9 @@ public class LookupBenchmarkTest extends LuceneTestCase { } return new BenchmarkResult(times, warmup, rounds); } catch (Exception e) { + e.printStackTrace(); throw new RuntimeException(e); + } } diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java new file mode 100644 index 00000000000..f7398a8e62f --- /dev/null +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java @@ -0,0 +1,1128 
@@ +package org.apache.lucene.search.suggest.analyzing; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CannedTokenStream; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenFilter; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.TokenStreamToAutomaton; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.search.suggest.Lookup.LookupResult; +import org.apache.lucene.search.suggest.TermFreq; +import org.apache.lucene.search.suggest.TermFreqArrayIterator; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.State; +import org.apache.lucene.util.fst.Util; + +public class FuzzySuggesterTest extends LuceneTestCase { + + public void testRandomEdits() throws IOException { + List keys = new ArrayList(); + int numTerms = atLeast(100); + for (int i = 0; i < numTerms; i++) { + keys.add(new TermFreq("boo" + _TestUtil.randomSimpleString(random()), 1 + random().nextInt(100))); + } + keys.add(new TermFreq("foo bar boo far", 12)); + FuzzySuggester suggester = new FuzzySuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)); + suggester.build(new TermFreqArrayIterator(keys)); + int numIters = atLeast(10); + for (int i = 0; i < numIters; i++) { + String addRandomEdit = addRandomEdit("foo bar boo", FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX); + List results = suggester.lookup(_TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2); + assertEquals(addRandomEdit, 1, results.size()); + assertEquals("foo bar boo far", results.get(0).key.toString()); + assertEquals(12, results.get(0).value, 0.01F); + } + } + + /** this is basically the WFST test ported to KeywordAnalyzer. 
so it acts the same */ + public void testKeyword() throws Exception { + TermFreq keys[] = new TermFreq[] { + new TermFreq("foo", 50), + new TermFreq("bar", 10), + new TermFreq("barbar", 12), + new TermFreq("barbara", 6) + }; + + FuzzySuggester suggester = new FuzzySuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)); + suggester.build(new TermFreqArrayIterator(keys)); + + List results = suggester.lookup(_TestUtil.stringToCharSequence("bariar", random()), false, 2); + assertEquals(2, results.size()); + assertEquals("barbar", results.get(0).key.toString()); + assertEquals(12, results.get(0).value, 0.01F); + + results = suggester.lookup(_TestUtil.stringToCharSequence("barbr", random()), false, 2); + assertEquals(2, results.size()); + assertEquals("barbar", results.get(0).key.toString()); + assertEquals(12, results.get(0).value, 0.01F); + + results = suggester.lookup(_TestUtil.stringToCharSequence("barbara", random()), false, 2); + assertEquals(2, results.size()); + assertEquals("barbara", results.get(0).key.toString()); + assertEquals(6, results.get(0).value, 0.01F); + + results = suggester.lookup(_TestUtil.stringToCharSequence("barbar", random()), false, 2); + assertEquals(2, results.size()); + assertEquals("barbar", results.get(0).key.toString()); + assertEquals(12, results.get(0).value, 0.01F); + assertEquals("barbara", results.get(1).key.toString()); + assertEquals(6, results.get(1).value, 0.01F); + + results = suggester.lookup(_TestUtil.stringToCharSequence("barbaa", random()), false, 2); + assertEquals(2, results.size()); + assertEquals("barbar", results.get(0).key.toString()); + assertEquals(12, results.get(0).value, 0.01F); + assertEquals("barbara", results.get(1).key.toString()); + assertEquals(6, results.get(1).value, 0.01F); + + // top N of 2, but only foo is available + results = suggester.lookup(_TestUtil.stringToCharSequence("f", random()), false, 2); + assertEquals(1, results.size()); + assertEquals("foo", results.get(0).key.toString()); + assertEquals(50, results.get(0).value, 0.01F); + + // top N of 1 for 'bar': we return this even though + // barbar is higher because exactFirst is enabled: + results = suggester.lookup(_TestUtil.stringToCharSequence("bar", random()), false, 1); + assertEquals(1, results.size()); + assertEquals("bar", results.get(0).key.toString()); + assertEquals(10, results.get(0).value, 0.01F); + + // top N Of 2 for 'b' + results = suggester.lookup(_TestUtil.stringToCharSequence("b", random()), false, 2); + assertEquals(2, results.size()); + assertEquals("barbar", results.get(0).key.toString()); + assertEquals(12, results.get(0).value, 0.01F); + assertEquals("bar", results.get(1).key.toString()); + assertEquals(10, results.get(1).value, 0.01F); + + // top N of 3 for 'ba' + results = suggester.lookup(_TestUtil.stringToCharSequence("ba", random()), false, 3); + assertEquals(3, results.size()); + assertEquals("barbar", results.get(0).key.toString()); + assertEquals(12, results.get(0).value, 0.01F); + assertEquals("bar", results.get(1).key.toString()); + assertEquals(10, results.get(1).value, 0.01F); + assertEquals("barbara", results.get(2).key.toString()); + assertEquals(6, results.get(2).value, 0.01F); + } + + /** + * basic "standardanalyzer" test with stopword removal + */ + public void testStandard() throws Exception { + TermFreq keys[] = new TermFreq[] { + new TermFreq("the ghost of christmas past", 50), + }; + + Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false); + 
FuzzySuggester suggester = new FuzzySuggester(standard); + suggester.build(new TermFreqArrayIterator(keys)); + + List results = suggester.lookup(_TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1); + assertEquals(1, results.size()); + assertEquals("the ghost of christmas past", results.get(0).key.toString()); + assertEquals(50, results.get(0).value, 0.01F); + + // omit the 'the' since its a stopword, its suggested anyway + results = suggester.lookup(_TestUtil.stringToCharSequence("ghost of chris", random()), false, 1); + assertEquals(1, results.size()); + assertEquals("the ghost of christmas past", results.get(0).key.toString()); + assertEquals(50, results.get(0).value, 0.01F); + + // omit the 'the' and 'of' since they are stopwords, its suggested anyway + results = suggester.lookup(_TestUtil.stringToCharSequence("ghost chris", random()), false, 1); + assertEquals(1, results.size()); + assertEquals("the ghost of christmas past", results.get(0).key.toString()); + assertEquals(50, results.get(0).value, 0.01F); + } + + public void testNoSeps() throws Exception { + TermFreq[] keys = new TermFreq[] { + new TermFreq("ab cd", 0), + new TermFreq("abcd", 1), + }; + + int options = 0; + + Analyzer a = new MockAnalyzer(random()); + FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, 1, true, 1, 3); + suggester.build(new TermFreqArrayIterator(keys)); + // TODO: would be nice if "ab " would allow the test to + // pass, and more generally if the analyzer can know + // that the user's current query has ended at a word, + // but, analyzers don't produce SEP tokens! + List r = suggester.lookup(_TestUtil.stringToCharSequence("ab c", random()), false, 2); + assertEquals(2, r.size()); + + // With no PRESERVE_SEPS specified, "ab c" should also + // complete to "abcd", which has higher weight so should + // appear first: + assertEquals("abcd", r.get(0).key.toString()); + } + + public void testGraphDups() throws Exception { + + final Analyzer analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); + + return new TokenStreamComponents(tokenizer) { + int tokenStreamCounter = 0; + final TokenStream[] tokenStreams = new TokenStream[] { + new CannedTokenStream(new Token[] { + token("wifi",1,1), + token("hotspot",0,2), + token("network",1,1), + token("is",1,1), + token("slow",1,1) + }), + new CannedTokenStream(new Token[] { + token("wi",1,1), + token("hotspot",0,3), + token("fi",1,1), + token("network",1,1), + token("is",1,1), + token("fast",1,1) + + }), + new CannedTokenStream(new Token[] { + token("wifi",1,1), + token("hotspot",0,2), + token("network",1,1) + }), + }; + + @Override + public TokenStream getTokenStream() { + TokenStream result = tokenStreams[tokenStreamCounter]; + tokenStreamCounter++; + return result; + } + + @Override + protected void setReader(final Reader reader) throws IOException { + } + }; + } + }; + + TermFreq keys[] = new TermFreq[] { + new TermFreq("wifi network is slow", 50), + new TermFreq("wi fi network is fast", 10), + }; + FuzzySuggester suggester = new FuzzySuggester(analyzer); + suggester.build(new TermFreqArrayIterator(keys)); + + List results = suggester.lookup("wifi network", false, 10); + if (VERBOSE) { + System.out.println("Results: " + results); + } + assertEquals(2, results.size()); + assertEquals("wifi network is slow", results.get(0).key); + assertEquals(50, results.get(0).value); + 
assertEquals("wi fi network is fast", results.get(1).key); + assertEquals(10, results.get(1).value); + } + + + + public void testInputPathRequired() throws Exception { + + // SynonymMap.Builder b = new SynonymMap.Builder(false); + // b.add(new CharsRef("ab"), new CharsRef("ba"), true); + // final SynonymMap map = b.build(); + + // The Analyzer below mimics the functionality of the SynonymAnalyzer + // using the above map, so that the suggest module does not need a dependency on the + // synonym module + + final Analyzer analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); + + return new TokenStreamComponents(tokenizer) { + int tokenStreamCounter = 0; + final TokenStream[] tokenStreams = new TokenStream[] { + new CannedTokenStream(new Token[] { + token("ab",1,1), + token("ba",0,1), + token("xc",1,1) + }), + new CannedTokenStream(new Token[] { + token("ba",1,1), + token("xd",1,1) + }), + new CannedTokenStream(new Token[] { + token("ab",1,1), + token("ba",0,1), + token("x",1,1) + }) + }; + + @Override + public TokenStream getTokenStream() { + TokenStream result = tokenStreams[tokenStreamCounter]; + tokenStreamCounter++; + return result; + } + + @Override + protected void setReader(final Reader reader) throws IOException { + } + }; + } + }; + + TermFreq keys[] = new TermFreq[] { + new TermFreq("ab xc", 50), + new TermFreq("ba xd", 50), + }; + FuzzySuggester suggester = new FuzzySuggester(analyzer); + suggester.build(new TermFreqArrayIterator(keys)); + List results = suggester.lookup("ab x", false, 1); + assertTrue(results.size() == 1); + } + + private static Token token(String term, int posInc, int posLength) { + final Token t = new Token(term, 0, 0); + t.setPositionIncrement(posInc); + t.setPositionLength(posLength); + return t; + } + + /* + private void printTokens(final Analyzer analyzer, String input) throws IOException { + System.out.println("Tokens for " + input); + TokenStream ts = analyzer.tokenStream("", new StringReader(input)); + ts.reset(); + final TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class); + final PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); + final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class); + + while(ts.incrementToken()) { + termBytesAtt.fillBytesRef(); + System.out.println(String.format("%s,%s,%s", termBytesAtt.getBytesRef().utf8ToString(), posIncAtt.getPositionIncrement(), posLengthAtt.getPositionLength())); + } + ts.end(); + ts.close(); + } + */ + + private final Analyzer getUnusualAnalyzer() { + return new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); + + return new TokenStreamComponents(tokenizer) { + + int count; + + @Override + public TokenStream getTokenStream() { + // 4th time we are called, return tokens a b, + // else just a: + if (count++ != 3) { + return new CannedTokenStream(new Token[] { + token("a", 1, 1), + }); + } else { + // After that "a b": + return new CannedTokenStream(new Token[] { + token("a", 1, 1), + token("b", 1, 1), + }); + } + } + + @Override + protected void setReader(final Reader reader) throws IOException { + } + }; + } + }; + } + + public void testExactFirst() throws Exception { + + Analyzer a = getUnusualAnalyzer(); + 
FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3); + suggester.build(new TermFreqArrayIterator(new TermFreq[] { + new TermFreq("x y", 1), + new TermFreq("x y z", 3), + new TermFreq("x", 2), + new TermFreq("z z z", 20), + })); + + //System.out.println("ALL: " + suggester.lookup("x y", false, 6)); + + for(int topN=1;topN<6;topN++) { + List results = suggester.lookup("x y", false, topN); + //System.out.println("topN=" + topN + " " + results); + + assertEquals(Math.min(topN, 4), results.size()); + + assertEquals("x y", results.get(0).key); + assertEquals(1, results.get(0).value); + + if (topN > 1) { + assertEquals("z z z", results.get(1).key); + assertEquals(20, results.get(1).value); + + if (topN > 2) { + assertEquals("x y z", results.get(2).key); + assertEquals(3, results.get(2).value); + + if (topN > 3) { + assertEquals("x", results.get(3).key); + assertEquals(2, results.get(3).value); + } + } + } + } + } + + public void testNonExactFirst() throws Exception { + + Analyzer a = getUnusualAnalyzer(); + FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3); + + suggester.build(new TermFreqArrayIterator(new TermFreq[] { + new TermFreq("x y", 1), + new TermFreq("x y z", 3), + new TermFreq("x", 2), + new TermFreq("z z z", 20), + })); + + for(int topN=1;topN<6;topN++) { + List results = suggester.lookup("p", false, topN); + + assertEquals(Math.min(topN, 4), results.size()); + + assertEquals("z z z", results.get(0).key); + assertEquals(20, results.get(0).value); + + if (topN > 1) { + assertEquals("x y z", results.get(1).key); + assertEquals(3, results.get(1).value); + + if (topN > 2) { + assertEquals("x", results.get(2).key); + assertEquals(2, results.get(2).value); + + if (topN > 3) { + assertEquals("x y", results.get(3).key); + assertEquals(1, results.get(3).value); + } + } + } + } + } + + // Holds surface form seperately: + private static class TermFreq2 implements Comparable { + public final String surfaceForm; + public final String analyzedForm; + public final long weight; + + public TermFreq2(String surfaceForm, String analyzedForm, long weight) { + this.surfaceForm = surfaceForm; + this.analyzedForm = analyzedForm; + this.weight = weight; + } + + @Override + public int compareTo(TermFreq2 other) { + int cmp = analyzedForm.compareTo(other.analyzedForm); + if (cmp != 0) { + return cmp; + } else if (weight > other.weight) { + return -1; + } else if (weight < other.weight) { + return 1; + } else { + assert false; + return 0; + } + } + } + + static boolean isStopChar(char ch, int numStopChars) { + //System.out.println("IS? 
" + ch + ": " + (ch - 'a') + ": " + ((ch - 'a') < numStopChars)); + return (ch - 'a') < numStopChars; + } + + // Like StopFilter: + private static class TokenEater extends TokenFilter { + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final int numStopChars; + private final boolean preserveHoles; + private boolean first; + + public TokenEater(boolean preserveHoles, TokenStream in, int numStopChars) { + super(in); + this.preserveHoles = preserveHoles; + this.numStopChars = numStopChars; + } + + @Override + public void reset() throws IOException { + super.reset(); + first = true; + } + + @Override + public final boolean incrementToken() throws IOException { + int skippedPositions = 0; + while (input.incrementToken()) { + if (termAtt.length() != 1 || !isStopChar(termAtt.charAt(0), numStopChars)) { + int posInc = posIncrAtt.getPositionIncrement() + skippedPositions; + if (first) { + if (posInc == 0) { + // first token having posinc=0 is illegal. + posInc = 1; + } + first = false; + } + posIncrAtt.setPositionIncrement(posInc); + //System.out.println("RETURN term=" + termAtt + " numStopChars=" + numStopChars); + return true; + } + if (preserveHoles) { + skippedPositions += posIncrAtt.getPositionIncrement(); + } + } + + return false; + } + } + + private static class MockTokenEatingAnalyzer extends Analyzer { + private int numStopChars; + private boolean preserveHoles; + + public MockTokenEatingAnalyzer(int numStopChars, boolean preserveHoles) { + this.preserveHoles = preserveHoles; + this.numStopChars = numStopChars; + } + + @Override + public TokenStreamComponents createComponents(String fieldName, Reader reader) { + MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH); + tokenizer.setEnableChecks(true); + TokenStream next; + if (numStopChars != 0) { + next = new TokenEater(preserveHoles, tokenizer, numStopChars); + } else { + next = tokenizer; + } + return new TokenStreamComponents(tokenizer, next); + } + } + + public void testRandom() throws Exception { + + int numQueries = atLeast(100); + + final List slowCompletor = new ArrayList(); + final TreeSet allPrefixes = new TreeSet(); + final Set seen = new HashSet(); + + TermFreq[] keys = new TermFreq[numQueries]; + + boolean preserveSep = random().nextBoolean(); + + final int numStopChars = random().nextInt(10); + final boolean preserveHoles = random().nextBoolean(); + + if (VERBOSE) { + System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles); + } + + for (int i = 0; i < numQueries; i++) { + int numTokens = _TestUtil.nextInt(random(), 1, 4); + String key; + String analyzedKey; + while(true) { + key = ""; + analyzedKey = ""; + for(int token=0;token < numTokens;token++) { + String s; + while (true) { + // TODO: would be nice to fix this slowCompletor/comparator to + // use full range, but we might lose some coverage too... 
+ s = _TestUtil.randomSimpleString(random()); + if (s.length() > 0) { + if (token > 0) { + key += " "; + } + if (preserveSep && analyzedKey.length() > 0 && analyzedKey.charAt(analyzedKey.length()-1) != ' ') { + analyzedKey += " "; + } + key += s; + if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) { + if (preserveSep && preserveHoles) { + analyzedKey += '\u0000'; + } + } else { + analyzedKey += s; + } + break; + } + } + } + + analyzedKey = analyzedKey.replaceAll("(^| )\u0000$", ""); + + // Don't add same surface form more than once: + if (!seen.contains(key)) { + seen.add(key); + break; + } + } + + for (int j = 1; j < key.length(); j++) { + allPrefixes.add(key.substring(0, j)); + } + // we can probably do Integer.MAX_VALUE here, but why worry. + int weight = random().nextInt(1<<24); + keys[i] = new TermFreq(key, weight); + + slowCompletor.add(new TermFreq2(key, analyzedKey, weight)); + } + + if (VERBOSE) { + // Don't just sort original list, to avoid VERBOSE + // altering the test: + List sorted = new ArrayList(slowCompletor); + Collections.sort(sorted); + for(TermFreq2 ent : sorted) { + System.out.println(" surface='" + ent.surfaceForm + " analyzed='" + ent.analyzedForm + "' weight=" + ent.weight); + } + } + + Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles); + FuzzySuggester suggester = new FuzzySuggester(a, a, + preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3); + suggester.build(new TermFreqArrayIterator(keys)); + + for (String prefix : allPrefixes) { + + if (VERBOSE) { + System.out.println("\nTEST: prefix=" + prefix); + } + + final int topN = _TestUtil.nextInt(random(), 1, 10); + List r = suggester.lookup(_TestUtil.stringToCharSequence(prefix, random()), false, topN); + + // 2. go thru whole set to find suggestions: + List matches = new ArrayList(); + + // "Analyze" the key: + String[] tokens = prefix.split(" "); + StringBuilder builder = new StringBuilder(); + for(int i=0;i 0 && !builder.toString().endsWith(" ")) { + builder.append(' '); + } + + if (token.length() == 1 && isStopChar(token.charAt(0), numStopChars)) { + if (preserveSep && preserveHoles) { + builder.append("\u0000"); + } + } else { + builder.append(token); + } + } + + String analyzedKey = builder.toString(); + + // Remove trailing sep/holes (TokenStream.end() does + // not tell us any trailing holes, yet ... there is an + // issue open for this): + while (true) { + String s = analyzedKey.replaceAll("(^| )\u0000$", ""); + s = s.replaceAll("\\s+$", ""); + if (s.equals(analyzedKey)) { + break; + } + analyzedKey = s; + } + + if (analyzedKey.length() == 0) { + // Currently suggester can't suggest from the empty + // string! You get no results, not all results... + continue; + } + + if (VERBOSE) { + System.out.println(" analyzed: " + analyzedKey); + } + TokenStreamToAutomaton tokenStreamToAutomaton = suggester.getTokenStreamToAutomaton(); + + // NOTE: not great that we ask the suggester to give + // us the "answer key" (ie maybe we have a bug in + // suggester.toLevA ...) ... but testRandom2() fixes + // this: + Automaton automaton = suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey)); + assertTrue(automaton.isDeterministic()); + // TODO: could be faster... 
but its slowCompletor for a reason + BytesRef spare = new BytesRef(); + for (TermFreq2 e : slowCompletor) { + spare.copyChars(e.analyzedForm); + Set finiteStrings = suggester.toFiniteStrings(spare, tokenStreamToAutomaton); + for (IntsRef intsRef : finiteStrings) { + State p = automaton.getInitialState(); + BytesRef ref = Util.toBytesRef(intsRef, spare); + boolean added = false; + for (int i = ref.offset; i < ref.length; i++) { + State q = p.step(ref.bytes[i] & 0xff); + if (q == null) { + break; + } else if (q.isAccept()) { + matches.add(new LookupResult(e.surfaceForm, e.weight)); + added = true; + break; + } + p = q; + } + if (!added && p.isAccept()) { + matches.add(new LookupResult(e.surfaceForm, e.weight)); + } + } + } + + assertTrue(numStopChars > 0 || matches.size() > 0); + + if (matches.size() > 1) { + Collections.sort(matches, new Comparator() { + public int compare(LookupResult left, LookupResult right) { + int cmp = Float.compare(right.value, left.value); + if (cmp == 0) { + return left.compareTo(right); + } else { + return cmp; + } + } + }); + } + + if (matches.size() > topN) { + matches = matches.subList(0, topN); + } + + if (VERBOSE) { + System.out.println(" expected:"); + for(LookupResult lr : matches) { + System.out.println(" key=" + lr.key + " weight=" + lr.value); + } + + System.out.println(" actual:"); + for(LookupResult lr : r) { + System.out.println(" key=" + lr.key + " weight=" + lr.value); + } + } + + assertEquals(prefix + " " + topN, matches.size(), r.size()); + for(int hit=0;hit keys = Arrays.asList(new TermFreq[] { + new TermFreq("a", 40), + new TermFreq("a ", 50), + new TermFreq(" a", 60), + }); + + Collections.shuffle(keys, random()); + suggester.build(new TermFreqArrayIterator(keys)); + + List results = suggester.lookup("a", false, 5); + assertEquals(2, results.size()); + assertEquals(" a", results.get(0).key); + assertEquals(60, results.get(0).value); + assertEquals("a ", results.get(1).key); + assertEquals(50, results.get(1).value); + } + + public void testEditSeps() throws Exception { + Analyzer a = new MockAnalyzer(random()); + FuzzySuggester suggester = new FuzzySuggester(a, a, FuzzySuggester.PRESERVE_SEP, 2, -1, 2, true, 1, 3); + + List keys = Arrays.asList(new TermFreq[] { + new TermFreq("foo bar", 40), + new TermFreq("foo bar baz", 50), + new TermFreq("barbaz", 60), + new TermFreq("barbazfoo", 10), + }); + + Collections.shuffle(keys, random()); + suggester.build(new TermFreqArrayIterator(keys)); + + assertEquals("[foo bar baz/50, foo bar/40]", suggester.lookup("foobar", false, 5).toString()); + assertEquals("[foo bar baz/50]", suggester.lookup("foobarbaz", false, 5).toString()); + assertEquals("[barbaz/60, barbazfoo/10]", suggester.lookup("bar baz", false, 5).toString()); + assertEquals("[barbazfoo/10]", suggester.lookup("bar baz foo", false, 5).toString()); + } + + private static String addRandomEdit(String string, int prefixLength) { + char[] input = string.toCharArray(); + StringBuilder builder = new StringBuilder(); + for (int i = 0; i < input.length; i++) { + if (i >= prefixLength && random().nextBoolean() && i < input.length-1) { + switch(random().nextInt(4)) { + case 3: + if (i < input.length-1) { + // Transpose input[i] and input[1+i]: + builder.append(input[i+1]); + builder.append(input[i]); + for(int j=i+2;j answers = new ArrayList(); + final Set seen = new HashSet(); + for(int i=0;i() { + @Override + public int compare(TermFreq a, TermFreq b) { + return a.term.compareTo(b.term); + } + }); + if (VERBOSE) { + System.out.println("\nTEST: targets"); 
+ for(TermFreq tf : answers) { + System.out.println(" " + tf.term.utf8ToString() + " freq=" + tf.v); + } + } + + Analyzer a = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false); + int maxEdits = random().nextBoolean() ? 1 : 2; + int prefixLen = random().nextInt(4); + boolean transpositions = random().nextBoolean(); + // TODO: test graph analyzers + // TODO: test exactFirst / preserveSep permutations + FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, maxEdits, transpositions, prefixLen, prefixLen); + + if (VERBOSE) { + System.out.println("TEST: maxEdits=" + maxEdits + " prefixLen=" + prefixLen + " transpositions=" + transpositions + " num=" + NUM); + } + + Collections.shuffle(answers, random()); + suggest.build(new TermFreqArrayIterator(answers.toArray(new TermFreq[answers.size()]))); + + final int ITERS = atLeast(100); + for(int iter=0;iter actual = suggest.lookup(frag, false, NUM); + if (VERBOSE) { + System.out.println(" actual: " + actual.size()); + for(LookupResult c : actual) { + System.out.println(" " + c); + } + } + + Collections.sort(actual, new CompareByCostThenAlpha()); + + final int limit = Math.min(expected.size(), actual.size()); + for(int ans=0;ans slowFuzzyMatch(int prefixLen, int maxEdits, boolean allowTransposition, List answers, String frag) { + final List results = new ArrayList(); + final int fragLen = frag.length(); + for(TermFreq tf : answers) { + //System.out.println(" check s=" + tf.term.utf8ToString()); + boolean prefixMatches = true; + for(int i=0;i 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) { + d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost); + } + } + } + + return d[n][m]; + } + + private static IntsRef toIntsRef(String s) { + IntsRef ref = new IntsRef(s.length()); // worst case + int utf16Len = s.length(); + for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) { + cp = ref.ints[ref.length++] = Character.codePointAt(s, i); + } + return ref; + } +}
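For reference, here is a small, self-contained sketch (not part of the patch) of the new int[]-based LevenshteinAutomata constructor that FuzzySuggester.toLevenshteinAutomata() builds on. The class name and sample word are made up, and it assumes BasicOperations.run() from the same automaton package:

import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.LevenshteinAutomata;

public class LevenshteinAutomataDemo {
  public static void main(String[] args) {
    // FuzzySuggester matches in "byte space", so the alphabet is capped at 255.
    int[] word = { 'b', 'a', 'r' };
    LevenshteinAutomata lev = new LevenshteinAutomata(word, 255, true);
    Automaton within1 = lev.toAutomaton(1); // accepts all strings within 1 edit
    System.out.println(BasicOperations.run(within1, "bar")); // true: 0 edits
    System.out.println(BasicOperations.run(within1, "baz")); // true: 1 substitution
    System.out.println(BasicOperations.run(within1, "xyz")); // false: 3 edits
  }
}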