mirror of https://github.com/apache/lucene.git
LUCENE-3846: add new FuzzySuggester
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1403779 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
commit
d8e44bd09d
|
@ -49,6 +49,10 @@ New Features
|
||||||
for better search performance.
|
for better search performance.
|
||||||
(Han Jiang, Adrien Grand, Robert Muir, Mike McCandless)
|
(Han Jiang, Adrien Grand, Robert Muir, Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-3846: New FuzzySuggester, like AnalyzingSuggester except it
|
||||||
|
also finds completions allowing for fuzzy edits in the input string.
|
||||||
|
(Robert Muir, Simon Willnauer, Mike McCandless)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
* LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries
|
* LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries
|
||||||
|
|
|
@ -22,6 +22,7 @@ import java.io.IOException;
|
||||||
import java.io.OutputStreamWriter;
|
import java.io.OutputStreamWriter;
|
||||||
import java.io.Writer;
|
import java.io.Writer;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
|
||||||
|
@ -88,6 +89,7 @@ public class TokenStreamToAutomaton {
|
||||||
final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
|
final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
|
||||||
final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
|
final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
|
||||||
final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
|
final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
|
||||||
|
|
||||||
final BytesRef term = termBytesAtt.getBytesRef();
|
final BytesRef term = termBytesAtt.getBytesRef();
|
||||||
|
|
||||||
in.reset();
|
in.reset();
|
||||||
|
|
|
@ -241,6 +241,20 @@ final public class BasicAutomata {
|
||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static Automaton makeString(int[] word, int offset, int length) {
|
||||||
|
Automaton a = new Automaton();
|
||||||
|
a.setDeterministic(true);
|
||||||
|
State s = new State();
|
||||||
|
a.initial = s;
|
||||||
|
for (int i = offset; i < offset+length; i++) {
|
||||||
|
State s2 = new State();
|
||||||
|
s.addTransition(new Transition(word[i], s2));
|
||||||
|
s = s2;
|
||||||
|
}
|
||||||
|
s.accept = true;
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a new (deterministic and minimal) automaton that accepts the union
|
* Returns a new (deterministic and minimal) automaton that accepts the union
|
||||||
* of the given collection of {@link BytesRef}s representing UTF-8 encoded
|
* of the given collection of {@link BytesRef}s representing UTF-8 encoded
|
||||||
|
|
|
@ -33,12 +33,13 @@ public class LevenshteinAutomata {
|
||||||
/** @lucene.internal */
|
/** @lucene.internal */
|
||||||
public static final int MAXIMUM_SUPPORTED_DISTANCE = 2;
|
public static final int MAXIMUM_SUPPORTED_DISTANCE = 2;
|
||||||
/* input word */
|
/* input word */
|
||||||
final String input;
|
|
||||||
final int word[];
|
final int word[];
|
||||||
/* the automata alphabet. */
|
/* the automata alphabet. */
|
||||||
final int alphabet[];
|
final int alphabet[];
|
||||||
|
/* the maximum symbol in the alphabet (e.g. 255 for UTF-8 or 10FFFF for UTF-32) */
|
||||||
|
final int alphaMax;
|
||||||
|
|
||||||
/* the unicode ranges outside of alphabet */
|
/* the ranges outside of alphabet */
|
||||||
final int rangeLower[];
|
final int rangeLower[];
|
||||||
final int rangeUpper[];
|
final int rangeUpper[];
|
||||||
int numRanges = 0;
|
int numRanges = 0;
|
||||||
|
@ -50,17 +51,26 @@ public class LevenshteinAutomata {
|
||||||
* Optionally count transpositions as a primitive edit.
|
* Optionally count transpositions as a primitive edit.
|
||||||
*/
|
*/
|
||||||
public LevenshteinAutomata(String input, boolean withTranspositions) {
|
public LevenshteinAutomata(String input, boolean withTranspositions) {
|
||||||
this.input = input;
|
this(codePoints(input), Character.MAX_CODE_POINT, withTranspositions);
|
||||||
int length = Character.codePointCount(input, 0, input.length());
|
|
||||||
word = new int[length];
|
|
||||||
for (int i = 0, j = 0, cp = 0; i < input.length(); i += Character.charCount(cp)) {
|
|
||||||
word[j++] = cp = input.codePointAt(i);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Expert: specify a custom maximum possible symbol
|
||||||
|
* (alphaMax); default is Character.MAX_CODE_POINT.
|
||||||
|
*/
|
||||||
|
public LevenshteinAutomata(int[] word, int alphaMax, boolean withTranspositions) {
|
||||||
|
this.word = word;
|
||||||
|
this.alphaMax = alphaMax;
|
||||||
|
|
||||||
// calculate the alphabet
|
// calculate the alphabet
|
||||||
SortedSet<Integer> set = new TreeSet<Integer>();
|
SortedSet<Integer> set = new TreeSet<Integer>();
|
||||||
for (int i = 0; i < word.length; i++)
|
for (int i = 0; i < word.length; i++) {
|
||||||
set.add(word[i]);
|
int v = word[i];
|
||||||
|
if (v > alphaMax) {
|
||||||
|
throw new IllegalArgumentException("alphaMax exceeded by symbol " + v + " in word");
|
||||||
|
}
|
||||||
|
set.add(v);
|
||||||
|
}
|
||||||
alphabet = new int[set.size()];
|
alphabet = new int[set.size()];
|
||||||
Iterator<Integer> iterator = set.iterator();
|
Iterator<Integer> iterator = set.iterator();
|
||||||
for (int i = 0; i < alphabet.length; i++)
|
for (int i = 0; i < alphabet.length; i++)
|
||||||
|
@ -81,9 +91,9 @@ public class LevenshteinAutomata {
|
||||||
lower = higher + 1;
|
lower = higher + 1;
|
||||||
}
|
}
|
||||||
/* add the final endpoint */
|
/* add the final endpoint */
|
||||||
if (lower <= Character.MAX_CODE_POINT) {
|
if (lower <= alphaMax) {
|
||||||
rangeLower[numRanges] = lower;
|
rangeLower[numRanges] = lower;
|
||||||
rangeUpper[numRanges] = Character.MAX_CODE_POINT;
|
rangeUpper[numRanges] = alphaMax;
|
||||||
numRanges++;
|
numRanges++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -94,6 +104,15 @@ public class LevenshteinAutomata {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static int[] codePoints(String input) {
|
||||||
|
int length = Character.codePointCount(input, 0, input.length());
|
||||||
|
int word[] = new int[length];
|
||||||
|
for (int i = 0, j = 0, cp = 0; i < input.length(); i += Character.charCount(cp)) {
|
||||||
|
word[j++] = cp = input.codePointAt(i);
|
||||||
|
}
|
||||||
|
return word;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compute a DFA that accepts all strings within an edit distance of <code>n</code>.
|
* Compute a DFA that accepts all strings within an edit distance of <code>n</code>.
|
||||||
* <p>
|
* <p>
|
||||||
|
@ -106,8 +125,9 @@ public class LevenshteinAutomata {
|
||||||
* </p>
|
* </p>
|
||||||
*/
|
*/
|
||||||
public Automaton toAutomaton(int n) {
|
public Automaton toAutomaton(int n) {
|
||||||
if (n == 0)
|
if (n == 0) {
|
||||||
return BasicAutomata.makeString(input);
|
return BasicAutomata.makeString(word, 0, word.length);
|
||||||
|
}
|
||||||
|
|
||||||
if (n >= descriptions.length)
|
if (n >= descriptions.length)
|
||||||
return null;
|
return null;
|
||||||
|
|
|
@ -22,6 +22,8 @@ import java.util.*;
|
||||||
|
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
|
import org.apache.lucene.util.fst.FST.Arc;
|
||||||
|
import org.apache.lucene.util.fst.FST.BytesReader;
|
||||||
|
|
||||||
/** Static helper methods.
|
/** Static helper methods.
|
||||||
*
|
*
|
||||||
|
@ -304,7 +306,10 @@ public final class Util {
|
||||||
path.input.ints[path.input.length++] = path.arc.label;
|
path.input.ints[path.input.length++] = path.arc.label;
|
||||||
final int cmp = bottom.input.compareTo(path.input);
|
final int cmp = bottom.input.compareTo(path.input);
|
||||||
path.input.length--;
|
path.input.length--;
|
||||||
|
|
||||||
|
// We should never see dups:
|
||||||
assert cmp != 0;
|
assert cmp != 0;
|
||||||
|
|
||||||
if (cmp < 0) {
|
if (cmp < 0) {
|
||||||
// Doesn't compete
|
// Doesn't compete
|
||||||
return;
|
return;
|
||||||
|
@ -846,4 +851,93 @@ public final class Util {
|
||||||
w.close();
|
w.close();
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads the first arc greater than or equal to the given label into the provided
|
||||||
|
* arc in place and returns it iff found, otherwise return <code>null</code>.
|
||||||
|
*
|
||||||
|
* @param label the label to ceil on
|
||||||
|
* @param fst the fst to operate on
|
||||||
|
* @param follow the arc to follow reading the label from
|
||||||
|
* @param arc the arc to read into in place
|
||||||
|
* @param in the fst's {@link BytesReader}
|
||||||
|
*/
|
||||||
|
public static <T> Arc<T> readCeilArc(int label, FST<T> fst, Arc<T> follow,
|
||||||
|
Arc<T> arc, BytesReader in) throws IOException {
|
||||||
|
// TODO: maybe this would be useful in the FST class - we could simplify some other code like FSTEnum?
|
||||||
|
if (label == FST.END_LABEL) {
|
||||||
|
if (follow.isFinal()) {
|
||||||
|
if (follow.target <= 0) {
|
||||||
|
arc.flags = FST.BIT_LAST_ARC;
|
||||||
|
} else {
|
||||||
|
arc.flags = 0;
|
||||||
|
// NOTE: nextArc is a node (not an address!) in this case:
|
||||||
|
arc.nextArc = follow.target;
|
||||||
|
arc.node = follow.target;
|
||||||
|
}
|
||||||
|
arc.output = follow.nextFinalOutput;
|
||||||
|
arc.label = FST.END_LABEL;
|
||||||
|
return arc;
|
||||||
|
} else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!FST.targetHasArcs(follow)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
fst.readFirstTargetArc(follow, arc, in);
|
||||||
|
if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) {
|
||||||
|
// Arcs are fixed array -- use binary search to find
|
||||||
|
// the target.
|
||||||
|
|
||||||
|
int low = arc.arcIdx;
|
||||||
|
int high = arc.numArcs - 1;
|
||||||
|
int mid = 0;
|
||||||
|
// System.out.println("do arc array low=" + low + " high=" + high +
|
||||||
|
// " targetLabel=" + targetLabel);
|
||||||
|
while (low <= high) {
|
||||||
|
mid = (low + high) >>> 1;
|
||||||
|
in.pos = arc.posArcsStart;
|
||||||
|
in.skip(arc.bytesPerArc * mid + 1);
|
||||||
|
final int midLabel = fst.readLabel(in);
|
||||||
|
final int cmp = midLabel - label;
|
||||||
|
// System.out.println(" cycle low=" + low + " high=" + high + " mid=" +
|
||||||
|
// mid + " midLabel=" + midLabel + " cmp=" + cmp);
|
||||||
|
if (cmp < 0) {
|
||||||
|
low = mid + 1;
|
||||||
|
} else if (cmp > 0) {
|
||||||
|
high = mid - 1;
|
||||||
|
} else {
|
||||||
|
arc.arcIdx = mid-1;
|
||||||
|
return fst.readNextRealArc(arc, in);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (low == arc.numArcs) {
|
||||||
|
// DEAD END!
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
arc.arcIdx = (low > high ? high : low);
|
||||||
|
return fst.readNextRealArc(arc, in);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Linear scan
|
||||||
|
fst.readFirstRealTargetArc(follow.target, arc, in);
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
// System.out.println(" non-bs cycle");
|
||||||
|
// TODO: we should fix this code to not have to create
|
||||||
|
// object for the output of every arc we scan... only
|
||||||
|
// for the matching arc, if found
|
||||||
|
if (arc.label >= label) {
|
||||||
|
// System.out.println(" found!");
|
||||||
|
return arc;
|
||||||
|
} else if (arc.isLast()) {
|
||||||
|
return null;
|
||||||
|
} else {
|
||||||
|
fst.readNextRealArc(arc, in);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,6 +31,7 @@ import java.util.Set;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.TokenStreamToAutomaton;
|
import org.apache.lucene.analysis.TokenStreamToAutomaton;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
|
||||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||||
import org.apache.lucene.search.suggest.Lookup;
|
import org.apache.lucene.search.suggest.Lookup;
|
||||||
import org.apache.lucene.search.suggest.fst.Sort;
|
import org.apache.lucene.search.suggest.fst.Sort;
|
||||||
|
@ -310,7 +311,7 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private TokenStreamToAutomaton getTokenStreamToAutomaton() {
|
TokenStreamToAutomaton getTokenStreamToAutomaton() {
|
||||||
if (preserveSep) {
|
if (preserveSep) {
|
||||||
return new EscapingTokenStreamToAutomaton();
|
return new EscapingTokenStreamToAutomaton();
|
||||||
} else {
|
} else {
|
||||||
|
@ -332,6 +333,7 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
BytesRef scratch = new BytesRef();
|
BytesRef scratch = new BytesRef();
|
||||||
|
|
||||||
TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
|
TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
|
||||||
|
|
||||||
// analyzed sequence + 0(byte) + weight(int) + surface + analyzedLength(short)
|
// analyzed sequence + 0(byte) + weight(int) + surface + analyzedLength(short)
|
||||||
boolean success = false;
|
boolean success = false;
|
||||||
byte buffer[] = new byte[8];
|
byte buffer[] = new byte[8];
|
||||||
|
@ -339,29 +341,8 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
|
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
|
||||||
BytesRef surfaceForm;
|
BytesRef surfaceForm;
|
||||||
while ((surfaceForm = iterator.next()) != null) {
|
while ((surfaceForm = iterator.next()) != null) {
|
||||||
|
Set<IntsRef> paths = toFiniteStrings(surfaceForm, ts2a);
|
||||||
|
|
||||||
// Analyze surface form:
|
|
||||||
TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString()));
|
|
||||||
|
|
||||||
// Create corresponding automaton: labels are bytes
|
|
||||||
// from each analyzed token, with byte 0 used as
|
|
||||||
// separator between tokens:
|
|
||||||
Automaton automaton = ts2a.toAutomaton(ts);
|
|
||||||
ts.end();
|
|
||||||
ts.close();
|
|
||||||
|
|
||||||
replaceSep(automaton);
|
|
||||||
|
|
||||||
assert SpecialOperations.isFinite(automaton);
|
|
||||||
|
|
||||||
// Get all paths from the automaton (there can be
|
|
||||||
// more than one path, eg if the analyzer created a
|
|
||||||
// graph using SynFilter or WDF):
|
|
||||||
|
|
||||||
// TODO: we could walk & add simultaneously, so we
|
|
||||||
// don't have to alloc [possibly biggish]
|
|
||||||
// intermediate HashSet in RAM:
|
|
||||||
Set<IntsRef> paths = SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
|
|
||||||
maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());
|
maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());
|
||||||
|
|
||||||
for (IntsRef path : paths) {
|
for (IntsRef path : paths) {
|
||||||
|
@ -510,27 +491,10 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
}
|
}
|
||||||
|
|
||||||
//System.out.println("lookup key=" + key + " num=" + num);
|
//System.out.println("lookup key=" + key + " num=" + num);
|
||||||
|
final BytesRef utf8Key = new BytesRef(key);
|
||||||
try {
|
try {
|
||||||
|
|
||||||
// TODO: is there a Reader from a CharSequence?
|
Automaton lookupAutomaton = toLookupAutomaton(key);
|
||||||
// Turn tokenstream into automaton:
|
|
||||||
TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
|
|
||||||
Automaton automaton = getTokenStreamToAutomaton().toAutomaton(ts);
|
|
||||||
ts.end();
|
|
||||||
ts.close();
|
|
||||||
|
|
||||||
// TODO: we could use the end offset to "guess"
|
|
||||||
// whether the final token was a partial token; this
|
|
||||||
// would only be a heuristic ... but maybe an OK one.
|
|
||||||
// This way we could eg differentiate "net" from "net ",
|
|
||||||
// which we can't today...
|
|
||||||
|
|
||||||
replaceSep(automaton);
|
|
||||||
|
|
||||||
// TODO: we can optimize this somewhat by determinizing
|
|
||||||
// while we convert
|
|
||||||
BasicOperations.determinize(automaton);
|
|
||||||
|
|
||||||
final CharsRef spare = new CharsRef();
|
final CharsRef spare = new CharsRef();
|
||||||
|
|
||||||
|
@ -538,8 +502,7 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
|
|
||||||
// Intersect automaton w/ suggest wFST and get all
|
// Intersect automaton w/ suggest wFST and get all
|
||||||
// prefix starting nodes & their outputs:
|
// prefix starting nodes & their outputs:
|
||||||
final List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths;
|
//final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
|
||||||
prefixPaths = FSTUtil.intersectPrefixPaths(automaton, fst);
|
|
||||||
|
|
||||||
//System.out.println(" prefixPaths: " + prefixPaths.size());
|
//System.out.println(" prefixPaths: " + prefixPaths.size());
|
||||||
|
|
||||||
|
@ -549,6 +512,8 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
|
|
||||||
final List<LookupResult> results = new ArrayList<LookupResult>();
|
final List<LookupResult> results = new ArrayList<LookupResult>();
|
||||||
|
|
||||||
|
List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(lookupAutomaton, fst);
|
||||||
|
|
||||||
if (exactFirst) {
|
if (exactFirst) {
|
||||||
|
|
||||||
int count = 0;
|
int count = 0;
|
||||||
|
@ -593,9 +558,9 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
// nodes we have and the
|
// nodes we have and the
|
||||||
// maxSurfaceFormsPerAnalyzedForm:
|
// maxSurfaceFormsPerAnalyzedForm:
|
||||||
for(MinResult<Pair<Long,BytesRef>> completion : completions) {
|
for(MinResult<Pair<Long,BytesRef>> completion : completions) {
|
||||||
|
if (utf8Key.bytesEquals(completion.output.output2)) {
|
||||||
spare.grow(completion.output.output2.length);
|
spare.grow(completion.output.output2.length);
|
||||||
UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
|
UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
|
||||||
if (CHARSEQUENCE_COMPARATOR.compare(spare, key) == 0) {
|
|
||||||
results.add(new LookupResult(spare.toString(), decodeWeight(completion.output.output1)));
|
results.add(new LookupResult(spare.toString(), decodeWeight(completion.output.output1)));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -630,9 +595,7 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
// In exactFirst mode, don't accept any paths
|
// In exactFirst mode, don't accept any paths
|
||||||
// matching the surface form since that will
|
// matching the surface form since that will
|
||||||
// create duplicate results:
|
// create duplicate results:
|
||||||
spare.grow(output.output2.length);
|
if (utf8Key.bytesEquals(output.output2)) {
|
||||||
UnicodeUtil.UTF8toUTF16(output.output2, spare);
|
|
||||||
if (CHARSEQUENCE_COMPARATOR.compare(spare, key) == 0) {
|
|
||||||
// We found exact match, which means we should
|
// We found exact match, which means we should
|
||||||
// have already found it in the first search:
|
// have already found it in the first search:
|
||||||
assert results.size() == 1;
|
assert results.size() == 1;
|
||||||
|
@ -644,6 +607,8 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
|
||||||
|
|
||||||
for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
|
for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
|
||||||
searcher.addStartPaths(path.fstNode, path.output, true, path.input);
|
searcher.addStartPaths(path.fstNode, path.output, true, path.input);
|
||||||
}
|
}
|
||||||
|
@ -654,6 +619,10 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
spare.grow(completion.output.output2.length);
|
spare.grow(completion.output.output2.length);
|
||||||
UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
|
UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
|
||||||
LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1));
|
LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1));
|
||||||
|
|
||||||
|
// TODO: for fuzzy case would be nice to return
|
||||||
|
// how many edits were required
|
||||||
|
|
||||||
//System.out.println(" result=" + result);
|
//System.out.println(" result=" + result);
|
||||||
results.add(result);
|
results.add(result);
|
||||||
|
|
||||||
|
@ -670,6 +639,63 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns all prefix paths to initialize the search. */
|
||||||
|
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
|
||||||
|
Automaton lookupAutomaton,
|
||||||
|
FST<Pair<Long,BytesRef>> fst)
|
||||||
|
throws IOException {
|
||||||
|
return prefixPaths;
|
||||||
|
}
|
||||||
|
|
||||||
|
final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
|
||||||
|
// Analyze surface form:
|
||||||
|
TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString()));
|
||||||
|
|
||||||
|
// Create corresponding automaton: labels are bytes
|
||||||
|
// from each analyzed token, with byte 0 used as
|
||||||
|
// separator between tokens:
|
||||||
|
Automaton automaton = ts2a.toAutomaton(ts);
|
||||||
|
ts.end();
|
||||||
|
ts.close();
|
||||||
|
|
||||||
|
replaceSep(automaton);
|
||||||
|
|
||||||
|
assert SpecialOperations.isFinite(automaton);
|
||||||
|
|
||||||
|
// Get all paths from the automaton (there can be
|
||||||
|
// more than one path, eg if the analyzer created a
|
||||||
|
// graph using SynFilter or WDF):
|
||||||
|
|
||||||
|
// TODO: we could walk & add simultaneously, so we
|
||||||
|
// don't have to alloc [possibly biggish]
|
||||||
|
// intermediate HashSet in RAM:
|
||||||
|
return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
|
||||||
|
}
|
||||||
|
|
||||||
|
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
|
||||||
|
// TODO: is there a Reader from a CharSequence?
|
||||||
|
// Turn tokenstream into automaton:
|
||||||
|
TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
|
||||||
|
Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
|
||||||
|
ts.end();
|
||||||
|
ts.close();
|
||||||
|
|
||||||
|
// TODO: we could use the end offset to "guess"
|
||||||
|
// whether the final token was a partial token; this
|
||||||
|
// would only be a heuristic ... but maybe an OK one.
|
||||||
|
// This way we could eg differentiate "net" from "net ",
|
||||||
|
// which we can't today...
|
||||||
|
|
||||||
|
replaceSep(automaton);
|
||||||
|
|
||||||
|
// TODO: we can optimize this somewhat by determinizing
|
||||||
|
// while we convert
|
||||||
|
BasicOperations.determinize(automaton);
|
||||||
|
return automaton;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the weight associated with an input string,
|
* Returns the weight associated with an input string,
|
||||||
* or null if it does not exist.
|
* or null if it does not exist.
|
||||||
|
|
|
@ -26,6 +26,7 @@ import org.apache.lucene.util.automaton.Automaton;
|
||||||
import org.apache.lucene.util.automaton.State;
|
import org.apache.lucene.util.automaton.State;
|
||||||
import org.apache.lucene.util.automaton.Transition;
|
import org.apache.lucene.util.automaton.Transition;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
|
import org.apache.lucene.util.fst.Util;
|
||||||
|
|
||||||
// TODO: move to core? nobody else uses it yet though...
|
// TODO: move to core? nobody else uses it yet though...
|
||||||
|
|
||||||
|
@ -62,57 +63,78 @@ public class FSTUtil {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Enumerates all paths in the automaton that also
|
/**
|
||||||
* intersect the FST, accumulating the FST end node and
|
* Enumerates all minimal prefix paths in the automaton that also intersect the FST,
|
||||||
* output for each path. */
|
* accumulating the FST end node and output for each path.
|
||||||
public static<T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst) throws IOException {
|
*/
|
||||||
|
public static <T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst)
|
||||||
|
throws IOException {
|
||||||
|
assert a.isDeterministic();
|
||||||
final List<Path<T>> queue = new ArrayList<Path<T>>();
|
final List<Path<T>> queue = new ArrayList<Path<T>>();
|
||||||
final List<Path<T>> endNodes = new ArrayList<Path<T>>();
|
final List<Path<T>> endNodes = new ArrayList<Path<T>>();
|
||||||
|
queue.add(new Path<T>(a.getInitialState(), fst
|
||||||
queue.add(new Path<T>(a.getInitialState(),
|
.getFirstArc(new FST.Arc<T>()), fst.outputs.getNoOutput(),
|
||||||
fst.getFirstArc(new FST.Arc<T>()),
|
|
||||||
fst.outputs.getNoOutput(),
|
|
||||||
new IntsRef()));
|
new IntsRef()));
|
||||||
|
|
||||||
final FST.Arc<T> scratchArc = new FST.Arc<T>();
|
final FST.Arc<T> scratchArc = new FST.Arc<T>();
|
||||||
final FST.BytesReader fstReader = fst.getBytesReader(0);
|
final FST.BytesReader fstReader = fst.getBytesReader(0);
|
||||||
|
|
||||||
//System.out.println("fst/a intersect");
|
|
||||||
|
|
||||||
while (queue.size() != 0) {
|
while (queue.size() != 0) {
|
||||||
final Path<T> path = queue.remove(queue.size() - 1);
|
final Path<T> path = queue.remove(queue.size() - 1);
|
||||||
//System.out.println(" cycle path=" + path);
|
|
||||||
if (path.state.isAccept()) {
|
if (path.state.isAccept()) {
|
||||||
endNodes.add(path);
|
endNodes.add(path);
|
||||||
|
// we can stop here if we accept this path,
|
||||||
|
// we accept all further paths too
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
IntsRef currentInput = path.input;
|
IntsRef currentInput = path.input;
|
||||||
for (Transition t : path.state.getTransitions()) {
|
for (Transition t : path.state.getTransitions()) {
|
||||||
|
final int min = t.getMin();
|
||||||
// TODO: we can fix this if necessary:
|
final int max = t.getMax();
|
||||||
if (t.getMin() != t.getMax()) {
|
if (min == max) {
|
||||||
throw new IllegalStateException("can only handle Transitions that match one character");
|
final FST.Arc<T> nextArc = fst.findTargetArc(t.getMin(),
|
||||||
}
|
path.fstNode, scratchArc, fstReader);
|
||||||
|
|
||||||
//System.out.println(" t=" + (char) t.getMin());
|
|
||||||
|
|
||||||
final FST.Arc<T> nextArc = fst.findTargetArc(t.getMin(), path.fstNode, scratchArc, fstReader);
|
|
||||||
if (nextArc != null) {
|
if (nextArc != null) {
|
||||||
//System.out.println(" fst matches");
|
final IntsRef newInput = new IntsRef(currentInput.length + 1);
|
||||||
// Path continues:
|
|
||||||
IntsRef newInput = new IntsRef(currentInput.length + 1);
|
|
||||||
newInput.copyInts(currentInput);
|
newInput.copyInts(currentInput);
|
||||||
newInput.ints[currentInput.length] = t.getMin();
|
newInput.ints[currentInput.length] = t.getMin();
|
||||||
newInput.length = currentInput.length + 1;
|
newInput.length = currentInput.length + 1;
|
||||||
|
queue.add(new Path<T>(t.getDest(), new FST.Arc<T>()
|
||||||
queue.add(new Path<T>(t.getDest(),
|
.copyFrom(nextArc), fst.outputs
|
||||||
new FST.Arc<T>().copyFrom(nextArc),
|
.add(path.output, nextArc.output), newInput));
|
||||||
fst.outputs.add(path.output, nextArc.output),
|
}
|
||||||
newInput));
|
} else {
|
||||||
|
// TODO: if this transition's TO state is accepting, and
|
||||||
|
// it accepts the entire range possible in the FST (ie. 0 to 255),
|
||||||
|
// we can simply use the prefix as the accepted state instead of
|
||||||
|
// looking up all the ranges and terminate early
|
||||||
|
// here. This just shifts the work from one queue
|
||||||
|
// (this one) to another (the completion search
|
||||||
|
// done in AnalyzingSuggester).
|
||||||
|
FST.Arc<T> nextArc = Util.readCeilArc(min, fst, path.fstNode,
|
||||||
|
scratchArc, fstReader);
|
||||||
|
while (nextArc != null && nextArc.label <= max) {
|
||||||
|
assert nextArc.label <= max;
|
||||||
|
assert nextArc.label >= min : nextArc.label + " "
|
||||||
|
+ min;
|
||||||
|
final IntsRef newInput = new IntsRef(currentInput.length + 1);
|
||||||
|
newInput.copyInts(currentInput);
|
||||||
|
newInput.ints[currentInput.length] = nextArc.label;
|
||||||
|
newInput.length = currentInput.length + 1;
|
||||||
|
queue.add(new Path<T>(t.getDest(), new FST.Arc<T>()
|
||||||
|
.copyFrom(nextArc), fst.outputs
|
||||||
|
.add(path.output, nextArc.output), newInput));
|
||||||
|
final int label = nextArc.label; // used in assert
|
||||||
|
nextArc = nextArc.isLast() ? null : fst.readNextRealArc(nextArc,
|
||||||
|
fstReader);
|
||||||
|
assert nextArc == null || label < nextArc.label : "last: " + label
|
||||||
|
+ " next: " + nextArc.label;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return endNodes;
|
return endNodes;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,226 @@
|
||||||
|
package org.apache.lucene.search.suggest.analyzing;
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
import java.io.FileOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.OutputStreamWriter;
|
||||||
|
import java.io.Writer;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadocs
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.IntsRef;
|
||||||
|
import org.apache.lucene.util.automaton.Automaton;
|
||||||
|
import org.apache.lucene.util.automaton.BasicAutomata;
|
||||||
|
import org.apache.lucene.util.automaton.BasicOperations;
|
||||||
|
import org.apache.lucene.util.automaton.LevenshteinAutomata;
|
||||||
|
import org.apache.lucene.util.automaton.SpecialOperations;
|
||||||
|
import org.apache.lucene.util.fst.FST;
|
||||||
|
import org.apache.lucene.util.fst.PairOutputs.Pair;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Implements a fuzzy {@link AnalyzingSuggester}. The similarity measurement is
|
||||||
|
* based on the Damerau-Levenshtein (optimal string alignment) algorithm, though
|
||||||
|
* you can explicitly choose classic Levenshtein by passing <code>false</code>
|
||||||
|
* for the <code>transpositions</code> parameter.
|
||||||
|
* <p>
|
||||||
|
* At most, this suggester will match completions up to
|
||||||
|
* {@value org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}
|
||||||
|
* edits. Higher distances are not supported. Note that the
|
||||||
|
* fuzzy distance is measured in "byte space" on the bytes
|
||||||
|
* returned by the {@link TokenStream}'s {@link
|
||||||
|
* TermToBytesRefAttribute}, usually UTF8. By default
|
||||||
|
* the analyzed bytes must be at least 3 {@link
|
||||||
|
* #DEFAULT_MIN_FUZZY_LENGTH} bytes before any edits are
|
||||||
|
* considered. Furthermore, the first 1 {@link
|
||||||
|
* #DEFAULT_NON_FUZZY_PREFIX} byte is not allowed to be
|
||||||
|
* edited. We allow up to 1 {@link
|
||||||
|
* #DEFAULT_MAX_EDITS} edit.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* NOTE: This suggester does not boost suggestions that
|
||||||
|
* required no edits over suggestions that did require
|
||||||
|
* edits. This is a known limitation.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* Note: complex query analyzers can have a significant impact on the lookup
|
||||||
|
* performance. It's recommended to not use analyzers that drop or inject terms
|
||||||
|
* like synonyms to keep the complexity of the prefix intersection low for good
|
||||||
|
* lookup performance. At index time, complex analyzers can safely be used.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public final class FuzzySuggester extends AnalyzingSuggester {
|
||||||
|
private final int maxEdits;
|
||||||
|
private final boolean transpositions;
|
||||||
|
private final int nonFuzzyPrefix;
|
||||||
|
private final int minFuzzyLength;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The default minimum length of the key passed to {@link
|
||||||
|
* #lookup} before any edits are allowed.
|
||||||
|
*/
|
||||||
|
public static final int DEFAULT_MIN_FUZZY_LENGTH = 3;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The default prefix length where edits are not allowed.
|
||||||
|
*/
|
||||||
|
public static final int DEFAULT_NON_FUZZY_PREFIX = 1;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The default maximum number of edits for fuzzy
|
||||||
|
* suggestions.
|
||||||
|
*/
|
||||||
|
public static final int DEFAULT_MAX_EDITS = 1;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a {@link FuzzySuggester} instance initialized with default values.
|
||||||
|
*
|
||||||
|
* @param analyzer the analyzer used for this suggester
|
||||||
|
*/
|
||||||
|
public FuzzySuggester(Analyzer analyzer) {
|
||||||
|
this(analyzer, analyzer);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a {@link FuzzySuggester} instance with an index & a query analyzer initialized with default values.
|
||||||
|
*
|
||||||
|
* @param indexAnalyzer
|
||||||
|
* Analyzer that will be used for analyzing suggestions while building the index.
|
||||||
|
* @param queryAnalyzer
|
||||||
|
* Analyzer that will be used for analyzing query text during lookup
|
||||||
|
*/
|
||||||
|
public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
|
||||||
|
this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, true,
|
||||||
|
DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a {@link FuzzySuggester} instance.
|
||||||
|
*
|
||||||
|
* @param indexAnalyzer Analyzer that will be used for
|
||||||
|
* analyzing suggestions while building the index.
|
||||||
|
* @param queryAnalyzer Analyzer that will be used for
|
||||||
|
* analyzing query text during lookup
|
||||||
|
* @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
|
||||||
|
* @param maxSurfaceFormsPerAnalyzedForm Maximum number of
|
||||||
|
* surface forms to keep for a single analyzed form.
|
||||||
|
* When there are too many surface forms we discard the
|
||||||
|
* lowest weighted ones.
|
||||||
|
* @param maxGraphExpansions Maximum number of graph paths
|
||||||
|
* to expand from the analyzed form. Set this to -1 for
|
||||||
|
* no limit.
|
||||||
|
* @param maxEdits must be >= 0 and <= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} .
|
||||||
|
* @param transpositions <code>true</code> if transpositions should be treated as a primitive
|
||||||
|
* edit operation. If this is false, comparisons will implement the classic
|
||||||
|
* Levenshtein algorithm.
|
||||||
|
* @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX}
|
||||||
|
* @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
|
||||||
|
*/
|
||||||
|
public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer,
|
||||||
|
int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
|
||||||
|
int maxEdits, boolean transpositions, int nonFuzzyPrefix,
|
||||||
|
int minFuzzyLength) {
|
||||||
|
super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);
|
||||||
|
if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
|
||||||
|
throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
|
||||||
|
}
|
||||||
|
if (nonFuzzyPrefix < 0) {
|
||||||
|
throw new IllegalArgumentException("nonFuzzyPrefix must not be >= 0 (got " + nonFuzzyPrefix + ")");
|
||||||
|
}
|
||||||
|
if (minFuzzyLength < 0) {
|
||||||
|
throw new IllegalArgumentException("minFuzzyLength must not be >= 0 (got " + minFuzzyLength + ")");
|
||||||
|
}
|
||||||
|
|
||||||
|
this.maxEdits = maxEdits;
|
||||||
|
this.transpositions = transpositions;
|
||||||
|
this.nonFuzzyPrefix = nonFuzzyPrefix;
|
||||||
|
this.minFuzzyLength = minFuzzyLength;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
|
||||||
|
Automaton lookupAutomaton,
|
||||||
|
FST<Pair<Long,BytesRef>> fst)
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
|
// TODO: right now there's no penalty for fuzzy/edits,
|
||||||
|
// ie a completion whose prefix matched exactly what the
|
||||||
|
// user typed gets no boost over completions that
|
||||||
|
// required an edit, which get no boost over completions
|
||||||
|
// requiring two edits. I suspect a multiplicative
|
||||||
|
// factor is appropriate (eg, say a fuzzy match must be at
|
||||||
|
// least 2X better weight than the non-fuzzy match to
|
||||||
|
// "compete") ... in which case I think the wFST needs
|
||||||
|
// to be log weights or something ...
|
||||||
|
|
||||||
|
Automaton levA = toLevenshteinAutomata(lookupAutomaton);
|
||||||
|
/*
|
||||||
|
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
|
||||||
|
w.write(levA.toDot());
|
||||||
|
w.close();
|
||||||
|
System.out.println("Wrote LevA to out.dot");
|
||||||
|
*/
|
||||||
|
return FSTUtil.intersectPrefixPaths(levA, fst);
|
||||||
|
}
|
||||||
|
|
||||||
|
Automaton toLevenshteinAutomata(Automaton automaton) {
|
||||||
|
final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
|
||||||
|
Automaton subs[] = new Automaton[ref.size()];
|
||||||
|
int upto = 0;
|
||||||
|
for (IntsRef path : ref) {
|
||||||
|
if (path.length <= nonFuzzyPrefix || path.length < minFuzzyLength) {
|
||||||
|
subs[upto] = BasicAutomata.makeString(path.ints, path.offset, path.length);
|
||||||
|
upto++;
|
||||||
|
} else {
|
||||||
|
Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, nonFuzzyPrefix);
|
||||||
|
int ints[] = new int[path.length-nonFuzzyPrefix];
|
||||||
|
System.arraycopy(path.ints, path.offset+nonFuzzyPrefix, ints, 0, ints.length);
|
||||||
|
// TODO: maybe add alphaMin to LevenshteinAutomata,
|
||||||
|
// and pass 1 instead of 0? We probably don't want
|
||||||
|
// to allow the trailing dedup bytes to be
|
||||||
|
// edited... but then 0 byte is "in general" allowed
|
||||||
|
// on input (but not in UTF8).
|
||||||
|
LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);
|
||||||
|
Automaton levAutomaton = lev.toAutomaton(maxEdits);
|
||||||
|
Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
|
||||||
|
combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
|
||||||
|
subs[upto] = combined;
|
||||||
|
upto++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (subs.length == 0) {
|
||||||
|
return BasicAutomata.makeEmpty(); // matches nothing
|
||||||
|
} else if (subs.length == 1) {
|
||||||
|
return subs[0];
|
||||||
|
} else {
|
||||||
|
Automaton a = BasicOperations.union(Arrays.asList(subs));
|
||||||
|
// TODO: we could call toLevenshteinAutomata() before det?
|
||||||
|
// this only happens if you have multiple paths anyway (e.g. synonyms)
|
||||||
|
BasicOperations.determinize(a);
|
||||||
|
|
||||||
|
// Does not seem to help (and hurt maybe a bit: 6-9
|
||||||
|
// prefix went from 19 to 18 kQPS):
|
||||||
|
// a.reduce();
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -36,6 +36,7 @@ import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
import org.apache.lucene.analysis.MockTokenizer;
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
import org.apache.lucene.search.suggest.Lookup; // javadocs
|
import org.apache.lucene.search.suggest.Lookup; // javadocs
|
||||||
import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;
|
import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;
|
||||||
|
import org.apache.lucene.search.suggest.analyzing.FuzzySuggester;
|
||||||
import org.apache.lucene.search.suggest.fst.FSTCompletionLookup;
|
import org.apache.lucene.search.suggest.fst.FSTCompletionLookup;
|
||||||
import org.apache.lucene.search.suggest.fst.WFSTCompletionLookup;
|
import org.apache.lucene.search.suggest.fst.WFSTCompletionLookup;
|
||||||
import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
|
import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
|
||||||
|
@ -51,17 +52,20 @@ import org.junit.Ignore;
|
||||||
public class LookupBenchmarkTest extends LuceneTestCase {
|
public class LookupBenchmarkTest extends LuceneTestCase {
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
private final List<Class<? extends Lookup>> benchmarkClasses = Arrays.asList(
|
private final List<Class<? extends Lookup>> benchmarkClasses = Arrays.asList(
|
||||||
|
FuzzySuggester.class,
|
||||||
|
AnalyzingSuggester.class,
|
||||||
JaspellLookup.class,
|
JaspellLookup.class,
|
||||||
TSTLookup.class,
|
TSTLookup.class,
|
||||||
FSTCompletionLookup.class,
|
FSTCompletionLookup.class,
|
||||||
WFSTCompletionLookup.class,
|
WFSTCompletionLookup.class
|
||||||
AnalyzingSuggester.class);
|
|
||||||
|
);
|
||||||
|
|
||||||
private final static int rounds = 15;
|
private final static int rounds = 15;
|
||||||
private final static int warmup = 5;
|
private final static int warmup = 5;
|
||||||
|
|
||||||
private final int num = 7;
|
private final int num = 7;
|
||||||
private final boolean onlyMorePopular = true;
|
private final boolean onlyMorePopular = false;
|
||||||
|
|
||||||
private final static Random random = new Random(0xdeadbeef);
|
private final static Random random = new Random(0xdeadbeef);
|
||||||
|
|
||||||
|
@ -212,8 +216,9 @@ public class LookupBenchmarkTest extends LuceneTestCase {
|
||||||
final List<String> input = new ArrayList<String>(benchmarkInput.size());
|
final List<String> input = new ArrayList<String>(benchmarkInput.size());
|
||||||
for (TermFreq tf : benchmarkInput) {
|
for (TermFreq tf : benchmarkInput) {
|
||||||
String s = tf.term.utf8ToString();
|
String s = tf.term.utf8ToString();
|
||||||
input.add(s.substring(0, Math.min(s.length(),
|
String sub = s.substring(0, Math.min(s.length(),
|
||||||
minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1))));
|
minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1)));
|
||||||
|
input.add(sub);
|
||||||
}
|
}
|
||||||
|
|
||||||
BenchmarkResult result = measure(new Callable<Integer>() {
|
BenchmarkResult result = measure(new Callable<Integer>() {
|
||||||
|
@ -250,7 +255,9 @@ public class LookupBenchmarkTest extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
return new BenchmarkResult(times, warmup, rounds);
|
return new BenchmarkResult(times, warmup, rounds);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
e.printStackTrace();
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue