LUCENE-3846: simplify overriding required for FuzzySuggester

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3846@1401372 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-10-23 18:11:32 +00:00
parent 95b99f48d2
commit 2f136ff27c
2 changed files with 35 additions and 75 deletions

View File

@ -501,7 +501,7 @@ public class AnalyzingSuggester extends Lookup {
// Intersect automaton w/ suggest wFST and get all
// prefix starting nodes & their outputs:
final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
//final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
//System.out.println(" prefixPaths: " + prefixPaths.size());
@ -511,8 +511,9 @@ public class AnalyzingSuggester extends Lookup {
final List<LookupResult> results = new ArrayList<LookupResult>();
List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(lookupAutomaton, fst);
if (exactFirst) {
final List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = intersector.intersectExact();
int count = 0;
for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
@ -604,7 +605,9 @@ public class AnalyzingSuggester extends Lookup {
}
}
};
final List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = intersector.intersectAll();
prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
searcher.addStartPaths(path.fstNode, path.output, true, path.input);
}
@ -615,6 +618,10 @@ public class AnalyzingSuggester extends Lookup {
spare.grow(completion.output.output2.length);
UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1));
// nocommit for fuzzy case would be nice to return
// how many edits were required...:
//System.out.println(" result=" + result);
results.add(result);
@ -631,6 +638,13 @@ public class AnalyzingSuggester extends Lookup {
}
}
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
Automaton lookupAutomaton,
FST<Pair<Long,BytesRef>> fst)
throws IOException {
return prefixPaths;
}
final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
// Analyze surface form:
TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString()));
@ -706,46 +720,4 @@ public class AnalyzingSuggester extends Lookup {
return left.output1.compareTo(right.output1);
}
};
/**
* Returns a new {@link PathIntersector}.
*
* <p>NOTE: The labels on the transitions incoming
* automaton are bytes returned by the {@link
* TokenStream}'s {@link TermToBytesRefAttribute}, which
* are typically UTF8 encoded.
*/
protected PathIntersector getPathIntersector(Automaton automaton, FST<Pair<Long,BytesRef>> fst) {
return new PathIntersector(automaton, fst);
}
/**
* This class is used to obtain the prefix paths in the automaton that also intersect the FST.
*/
protected static class PathIntersector {
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> intersect;
protected final Automaton automaton;
protected final FST<Pair<Long,BytesRef>> fst;
/**
* Creates a new {@link PathIntersector}
*/
public PathIntersector(Automaton automaton, FST<Pair<Long,BytesRef>> fst) {
this.automaton = automaton;
this.fst = fst;
}
/**
* Returns the prefix paths for exact first top N search.
*/
public List<FSTUtil.Path<Pair<Long,BytesRef>>> intersectExact() throws IOException {
return intersect = FSTUtil.intersectPrefixPaths(automaton, fst);
}
/**
* Returns the prefix paths for top N search.
*/
public List<FSTUtil.Path<Pair<Long,BytesRef>>> intersectAll() throws IOException {
return intersect == null ? intersect = FSTUtil.intersectPrefixPaths(automaton, fst) : intersect;
}
}
}

View File

@ -26,7 +26,6 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadocs
import org.apache.lucene.search.suggest.analyzing.FSTUtil.Path;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
@ -146,9 +145,24 @@ public final class FuzzySuggester extends AnalyzingSuggester {
}
@Override
protected PathIntersector getPathIntersector(Automaton automaton,
FST<Pair<Long,BytesRef>> fst) {
return new FuzzyPathIntersector(automaton, fst);
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
Automaton lookupAutomaton,
FST<Pair<Long,BytesRef>> fst)
throws IOException {
// nocommit we don't "penalize" for edits
// ... shouldn't we? ie, ed=0 completions should have
// higher rank than ed=1, at the same "weight"? maybe
// we can punt on this for starters ... or maybe we
// can re-run each prefix path through lev0, lev1,
// lev2 to figure out the number of edits?
Automaton levA = toLevenshteinAutomata(lookupAutomaton);
/*
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
w.write(levA.toDot());
w.close();
System.out.println("Wrote LevA to out.dot");
*/
return FSTUtil.intersectPrefixPaths(levA, fst);
}
Automaton toLevenshteinAutomata(Automaton automaton) {
@ -195,30 +209,4 @@ public final class FuzzySuggester extends AnalyzingSuggester {
return a;
}
}
private final class FuzzyPathIntersector extends PathIntersector {
public FuzzyPathIntersector(Automaton automaton,
FST<Pair<Long,BytesRef>> fst) {
super(automaton, fst);
}
@Override
public List<Path<Pair<Long,BytesRef>>> intersectAll() throws IOException {
// nocommit we don't "penalize" for edits
// ... shouldn't we? ie, ed=0 completions should have
// higher rank than ed=1, at the same "weight"? maybe
// we can punt on this for starters ... or maybe we
// can re-run each prefix path through lev0, lev1,
// lev2 to figure out the number of edits?
Automaton levA = toLevenshteinAutomata(automaton);
/*
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
w.write(levA.toDot());
w.close();
System.out.println("Wrote LevA to out.dot");
*/
return FSTUtil.intersectPrefixPaths(levA, fst);
}
}
}