LUCENE-3846: simplify overriding required for FuzzySuggester

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3846@1401372 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-10-23 18:11:32 +00:00
parent 95b99f48d2
commit 2f136ff27c
2 changed files with 35 additions and 75 deletions

View File

@ -501,7 +501,7 @@ public class AnalyzingSuggester extends Lookup {
// Intersect automaton w/ suggest wFST and get all // Intersect automaton w/ suggest wFST and get all
// prefix starting nodes & their outputs: // prefix starting nodes & their outputs:
final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst); //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
//System.out.println(" prefixPaths: " + prefixPaths.size()); //System.out.println(" prefixPaths: " + prefixPaths.size());
@ -511,8 +511,9 @@ public class AnalyzingSuggester extends Lookup {
final List<LookupResult> results = new ArrayList<LookupResult>(); final List<LookupResult> results = new ArrayList<LookupResult>();
List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(lookupAutomaton, fst);
if (exactFirst) { if (exactFirst) {
final List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = intersector.intersectExact();
int count = 0; int count = 0;
for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) { for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
@ -604,7 +605,9 @@ public class AnalyzingSuggester extends Lookup {
} }
} }
}; };
final List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = intersector.intersectAll();
prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) { for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
searcher.addStartPaths(path.fstNode, path.output, true, path.input); searcher.addStartPaths(path.fstNode, path.output, true, path.input);
} }
@ -615,6 +618,10 @@ public class AnalyzingSuggester extends Lookup {
spare.grow(completion.output.output2.length); spare.grow(completion.output.output2.length);
UnicodeUtil.UTF8toUTF16(completion.output.output2, spare); UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1)); LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1));
// nocommit for fuzzy case would be nice to return
// how many edits were required...:
//System.out.println(" result=" + result); //System.out.println(" result=" + result);
results.add(result); results.add(result);
@ -631,6 +638,13 @@ public class AnalyzingSuggester extends Lookup {
} }
} }
/**
 * Extension hook that decides which prefix paths seed the top-N search.
 *
 * <p>This default implementation returns {@code prefixPaths} unchanged; a
 * subclass may override it to return a different set of paths.
 *
 * @param prefixPaths paths obtained by intersecting {@code lookupAutomaton} with {@code fst}
 * @param lookupAutomaton the automaton that was intersected with {@code fst}
 * @param fst the suggest FST
 * @return the paths whose FST nodes are added to the searcher as start paths
 * @throws IOException declared so overriding implementations may throw it; this
 *         default implementation does not
 */
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
Automaton lookupAutomaton,
FST<Pair<Long,BytesRef>> fst)
throws IOException {
return prefixPaths;
}
final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException { final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
// Analyze surface form: // Analyze surface form:
TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString())); TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString()));
@ -706,46 +720,4 @@ public class AnalyzingSuggester extends Lookup {
return left.output1.compareTo(right.output1); return left.output1.compareTo(right.output1);
} }
}; };
/**
 * Returns a new {@link PathIntersector} built from the given automaton and FST.
 *
 * <p>NOTE: The labels on the transitions of the incoming
 * automaton are bytes returned by the {@link
 * TokenStream}'s {@link TermToBytesRefAttribute}, which
 * are typically UTF8 encoded.
 *
 * @param automaton the automaton handed to the new intersector
 * @param fst the FST handed to the new intersector
 * @return a freshly constructed {@link PathIntersector}
 */
protected PathIntersector getPathIntersector(Automaton automaton, FST<Pair<Long,BytesRef>> fst) {
return new PathIntersector(automaton, fst);
}
/**
 * Computes the prefix paths of an automaton that are also present in an FST,
 * remembering the most recently computed result.
 */
protected static class PathIntersector {
  /** Last computed intersection; {@code null} until one of the intersect methods has run. */
  protected List<FSTUtil.Path<Pair<Long,BytesRef>>> intersect;
  /** Automaton whose prefix paths are looked up in {@link #fst}. */
  protected final Automaton automaton;
  /** FST the automaton is intersected with. */
  protected final FST<Pair<Long,BytesRef>> fst;

  /**
   * Creates a new {@link PathIntersector} over the given automaton and FST.
   */
  public PathIntersector(Automaton automaton, FST<Pair<Long,BytesRef>> fst) {
    this.automaton = automaton;
    this.fst = fst;
  }

  /**
   * Returns the prefix paths for exact first top N search. The intersection is
   * always recomputed, and the result replaces any previously stored one.
   */
  public List<FSTUtil.Path<Pair<Long,BytesRef>>> intersectExact() throws IOException {
    intersect = FSTUtil.intersectPrefixPaths(automaton, fst);
    return intersect;
  }

  /**
   * Returns the prefix paths for top N search, reusing the stored result when
   * one is already available and computing it otherwise.
   */
  public List<FSTUtil.Path<Pair<Long,BytesRef>>> intersectAll() throws IOException {
    if (intersect == null) {
      intersect = FSTUtil.intersectPrefixPaths(automaton, fst);
    }
    return intersect;
  }
}
} }

View File

@ -26,7 +26,6 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadocs import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadocs
import org.apache.lucene.search.suggest.analyzing.FSTUtil.Path;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.Automaton;
@ -146,9 +145,24 @@ public final class FuzzySuggester extends AnalyzingSuggester {
} }
@Override @Override
protected PathIntersector getPathIntersector(Automaton automaton, protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
FST<Pair<Long,BytesRef>> fst) { Automaton lookupAutomaton,
return new FuzzyPathIntersector(automaton, fst); FST<Pair<Long,BytesRef>> fst)
throws IOException {
// nocommit we don't "penalize" for edits
// ... shouldn't we? ie, ed=0 completions should have
// higher rank than ed=1, at the same "weight"? maybe
// we can punt on this for starters ... or maybe we
// can re-run each prefix path through lev0, lev1,
// lev2 to figure out the number of edits?
Automaton levA = toLevenshteinAutomata(lookupAutomaton);
/*
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
w.write(levA.toDot());
w.close();
System.out.println("Wrote LevA to out.dot");
*/
return FSTUtil.intersectPrefixPaths(levA, fst);
} }
Automaton toLevenshteinAutomata(Automaton automaton) { Automaton toLevenshteinAutomata(Automaton automaton) {
@ -195,30 +209,4 @@ public final class FuzzySuggester extends AnalyzingSuggester {
return a; return a;
} }
} }
/**
 * {@link PathIntersector} that overrides only {@link #intersectAll()}: the
 * automaton is first passed through {@code toLevenshteinAutomata} and the
 * resulting automaton is intersected with the FST.
 */
private final class FuzzyPathIntersector extends PathIntersector {
/**
 * Creates an intersector over the given automaton and FST; both are simply
 * forwarded to the superclass constructor.
 */
public FuzzyPathIntersector(Automaton automaton,
FST<Pair<Long,BytesRef>> fst) {
super(automaton, fst);
}
/**
 * Returns the prefix paths of the Levenshtein automaton derived from
 * {@code automaton} that are also present in {@code fst}. The result is
 * computed on every call and is not stored in the inherited
 * {@code intersect} field.
 */
@Override
public List<Path<Pair<Long,BytesRef>>> intersectAll() throws IOException {
// nocommit we don't "penalize" for edits
// ... shouldn't we? ie, ed=0 completions should have
// higher rank than ed=1, at the same "weight"? maybe
// we can punt on this for starters ... or maybe we
// can re-run each prefix path through lev0, lev1,
// lev2 to figure out the number of edits?
Automaton levA = toLevenshteinAutomata(automaton);
/*
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
w.write(levA.toDot());
w.close();
System.out.println("Wrote LevA to out.dot");
*/
return FSTUtil.intersectPrefixPaths(levA, fst);
}
}
} }