mirror of https://github.com/apache/lucene.git
LUCENE-3846: simplify overriding required for FuzzySuggester
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3846@1401372 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
95b99f48d2
commit
2f136ff27c
|
@ -501,7 +501,7 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
|
|
||||||
// Intersect automaton w/ suggest wFST and get all
|
// Intersect automaton w/ suggest wFST and get all
|
||||||
// prefix starting nodes & their outputs:
|
// prefix starting nodes & their outputs:
|
||||||
final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
|
//final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
|
||||||
|
|
||||||
//System.out.println(" prefixPaths: " + prefixPaths.size());
|
//System.out.println(" prefixPaths: " + prefixPaths.size());
|
||||||
|
|
||||||
|
@ -511,8 +511,9 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
|
|
||||||
final List<LookupResult> results = new ArrayList<LookupResult>();
|
final List<LookupResult> results = new ArrayList<LookupResult>();
|
||||||
|
|
||||||
|
List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(lookupAutomaton, fst);
|
||||||
|
|
||||||
if (exactFirst) {
|
if (exactFirst) {
|
||||||
final List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = intersector.intersectExact();
|
|
||||||
|
|
||||||
int count = 0;
|
int count = 0;
|
||||||
for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
|
for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
|
||||||
|
@ -604,7 +605,9 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
final List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = intersector.intersectAll();
|
|
||||||
|
prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
|
||||||
|
|
||||||
for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
|
for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
|
||||||
searcher.addStartPaths(path.fstNode, path.output, true, path.input);
|
searcher.addStartPaths(path.fstNode, path.output, true, path.input);
|
||||||
}
|
}
|
||||||
|
@ -615,6 +618,10 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
spare.grow(completion.output.output2.length);
|
spare.grow(completion.output.output2.length);
|
||||||
UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
|
UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
|
||||||
LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1));
|
LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1));
|
||||||
|
|
||||||
|
// nocommit for fuzzy case would be nice to return
|
||||||
|
// how many edits were required...:
|
||||||
|
|
||||||
//System.out.println(" result=" + result);
|
//System.out.println(" result=" + result);
|
||||||
results.add(result);
|
results.add(result);
|
||||||
|
|
||||||
|
@ -631,6 +638,13 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
|
||||||
|
Automaton lookupAutomaton,
|
||||||
|
FST<Pair<Long,BytesRef>> fst)
|
||||||
|
throws IOException {
|
||||||
|
return prefixPaths;
|
||||||
|
}
|
||||||
|
|
||||||
final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
|
final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
|
||||||
// Analyze surface form:
|
// Analyze surface form:
|
||||||
TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString()));
|
TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString()));
|
||||||
|
@ -706,46 +720,4 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
return left.output1.compareTo(right.output1);
|
return left.output1.compareTo(right.output1);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns a new {@link PathIntersector}.
|
|
||||||
*
|
|
||||||
* <p>NOTE: The labels on the transitions incoming
|
|
||||||
* automaton are bytes returned by the {@link
|
|
||||||
* TokenStream}'s {@link TermToBytesRefAttribute}, which
|
|
||||||
* are typically UTF8 encoded.
|
|
||||||
*/
|
|
||||||
protected PathIntersector getPathIntersector(Automaton automaton, FST<Pair<Long,BytesRef>> fst) {
|
|
||||||
return new PathIntersector(automaton, fst);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This class is used to obtain the prefix paths in the automaton that also intersect the FST.
|
|
||||||
*/
|
|
||||||
protected static class PathIntersector {
|
|
||||||
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> intersect;
|
|
||||||
protected final Automaton automaton;
|
|
||||||
protected final FST<Pair<Long,BytesRef>> fst;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates a new {@link PathIntersector}
|
|
||||||
*/
|
|
||||||
public PathIntersector(Automaton automaton, FST<Pair<Long,BytesRef>> fst) {
|
|
||||||
this.automaton = automaton;
|
|
||||||
this.fst = fst;
|
|
||||||
}
|
|
||||||
/**
|
|
||||||
* Returns the prefix paths for exact first top N search.
|
|
||||||
*/
|
|
||||||
public List<FSTUtil.Path<Pair<Long,BytesRef>>> intersectExact() throws IOException {
|
|
||||||
return intersect = FSTUtil.intersectPrefixPaths(automaton, fst);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the prefix paths for top N search.
|
|
||||||
*/
|
|
||||||
public List<FSTUtil.Path<Pair<Long,BytesRef>>> intersectAll() throws IOException {
|
|
||||||
return intersect == null ? intersect = FSTUtil.intersectPrefixPaths(automaton, fst) : intersect;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,7 +26,6 @@ import java.util.Set;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadocs
|
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadocs
|
||||||
import org.apache.lucene.search.suggest.analyzing.FSTUtil.Path;
|
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
import org.apache.lucene.util.automaton.Automaton;
|
import org.apache.lucene.util.automaton.Automaton;
|
||||||
|
@ -146,9 +145,24 @@ public final class FuzzySuggester extends AnalyzingSuggester {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected PathIntersector getPathIntersector(Automaton automaton,
|
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
|
||||||
FST<Pair<Long,BytesRef>> fst) {
|
Automaton lookupAutomaton,
|
||||||
return new FuzzyPathIntersector(automaton, fst);
|
FST<Pair<Long,BytesRef>> fst)
|
||||||
|
throws IOException {
|
||||||
|
// nocommit we don't "penalize" for edits
|
||||||
|
// ... shouldn't we? ie, ed=0 completions should have
|
||||||
|
// higher rank than ed=1, at the same "weight"? maybe
|
||||||
|
// we can punt on this for starters ... or maybe we
|
||||||
|
// can re-run each prefix path through lev0, lev1,
|
||||||
|
// lev2 to figure out the number of edits?
|
||||||
|
Automaton levA = toLevenshteinAutomata(lookupAutomaton);
|
||||||
|
/*
|
||||||
|
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
|
||||||
|
w.write(levA.toDot());
|
||||||
|
w.close();
|
||||||
|
System.out.println("Wrote LevA to out.dot");
|
||||||
|
*/
|
||||||
|
return FSTUtil.intersectPrefixPaths(levA, fst);
|
||||||
}
|
}
|
||||||
|
|
||||||
Automaton toLevenshteinAutomata(Automaton automaton) {
|
Automaton toLevenshteinAutomata(Automaton automaton) {
|
||||||
|
@ -195,30 +209,4 @@ public final class FuzzySuggester extends AnalyzingSuggester {
|
||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private final class FuzzyPathIntersector extends PathIntersector {
|
|
||||||
|
|
||||||
public FuzzyPathIntersector(Automaton automaton,
|
|
||||||
FST<Pair<Long,BytesRef>> fst) {
|
|
||||||
super(automaton, fst);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public List<Path<Pair<Long,BytesRef>>> intersectAll() throws IOException {
|
|
||||||
// nocommit we don't "penalize" for edits
|
|
||||||
// ... shouldn't we? ie, ed=0 completions should have
|
|
||||||
// higher rank than ed=1, at the same "weight"? maybe
|
|
||||||
// we can punt on this for starters ... or maybe we
|
|
||||||
// can re-run each prefix path through lev0, lev1,
|
|
||||||
// lev2 to figure out the number of edits?
|
|
||||||
Automaton levA = toLevenshteinAutomata(automaton);
|
|
||||||
/*
|
|
||||||
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
|
|
||||||
w.write(levA.toDot());
|
|
||||||
w.close();
|
|
||||||
System.out.println("Wrote LevA to out.dot");
|
|
||||||
*/
|
|
||||||
return FSTUtil.intersectPrefixPaths(levA, fst);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue