mirror of https://github.com/apache/lucene.git
LUCENE-4024: FuzzyQuery should never do edit distance > 2
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1334819 13f79535-47bb-0310-9956-ffa450edef68
parent 4c151d54e0
commit 8f7c1da3ba
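In short, the commit replaces FuzzyQuery's float-valued minimumSimilarity with an integer maxEdits capped at LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE (2), so every fuzzy query can be answered by a Levenshtein automaton instead of falling back to a linear term scan. A minimal before/after sketch of the public API against a snapshot at this commit; the term and parameter values are illustrative:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;

public class FuzzyQueryApiChange {
  public static void main(String[] args) {
    Term term = new Term("field", "lucene");

    // Before this commit: a float similarity; values >= 1 were read as raw
    // edit distances, and distances > 2 silently used a slow linear scan.
    // FuzzyQuery old = new FuzzyQuery(term, 0.5f, 0);

    // After this commit: an integer edit distance, hard-capped at
    // LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE (2).
    FuzzyQuery q = new FuzzyQuery(term, 2, 0, 50, true);
    System.out.println(q); // field:lucene~2
  }
}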
FuzzyQuery.java

@@ -28,7 +28,7 @@ import org.apache.lucene.util.ToStringUtils;
 import org.apache.lucene.util.automaton.LevenshteinAutomata;
 
 /** Implements the fuzzy search query. The similarity measurement
- * is based on the Levenshtein (edit distance) algorithm.
+ * is based on the Damerau-Levenshtein (optimal string alignment) algorithm.
  *
  * <p>This query uses {@link MultiTermQuery.TopTermsScoringBooleanQueryRewrite}
  * as default. So terms will be collected and scored according to their
@@ -37,94 +37,81 @@ import org.apache.lucene.util.automaton.LevenshteinAutomata;
  */
 public class FuzzyQuery extends MultiTermQuery {
   
-  public final static float defaultMinSimilarity = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
+  public final static int defaultMaxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
   public final static int defaultPrefixLength = 0;
   public final static int defaultMaxExpansions = 50;
+  public final static boolean defaultTranspositions = true;
   
-  private float minimumSimilarity;
-  private int prefixLength;
-  private boolean termLongEnough = false;
-  protected Term term;
+  private final int maxEdits;
+  private final int maxExpansions;
+  private final boolean transpositions;
+  private final int prefixLength;
+  private final Term term;
   
   /**
-   * Create a new FuzzyQuery that will match terms with a similarity
-   * of at least <code>minimumSimilarity</code> to <code>term</code>.
+   * Create a new FuzzyQuery that will match terms with an edit distance
+   * of at most <code>maxEdits</code> to <code>term</code>.
    * If a <code>prefixLength</code> > 0 is specified, a common prefix
    * of that length is also required.
    * 
    * @param term the term to search for
-   * @param minimumSimilarity a value between 0 and 1 to set the required similarity
-   *  between the query term and the matching terms. For example, for a
-   *  <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length
-   *  as the query term is considered similar to the query term if the edit distance
-   *  between both terms is less than <code>length(term)*0.5</code>
-   *  <p>
-   *  Alternatively, if <code>minimumSimilarity</code> is >= 1f, it is interpreted
-   *  as a pure Levenshtein edit distance. For example, a value of <code>2f</code>
-   *  will match all terms within an edit distance of <code>2</code> from the
-   *  query term. Edit distances specified in this way may not be fractional.
-   *  
+   * @param maxEdits must be >= 0 and <= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}.
    * @param prefixLength length of common (non-fuzzy) prefix
    * @param maxExpansions the maximum number of terms to match. If this number is
    *  greater than {@link BooleanQuery#getMaxClauseCount} when the query is rewritten,
    *  then the maxClauseCount will be used instead.
-   * @throws IllegalArgumentException if minimumSimilarity is >= 1 or < 0
-   *  or if prefixLength < 0
+   * @param transpositions true if transpositions should be treated as a primitive
+   *        edit operation. If this is false, comparisons will implement the classic
+   *        Levenshtein algorithm.
    */
-  public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength,
-      int maxExpansions) {
+  public FuzzyQuery(Term term, int maxEdits, int prefixLength, int maxExpansions, boolean transpositions) {
     super(term.field());
-    this.term = term;
     
-    if (minimumSimilarity >= 1.0f && minimumSimilarity != (int)minimumSimilarity)
-      throw new IllegalArgumentException("fractional edit distances are not allowed");
-    if (minimumSimilarity < 0.0f)
-      throw new IllegalArgumentException("minimumSimilarity < 0");
-    if (prefixLength < 0)
-      throw new IllegalArgumentException("prefixLength < 0");
-    if (maxExpansions < 0)
-      throw new IllegalArgumentException("maxExpansions < 0");
-    
-    setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions));
-    
-    String text = term.text();
-    int len = text.codePointCount(0, text.length());
-    if (len > 0 && (minimumSimilarity >= 1f || len > 1.0f / (1.0f - minimumSimilarity))) {
-      this.termLongEnough = true;
-    }
+    if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
+      throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
+    }
+    if (prefixLength < 0) {
+      throw new IllegalArgumentException("prefixLength cannot be negative.");
+    }
+    if (maxExpansions < 0) {
+      throw new IllegalArgumentException("maxExpansions cannot be negative.");
+    }
     
-    this.minimumSimilarity = minimumSimilarity;
+    this.term = term;
+    this.maxEdits = maxEdits;
     this.prefixLength = prefixLength;
+    this.transpositions = transpositions;
+    this.maxExpansions = maxExpansions;
+    setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions));
   }
   
   /**
-   * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, prefixLength, defaultMaxExpansions)}.
+   * Calls {@link #FuzzyQuery(Term, int, int, int, boolean)
+   * FuzzyQuery(term, maxEdits, prefixLength, defaultMaxExpansions, defaultTranspositions)}.
    */
-  public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) {
-    this(term, minimumSimilarity, prefixLength, defaultMaxExpansions);
+  public FuzzyQuery(Term term, int maxEdits, int prefixLength) {
+    this(term, maxEdits, prefixLength, defaultMaxExpansions, defaultTranspositions);
   }
   
   /**
-   * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0, defaultMaxExpansions)}.
+   * Calls {@link #FuzzyQuery(Term, int, int) FuzzyQuery(term, maxEdits, defaultPrefixLength)}.
    */
-  public FuzzyQuery(Term term, float minimumSimilarity) {
-    this(term, minimumSimilarity, defaultPrefixLength, defaultMaxExpansions);
+  public FuzzyQuery(Term term, int maxEdits) {
+    this(term, maxEdits, defaultPrefixLength);
   }
   
   /**
-   * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, defaultMinSimilarity, 0, defaultMaxExpansions)}.
+   * Calls {@link #FuzzyQuery(Term, int) FuzzyQuery(term, defaultMaxEdits)}.
    */
   public FuzzyQuery(Term term) {
-    this(term, defaultMinSimilarity, defaultPrefixLength, defaultMaxExpansions);
+    this(term, defaultMaxEdits);
   }
   
   /**
-   * Returns the minimum similarity that is required for this query to match.
-   * @return float value between 0.0 and 1.0
+   * @return the maximum number of edits allowed for this query to match.
    */
-  public float getMinSimilarity() {
-    return minimumSimilarity;
+  public int getMaxEdits() {
+    return maxEdits;
   }
   
   /**
@@ -138,13 +125,10 @@ public class FuzzyQuery extends MultiTermQuery {
   
   @Override
   protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
-    if (!termLongEnough) {  // can only match if it's exact
+    if (maxEdits == 0 || prefixLength >= term.text().length()) {  // can only match if it's exact
       return new SingleTermsEnum(terms.iterator(null), term.bytes());
     }
-    // TODO: should we expose the transpositions option to this query?
-    // maybe move the old/slowish stuff (lev without transpositions, n > 2, etc) all to contrib,
-    // deprecate it, and just have a faster/simpler/better one in core?
-    return new FuzzyTermsEnum(terms, atts, getTerm(), minimumSimilarity, prefixLength, false);
+    return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits, prefixLength, transpositions);
   }
   
   /**
@@ -163,7 +147,7 @@ public class FuzzyQuery extends MultiTermQuery {
     }
     buffer.append(term.text());
     buffer.append('~');
-    buffer.append(Float.toString(minimumSimilarity));
+    buffer.append(Integer.toString(maxEdits));
     buffer.append(ToStringUtils.boost(getBoost()));
     return buffer.toString();
   }
@@ -172,8 +156,10 @@ public class FuzzyQuery extends MultiTermQuery {
   public int hashCode() {
     final int prime = 31;
     int result = super.hashCode();
-    result = prime * result + Float.floatToIntBits(minimumSimilarity);
+    result = prime * result + maxEdits;
     result = prime * result + prefixLength;
+    result = prime * result + maxExpansions;
+    result = prime * result + (transpositions ? 0 : 1);
     result = prime * result + ((term == null) ? 0 : term.hashCode());
     return result;
   }
@@ -187,11 +173,14 @@ public class FuzzyQuery extends MultiTermQuery {
     if (getClass() != obj.getClass())
       return false;
     FuzzyQuery other = (FuzzyQuery) obj;
-    if (Float.floatToIntBits(minimumSimilarity) != Float
-        .floatToIntBits(other.minimumSimilarity))
+    if (maxEdits != other.maxEdits)
       return false;
     if (prefixLength != other.prefixLength)
      return false;
+    if (maxExpansions != other.maxExpansions)
+      return false;
+    if (transpositions != other.transpositions)
+      return false;
     if (term == null) {
       if (other.term != null)
         return false;
@@ -199,6 +188,31 @@ public class FuzzyQuery extends MultiTermQuery {
       return false;
     return true;
   }
+  
+  /**
+   * @deprecated pass integer edit distances instead.
+   */
+  @Deprecated
+  public final static float defaultMinSimilarity = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
+  
+  /**
+   * Helper function to convert from deprecated "minimumSimilarity" fractions
+   * to raw edit distances.
+   * 
+   * @param minimumSimilarity scaled similarity
+   * @param termLen length (in unicode codepoints) of the term.
+   * @return equivalent number of maxEdits
+   * @deprecated pass integer edit distances instead.
+   */
+  @Deprecated
+  public static int floatToEdits(float minimumSimilarity, int termLen) {
+    if (minimumSimilarity > 1f) {
+      return (int) Math.min(minimumSimilarity, LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
+    } else if (minimumSimilarity == 0.0f) {
+      return 0; // 0 means exact, not infinite # of edits!
+    } else {
+      return Math.min((int) ((1D-minimumSimilarity) * termLen),
+        LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
+    }
+  }
 }
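For callers migrating from fractional similarities, the deprecated floatToEdits helper above reproduces the old thresholding and clamps the result to the automaton maximum. A small worked sketch; the demo class and values are illustrative:

import org.apache.lucene.search.FuzzyQuery;

public class FloatToEditsDemo {
  public static void main(String[] args) {
    // "segment" has 7 code points; the legacy 0.5f similarity allowed
    // edits < 7 * 0.5 = 3.5, i.e. up to 3 edits. The helper clamps that
    // to the automaton maximum of 2:
    System.out.println(FuzzyQuery.floatToEdits(0.5f, 7)); // 2

    // Values above 1 were already raw edit distances; they are clamped too:
    System.out.println(FuzzyQuery.floatToEdits(3f, 10));  // 2

    // 0 means exact match, not unlimited edits:
    System.out.println(FuzzyQuery.floatToEdits(0.0f, 7)); // 0
  }
}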
FuzzyTermsEnum.java

@@ -34,8 +34,6 @@ import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.IntsRef;
-import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.BasicAutomata;
@@ -51,7 +49,7 @@ import org.apache.lucene.util.automaton.LevenshteinAutomata;
  * {@link #getComparator}. Each term in the enumeration is
  * greater than all that precede it.</p>
  */
-public final class FuzzyTermsEnum extends TermsEnum {
+public class FuzzyTermsEnum extends TermsEnum {
   private TermsEnum actualEnum;
   private BoostAttribute actualBoostAtt;
   
@@ -67,18 +65,18 @@ public final class FuzzyTermsEnum extends TermsEnum {
   // TODO: chicken-and-egg
   private final Comparator<BytesRef> termComparator = BytesRef.getUTF8SortedAsUnicodeComparator();
   
-  private final float minSimilarity;
-  private final float scale_factor;
+  protected final float minSimilarity;
+  protected final float scale_factor;
   
-  private final int termLength;
+  protected final int termLength;
   
-  private int maxEdits;
-  private final boolean raw;
+  protected int maxEdits;
+  protected final boolean raw;
   
-  private final Terms terms;
+  protected final Terms terms;
   private final Term term;
-  private final int termText[];
-  private final int realPrefixLength;
+  protected final int termText[];
+  protected final int realPrefixLength;
   
   private final boolean transpositions;
   
@@ -95,7 +93,8 @@ public final class FuzzyTermsEnum extends TermsEnum {
    *  that contains information about competitive boosts during rewrite. It is also used
    *  to cache DFAs between segment transitions.
    * @param term Pattern term.
-   * @param minSimilarity Minimum required similarity for terms from the reader.
+   * @param minSimilarity Minimum required similarity for terms from the reader. Pass an integer value
+   *        representing edit distance. Passing a fraction is deprecated.
    * @param prefixLength Length of required common prefix. Default value is 0.
    * @throws IOException
    */
@@ -149,7 +148,7 @@ public final class FuzzyTermsEnum extends TermsEnum {
   /** return an automata-based enum for matching up to editDistance from
    *  lastTerm, if possible
    */
-  private TermsEnum getAutomatonEnum(int editDistance, BytesRef lastTerm)
+  protected TermsEnum getAutomatonEnum(int editDistance, BytesRef lastTerm)
       throws IOException {
     final List<CompiledAutomaton> runAutomata = initAutomata(editDistance);
     if (editDistance < runAutomata.size()) {
@@ -187,7 +186,7 @@ public final class FuzzyTermsEnum extends TermsEnum {
   }
   
   /** swap in a new actual enum to proxy to */
-  private void setEnum(TermsEnum actualEnum) {
+  protected void setEnum(TermsEnum actualEnum) {
     this.actualEnum = actualEnum;
     this.actualBoostAtt = actualEnum.attributes().addAttribute(BoostAttribute.class);
   }
@@ -209,14 +208,21 @@ public final class FuzzyTermsEnum extends TermsEnum {
       maxEdits--;
     
     if (oldMaxEdits != maxEdits || init) { // the maximum n has changed
-      TermsEnum newEnum = getAutomatonEnum(maxEdits, lastTerm);
-      if (newEnum != null) {
-        setEnum(newEnum);
-      } else if (init) {
-        setEnum(new LinearFuzzyTermsEnum());
-      }
+      maxEditDistanceChanged(lastTerm, maxEdits, init);
     }
   }
   
+  protected void maxEditDistanceChanged(BytesRef lastTerm, int maxEdits, boolean init)
+      throws IOException {
+    TermsEnum newEnum = getAutomatonEnum(maxEdits, lastTerm);
+    // instead of assert, we do a hard check in case someone uses our enum directly
+    // assert newEnum != null;
+    if (newEnum == null) {
+      assert maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
+      throw new IllegalArgumentException("maxEdits cannot be > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE");
+    }
+    setEnum(newEnum);
+  }
+  
   // for some raw min similarity and input term length, the maximum # of edits
   private int initialMaxDistance(float minimumSimilarity, int termLen) {
@@ -383,194 +389,6 @@ public final class FuzzyTermsEnum extends TermsEnum {
     }
   }
   
-  /**
-   * Implement fuzzy enumeration with linear brute force.
-   */
-  private class LinearFuzzyTermsEnum extends FilteredTermsEnum {
-    /* Allows us to save time required to create a new array
-     * every time similarity is called.
-     */
-    private int[] d;
-    private int[] p;
-    
-    // this is the text, minus the prefix
-    private final int[] text;
-    
-    private final BoostAttribute boostAtt =
-      attributes().addAttribute(BoostAttribute.class);
-    
-    /**
-     * Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
-     * length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity >
-     * <code>minSimilarity</code>.
-     * <p>
-     * After calling the constructor the enumeration is already pointing to the first
-     * valid term if such a term exists.
-     *
-     * @throws IOException
-     */
-    public LinearFuzzyTermsEnum() throws IOException {
-      super(terms.iterator(null));
-      
-      this.text = new int[termLength - realPrefixLength];
-      System.arraycopy(termText, realPrefixLength, text, 0, text.length);
-      final String prefix = UnicodeUtil.newString(termText, 0, realPrefixLength);
-      prefixBytesRef = new BytesRef(prefix);
-      this.d = new int[this.text.length + 1];
-      this.p = new int[this.text.length + 1];
-      
-      setInitialSeekTerm(prefixBytesRef);
-    }
-    
-    private final BytesRef prefixBytesRef;
-    // used for unicode conversion from BytesRef byte[] to int[]
-    private final IntsRef utf32 = new IntsRef(20);
-    
-    /**
-     * The termCompare method in FuzzyTermEnum uses Levenshtein distance to
-     * calculate the distance between the given term and the comparing term.
-     */
-    @Override
-    protected final AcceptStatus accept(BytesRef term) {
-      if (StringHelper.startsWith(term, prefixBytesRef)) {
-        UnicodeUtil.UTF8toUTF32(term, utf32);
-        final float similarity = similarity(utf32.ints, realPrefixLength, utf32.length - realPrefixLength);
-        if (similarity > minSimilarity) {
-          boostAtt.setBoost((similarity - minSimilarity) * scale_factor);
-          return AcceptStatus.YES;
-        } else return AcceptStatus.NO;
-      } else {
-        return AcceptStatus.END;
-      }
-    }
-    
-    /******************************
-     * Compute Levenshtein distance
-     ******************************/
-    
-    /**
-     * <p>Similarity returns a number that is 1.0f or less (including negative numbers)
-     * based on how similar the Term is compared to a target term.  It returns
-     * exactly 0.0f when
-     * <pre>
-     *    editDistance > maximumEditDistance</pre>
-     * Otherwise it returns:
-     * <pre>
-     *    1 - (editDistance / length)</pre>
-     * where length is the length of the shortest term (text or target) including a
-     * prefix that are identical and editDistance is the Levenshtein distance for
-     * the two words.</p>
-     *
-     * <p>Embedded within this algorithm is a fail-fast Levenshtein distance
-     * algorithm.  The fail-fast algorithm differs from the standard Levenshtein
-     * distance algorithm in that it is aborted if it is discovered that the
-     * minimum distance between the words is greater than some threshold.
-     *
-     * <p>To calculate the maximum distance threshold we use the following formula:
-     * <pre>
-     *     (1 - minimumSimilarity) * length</pre>
-     * where length is the shortest term including any prefix that is not part of the
-     * similarity comparison.  This formula was derived by solving for what maximum value
-     * of distance returns false for the following statements:
-     * <pre>
-     *   similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
-     *   return (similarity > minimumSimilarity);</pre>
-     * where distance is the Levenshtein distance for the two words.
-     * </p>
-     * <p>Levenshtein distance (also known as edit distance) is a measure of similarity
-     * between two strings where the distance is measured as the number of character
-     * deletions, insertions or substitutions required to transform one string to
-     * the other string.
-     * @param target the target word or phrase
-     * @return the similarity, 0.0 or less indicates that it matches less than the required
-     *  threshold and 1.0 indicates that the text and target are identical
-     */
-    private final float similarity(final int[] target, int offset, int length) {
-      final int m = length;
-      final int n = text.length;
-      if (n == 0) {
-        //we don't have anything to compare.  That means if we just add
-        //the letters for m we get the new word
-        return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) m / realPrefixLength);
-      }
-      if (m == 0) {
-        return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) n / realPrefixLength);
-      }
-      
-      final int maxDistance = calculateMaxDistance(m);
-      
-      if (maxDistance < Math.abs(m-n)) {
-        //just adding the characters of m to n or vice-versa results in
-        //too many edits
-        //for example "pre" length is 3 and "prefixes" length is 8.  We can see that
-        //given this optimal circumstance, the edit distance cannot be less than 5.
-        //which is 8-3 or more precisely Math.abs(3-8).
-        //if our maximum edit distance is 4, then we can discard this word
-        //without looking at it.
-        return Float.NEGATIVE_INFINITY;
-      }
-      
-      // init matrix d
-      for (int i = 0; i <=n; ++i) {
-        p[i] = i;
-      }
-      
-      // start computing edit distance
-      for (int j = 1; j<=m; ++j) { // iterates through target
-        int bestPossibleEditDistance = m;
-        final int t_j = target[offset+j-1]; // jth character of t
-        d[0] = j;
-        
-        for (int i=1; i<=n; ++i) { // iterates through text
-          // minimum of cell to the left+1, to the top+1, diagonally left and up +(0|1)
-          if (t_j != text[i-1]) {
-            d[i] = Math.min(Math.min(d[i-1], p[i]), p[i-1]) + 1;
-          } else {
-            d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]);
-          }
-          bestPossibleEditDistance = Math.min(bestPossibleEditDistance, d[i]);
-        }
-        
-        //After calculating row i, the best possible edit distance
-        //can be found by finding the smallest value in a given column.
-        //If the bestPossibleEditDistance is greater than the max distance, abort.
-        
-        if (j > maxDistance && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater
-          //the closest the target can be to the text is just too far away.
-          //this target is leaving the party early.
-          return Float.NEGATIVE_INFINITY;
-        }
-        
-        // copy current distance counts to 'previous row' distance counts: swap p and d
-        int _d[] = p;
-        p = d;
-        d = _d;
-      }
-      
-      // our last action in the above loop was to switch d and p, so p now
-      // actually has the most recent cost counts
-      
-      // this will return less than 0.0 when the edit distance is
-      // greater than the number of characters in the shorter word.
-      // but this was the formula that was previously used in FuzzyTermEnum,
-      // so it has not been changed (even though minimumSimilarity must be
-      // greater than 0.0)
-      return 1.0f - ((float)p[n] / (float) (realPrefixLength + Math.min(n, m)));
-    }
-    
-    /**
-     * The max Distance is the maximum Levenshtein distance for the text
-     * compared to some other value that results in score that is
-     * better than the minimum similarity.
-     * @param m the length of the "other value"
-     * @return the maximum levenshtein distance that we care about
-     */
-    private int calculateMaxDistance(int m) {
-      return raw ? maxEdits : Math.min(maxEdits,
-          (int)((1-minSimilarity) * (Math.min(text.length, m) + realPrefixLength)));
-    }
-  }
-  
   /** @lucene.internal */
   public float getMinSimilarity() {
     return minSimilarity;
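With the linear fallback gone, maxEditDistanceChanged throws when no automaton exists for the requested distance, and FuzzyQuery rejects distances above the supported maximum at construction time. A runnable sketch of the new failure mode; the demo class is hypothetical, but the behavior follows the constructor check above and the updated tests below:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;

public class MaxEditsCapDemo {
  public static void main(String[] args) {
    Term t = new Term("field", "lucene");

    // 0..2 edits: accepted, and always answered by a Levenshtein automaton.
    FuzzyQuery ok = new FuzzyQuery(t, 2, 0, 50, true);
    System.out.println(ok);

    // 3 edits: rejected up front rather than silently degrading to a
    // brute-force scan over every term in the index, as before.
    try {
      new FuzzyQuery(t, 3, 0, 50, true);
    } catch (IllegalArgumentException expected) {
      System.out.println("rejected: " + expected.getMessage());
    }
  }
}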
TestFuzzyQuery.java

@@ -52,32 +52,32 @@ public class TestFuzzyQuery extends LuceneTestCase {
     IndexSearcher searcher = newSearcher(reader);
     writer.close();
     
-    FuzzyQuery query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 0);
+    FuzzyQuery query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 0);
     ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(3, hits.length);
     
     // same with prefix
-    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 1);
+    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 1);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(3, hits.length);
-    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 2);
+    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 2);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(3, hits.length);
-    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 3);
+    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 3);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(3, hits.length);
-    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 4);
+    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 4);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(2, hits.length);
-    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 5);
+    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 5);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(1, hits.length);
-    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 6);
+    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 6);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(1, hits.length);
     
     // test scoring
-    query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0);
+    query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMaxEdits, 0);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals("3 documents should match", 3, hits.length);
     List<String> order = Arrays.asList("bbbbb","abbbb","aabbb");
@@ -89,7 +89,7 @@ public class TestFuzzyQuery extends LuceneTestCase {
     
     // test pq size by supplying maxExpansions=2
     // This query would normally return 3 documents, because 3 terms match (see above):
-    query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0, 2);
+    query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMaxEdits, 0, 2, false);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals("only 2 documents should match", 2, hits.length);
     order = Arrays.asList("bbbbb","abbbb");
@@ -100,15 +100,15 @@ public class TestFuzzyQuery extends LuceneTestCase {
     }
     
     // not similar enough:
-    query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);
+    query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMaxEdits, 0);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(0, hits.length);
-    query = new FuzzyQuery(new Term("field", "aaccc"), FuzzyQuery.defaultMinSimilarity, 0);   // edit distance to "aaaaa" = 3
+    query = new FuzzyQuery(new Term("field", "aaccc"), FuzzyQuery.defaultMaxEdits, 0);   // edit distance to "aaaaa" = 3
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(0, hits.length);
     
     // query identical to a word in the index:
-    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 0);
+    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 0);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(3, hits.length);
     assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
@@ -117,7 +117,7 @@ public class TestFuzzyQuery extends LuceneTestCase {
     assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
     
     // query similar to a word in the index:
-    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 0);
+    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 0);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(3, hits.length);
     assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
@@ -125,158 +125,69 @@ public class TestFuzzyQuery extends LuceneTestCase {
     assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
     
     // now with prefix
-    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 1);
+    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 1);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(3, hits.length);
     assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
     assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
     assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
-    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 2);
+    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 2);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(3, hits.length);
     assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
     assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
     assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
-    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 3);
+    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 3);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(3, hits.length);
     assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
     assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
     assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
-    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 4);
+    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 4);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(2, hits.length);
     assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
     assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
-    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 5);
+    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 5);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(0, hits.length);
     
-    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 0);
+    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 0);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(1, hits.length);
     assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
     
     // now with prefix
-    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 1);
+    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 1);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(1, hits.length);
     assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
-    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 2);
+    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 2);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(1, hits.length);
     assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
-    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 3);
+    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 3);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(1, hits.length);
     assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
-    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 4);
+    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 4);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(1, hits.length);
     assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
-    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 5);
+    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 5);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(0, hits.length);
     
     // different field = no match:
-    query = new FuzzyQuery(new Term("anotherfield", "ddddX"), FuzzyQuery.defaultMinSimilarity, 0);
+    query = new FuzzyQuery(new Term("anotherfield", "ddddX"), FuzzyQuery.defaultMaxEdits, 0);
     hits = searcher.search(query, null, 1000).scoreDocs;
     assertEquals(0, hits.length);
     
     reader.close();
     directory.close();
   }
   
-  public void testFuzzinessLong() throws Exception {
-    Directory directory = newDirectory();
-    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
-    addDoc("aaaaaaa", writer);
-    addDoc("segment", writer);
-    
-    IndexReader reader = writer.getReader();
-    IndexSearcher searcher = newSearcher(reader);
-    writer.close();
-    
-    FuzzyQuery query;
-    // not similar enough:
-    query = new FuzzyQuery(new Term("field", "xxxxx"), 0.5f, 0);
-    ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
-    assertEquals(0, hits.length);
-    // edit distance to "aaaaaaa" = 3, this matches because the string is longer than
-    // in testDefaultFuzziness so a bigger difference is allowed:
-    query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 0);
-    hits = searcher.search(query, null, 1000).scoreDocs;
-    assertEquals(1, hits.length);
-    assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa"));
-    
-    // now with prefix
-    query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 1);
-    hits = searcher.search(query, null, 1000).scoreDocs;
-    assertEquals(1, hits.length);
-    assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa"));
-    query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 4);
-    hits = searcher.search(query, null, 1000).scoreDocs;
-    assertEquals(1, hits.length);
-    assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa"));
-    query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 5);
-    hits = searcher.search(query, null, 1000).scoreDocs;
-    assertEquals(0, hits.length);
-    
-    // no match, more than half of the characters is wrong:
-    query = new FuzzyQuery(new Term("field", "aaacccc"), 0.5f, 0);
-    hits = searcher.search(query, null, 1000).scoreDocs;
-    assertEquals(0, hits.length);
-    
-    // now with prefix
-    query = new FuzzyQuery(new Term("field", "aaacccc"), 0.5f, 2);
-    hits = searcher.search(query, null, 1000).scoreDocs;
-    assertEquals(0, hits.length);
-    
-    // "student" and "stellent" are indeed similar to "segment" by default:
-    query = new FuzzyQuery(new Term("field", "student"), 0.5f, 0);
-    hits = searcher.search(query, null, 1000).scoreDocs;
-    assertEquals(1, hits.length);
-    query = new FuzzyQuery(new Term("field", "stellent"), 0.5f, 0);
-    hits = searcher.search(query, null, 1000).scoreDocs;
-    assertEquals(1, hits.length);
-    
-    // now with prefix
-    query = new FuzzyQuery(new Term("field", "student"), 0.5f, 1);
-    hits = searcher.search(query, null, 1000).scoreDocs;
-    assertEquals(1, hits.length);
-    query = new FuzzyQuery(new Term("field", "stellent"), 0.5f, 1);
-    hits = searcher.search(query, null, 1000).scoreDocs;
-    assertEquals(1, hits.length);
-    query = new FuzzyQuery(new Term("field", "student"), 0.5f, 2);
-    hits = searcher.search(query, null, 1000).scoreDocs;
-    assertEquals(0, hits.length);
-    query = new FuzzyQuery(new Term("field", "stellent"), 0.5f, 2);
-    hits = searcher.search(query, null, 1000).scoreDocs;
-    assertEquals(0, hits.length);
-    
-    // "student" doesn't match anymore thanks to increased minimum similarity:
-    query = new FuzzyQuery(new Term("field", "student"), 0.6f, 0);
-    hits = searcher.search(query, null, 1000).scoreDocs;
-    assertEquals(0, hits.length);
-    
-    try {
-      query = new FuzzyQuery(new Term("field", "student"), 1.1f);
-      fail("Expected IllegalArgumentException");
-    } catch (IllegalArgumentException e) {
-      // expecting exception
-    }
-    try {
-      query = new FuzzyQuery(new Term("field", "student"), -0.1f);
-      fail("Expected IllegalArgumentException");
-    } catch (IllegalArgumentException e) {
-      // expecting exception
-    }
-    
-    reader.close();
-    directory.close();
-  }
-  
   /**
    * MultiTermQuery provides (via attribute) information about which values
@@ -307,7 +218,7 @@ public class TestFuzzyQuery extends LuceneTestCase {
     
     MultiReader mr = new MultiReader(ir1, ir2);
     IndexSearcher searcher = newSearcher(mr);
-    FuzzyQuery fq = new FuzzyQuery(new Term("field", "z123456"), 1f, 0, 2);
+    FuzzyQuery fq = new FuzzyQuery(new Term("field", "z123456"), 1, 0, 2, false);
     TopDocs docs = searcher.search(fq, 2);
     assertEquals(5, docs.totalHits); // 5 docs, from the a and b's
     mr.close();
@@ -319,41 +230,6 @@ public class TestFuzzyQuery extends LuceneTestCase {
     directory2.close();
   }
   
-  public void testTokenLengthOpt() throws IOException {
-    Directory directory = newDirectory();
-    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
-    addDoc("12345678911", writer);
-    addDoc("segment", writer);
-    
-    IndexReader reader = writer.getReader();
-    IndexSearcher searcher = newSearcher(reader);
-    writer.close();
-    
-    Query query;
-    // term not over 10 chars, so optimization shortcuts
-    query = new FuzzyQuery(new Term("field", "1234569"), 0.9f);
-    ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
-    assertEquals(0, hits.length);
-    
-    // 10 chars, so no optimization
-    query = new FuzzyQuery(new Term("field", "1234567891"), 0.9f);
-    hits = searcher.search(query, null, 1000).scoreDocs;
-    assertEquals(0, hits.length);
-    
-    // over 10 chars, so no optimization
-    query = new FuzzyQuery(new Term("field", "12345678911"), 0.9f);
-    hits = searcher.search(query, null, 1000).scoreDocs;
-    assertEquals(1, hits.length);
-    
-    // over 10 chars, no match
-    query = new FuzzyQuery(new Term("field", "sdfsdfsdfsdf"), 0.9f);
-    hits = searcher.search(query, null, 1000).scoreDocs;
-    assertEquals(0, hits.length);
-    
-    reader.close();
-    directory.close();
-  }
-  
   /** Test the TopTermsBoostOnlyBooleanQueryRewrite rewrite method. */
   public void testBoostOnlyRewrite() throws Exception {
     Directory directory = newDirectory();
@@ -404,7 +280,7 @@ public class TestFuzzyQuery extends LuceneTestCase {
     IndexReader r = w.getReader();
     w.close();
     
-    Query q = new FuzzyQuery(new Term("field", "giga"), 0.9f);
+    Query q = new FuzzyQuery(new Term("field", "giga"), 0);
     
     // 3. search
     IndexSearcher searcher = newSearcher(r);
@@ -435,26 +311,17 @@ public class TestFuzzyQuery extends LuceneTestCase {
     assertEquals(1, hits.length);
     assertEquals("foobar", searcher.doc(hits[0].doc).get("field"));
     
-    q = new FuzzyQuery(new Term("field", "t"), 3);
-    hits = searcher.search(q, 10).scoreDocs;
-    assertEquals(1, hits.length);
-    assertEquals("test", searcher.doc(hits[0].doc).get("field"));
-    
-    q = new FuzzyQuery(new Term("field", "a"), 4f, 0, 50);
-    hits = searcher.search(q, 10).scoreDocs;
-    assertEquals(1, hits.length);
-    assertEquals("test", searcher.doc(hits[0].doc).get("field"));
-    
-    q = new FuzzyQuery(new Term("field", "a"), 6f, 0, 50);
-    hits = searcher.search(q, 10).scoreDocs;
-    assertEquals(2, hits.length);
-    assertEquals("test", searcher.doc(hits[0].doc).get("field"));
-    assertEquals("foobar", searcher.doc(hits[1].doc).get("field"));
+    try {
+      q = new FuzzyQuery(new Term("field", "t"), 3);
+      fail();
+    } catch (IllegalArgumentException expected) {
+      // expected
+    }
     
     reader.close();
     index.close();
   }
   
   private void addDoc(String text, RandomIndexWriter writer) throws IOException {
     Document doc = new Document();
     doc.add(newField("field", text, TextField.TYPE_STORED));
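Several rewritten tests pass transpositions=false to preserve the exact expectations of the old Levenshtein-based code. What the flag changes, as a standalone illustration (plain Java, not Lucene code): with transpositions enabled, a swapped adjacent pair counts as one edit under the optimal-string-alignment distance, while classic Levenshtein charges two.

public class OsaDistanceDemo {
  // Dynamic-programming edit distance; when "transpositions" is true this is
  // the optimal-string-alignment (Damerau-Levenshtein) variant.
  static int distance(String a, String b, boolean transpositions) {
    int n = a.length(), m = b.length();
    int[][] d = new int[n + 1][m + 1];
    for (int i = 0; i <= n; i++) d[i][0] = i;
    for (int j = 0; j <= m; j++) d[0][j] = j;
    for (int i = 1; i <= n; i++) {
      for (int j = 1; j <= m; j++) {
        int cost = a.charAt(i - 1) == b.charAt(j - 1) ? 0 : 1;
        d[i][j] = Math.min(Math.min(d[i - 1][j] + 1, d[i][j - 1] + 1),
                           d[i - 1][j - 1] + cost);
        // count a swap of two adjacent characters as a single edit
        if (transpositions && i > 1 && j > 1
            && a.charAt(i - 1) == b.charAt(j - 2)
            && a.charAt(i - 2) == b.charAt(j - 1)) {
          d[i][j] = Math.min(d[i][j], d[i - 2][j - 2] + 1);
        }
      }
    }
    return d[n][m];
  }

  public static void main(String[] args) {
    // A single swapped pair costs 1 edit with transpositions, 2 without:
    System.out.println(distance("lucene", "lucnee", true));  // 1
    System.out.println(distance("lucene", "lucnee", false)); // 2
  }
}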
TestSpanMultiTermQueryWrapper.java

@@ -90,7 +90,7 @@ public class TestSpanMultiTermQueryWrapper extends LuceneTestCase {
   
   public void testFuzzy2() throws Exception {
     // maximum of 1 term expansion
-    FuzzyQuery fq = new FuzzyQuery(new Term("field", "broan"), 1f, 0, 1);
+    FuzzyQuery fq = new FuzzyQuery(new Term("field", "broan"), 1, 0, 1, false);
     SpanQuery sfq = new SpanMultiTermQueryWrapper<FuzzyQuery>(fq);
     // will only match jumps over lazy broun dog
     SpanPositionRangeQuery sprq = new SpanPositionRangeQuery(sfq, 0, 100);
HighlighterTest.java

@@ -669,12 +669,12 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
       @Override
       public void run() throws Exception {
        numHighlights = 0;
-        FuzzyQuery fuzzyQuery = new FuzzyQuery(new Term(FIELD_NAME, "kinnedy"), 0.5f);
+        FuzzyQuery fuzzyQuery = new FuzzyQuery(new Term(FIELD_NAME, "kinnedy"), 2);
         fuzzyQuery.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
         doSearching(fuzzyQuery);
         doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this, true);
         assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
-            numHighlights == 5);
+            numHighlights == 4);
       }
     };
QueryParserBase.java

@@ -774,7 +774,10 @@ public abstract class QueryParserBase {
    */
   protected Query newFuzzyQuery(Term term, float minimumSimilarity, int prefixLength) {
     // FuzzyQuery doesn't yet allow constant score rewrite
-    return new FuzzyQuery(term,minimumSimilarity,prefixLength);
+    String text = term.text();
+    int numEdits = FuzzyQuery.floatToEdits(minimumSimilarity,
+        text.codePointCount(0, text.length()));
+    return new FuzzyQuery(term,numEdits,prefixLength);
   }
   
   // TODO: Should this be protected instead?
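The classic query parser keeps accepting the deprecated fractional syntax and funnels it through floatToEdits, so old query strings now map onto the 0..2 edit scale. A hedged sketch of the resulting conversions; the demo class is hypothetical, but the arithmetic follows floatToEdits as added above:

import org.apache.lucene.search.FuzzyQuery;

public class ParserFuzzyMappingDemo {
  public static void main(String[] args) {
    // Per newFuzzyQuery: numEdits = floatToEdits(similarity, codePoints(term)).
    // "roam~0.8": (int)((1 - 0.8) * 4) = 0 -> exact match only.
    System.out.println(FuzzyQuery.floatToEdits(0.8f, 4)); // 0
    // "roam~0.5": (int)((1 - 0.5) * 4) = 2 -> the new default cap.
    System.out.println(FuzzyQuery.floatToEdits(0.5f, 4)); // 2
    // "roam~2": values above 1 are taken as raw distances: min(2, 2) = 2.
    System.out.println(FuzzyQuery.floatToEdits(2f, 4));   // 2
  }
}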
@ -191,12 +191,13 @@ enabling substantial customization to how a query is created.
|
||||||
<p>Note: You cannot use a * or ? symbol as the first character of a search.</p>
|
<p>Note: You cannot use a * or ? symbol as the first character of a search.</p>
|
||||||
<a name="N1009B"></a><a name="Fuzzy_Searches"></a>
|
<a name="N1009B"></a><a name="Fuzzy_Searches"></a>
|
||||||
<h3 class="boxed">Fuzzy Searches</h3>
|
<h3 class="boxed">Fuzzy Searches</h3>
|
||||||
<p>Lucene supports fuzzy searches based on the Levenshtein Distance, or Edit Distance algorithm. To do a fuzzy search use the tilde, "~", symbol at the end of a Single word Term. For example to search for a term similar in spelling to "roam" use the fuzzy search: </p>
|
<p>Lucene supports fuzzy searches based on Damerau-Levenshtein Distance. To do a fuzzy search use the tilde, "~", symbol at the end of a Single word Term. For example to search for a term similar in spelling to "roam" use the fuzzy search: </p>
|
||||||
<pre class="code">roam~</pre>
|
<pre class="code">roam~</pre>
|
||||||
<p>This search will find terms like foam and roams.</p>
|
<p>This search will find terms like foam and roams.</p>
|
||||||
<p>Starting with Lucene 1.9 an additional (optional) parameter can specify the required similarity. The value is between 0 and 1, with a value closer to 1 only terms with a higher similarity will be matched. For example:</p>
|
<p>An additional (optional) parameter can specify the maximum number of edits allowed. The value is between 0 and 2, For example:</p>
|
||||||
<pre class="code">roam~0.8</pre>
|
<pre class="code">roam~1</pre>
|
||||||
<p>The default that is used if the parameter is not given is 0.5.</p>
|
<p>The default that is used if the parameter is not given is 2 edits.</p>
|
||||||
|
<p>Previously, a floating point value was allowed here. This syntax is considered deprecated and will be removed in Lucene 5.0.</p>
|
||||||
<a name="N100B4"></a><a name="Proximity_Searches"></a>
|
<a name="N100B4"></a><a name="Proximity_Searches"></a>
|
||||||
<h3 class="boxed">Proximity Searches</h3>
|
<h3 class="boxed">Proximity Searches</h3>
|
||||||
<p>Lucene supports finding words that are within a specific distance of one another. To do a proximity search use the tilde, "~", symbol at the end of a Phrase. For example, to search for "apache" and "jakarta" within 10 words of each other in a document use the search: </p>
|
<p>Lucene supports finding words that are within a specific distance of one another. To do a proximity search use the tilde, "~", symbol at the end of a Phrase. For example, to search for "apache" and "jakarta" within 10 words of each other in a document use the search: </p>
|
||||||
|
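With the new syntax the value after the tilde is an integer edit count. A small parsing sketch (the version constant and analyzer are illustrative; any analyzer works):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;

public class FuzzySyntaxExample {
  public static void main(String[] args) throws Exception {
    QueryParser qp = new QueryParser(Version.LUCENE_40, "field",
        new StandardAnalyzer(Version.LUCENE_40));
    Query q = qp.parse("roam~1");          // at most one edit
    System.out.println(q);                 // field:roam~1
    System.out.println(qp.parse("roam~")); // bare tilde keeps the default: field:roam~2
  }
}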
|
|
@ -34,9 +34,13 @@ public class FuzzyQueryNodeBuilder implements StandardQueryBuilder {
|
||||||
|
|
||||||
public FuzzyQuery build(QueryNode queryNode) throws QueryNodeException {
|
public FuzzyQuery build(QueryNode queryNode) throws QueryNodeException {
|
||||||
FuzzyQueryNode fuzzyNode = (FuzzyQueryNode) queryNode;
|
FuzzyQueryNode fuzzyNode = (FuzzyQueryNode) queryNode;
|
||||||
|
String text = fuzzyNode.getTextAsString();
|
||||||
|
|
||||||
|
int numEdits = FuzzyQuery.floatToEdits(fuzzyNode.getSimilarity(),
|
||||||
|
text.codePointCount(0, text.length()));
|
||||||
|
|
||||||
return new FuzzyQuery(new Term(fuzzyNode.getFieldAsString(), fuzzyNode
|
return new FuzzyQuery(new Term(fuzzyNode.getFieldAsString(), fuzzyNode
|
||||||
.getTextAsString()), fuzzyNode.getSimilarity(), fuzzyNode
|
.getTextAsString()), numEdits, fuzzyNode
|
||||||
.getPrefixLength());
|
.getPrefixLength());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
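Both conversion call sites measure term length with codePointCount rather than String.length(), so a supplementary character counts as one position when the similarity is scaled. A quick illustration:

public class CodePointLengthExample {
  public static void main(String[] args) {
    String s = "a\uD83D\uDE00b"; // 'a', one emoji encoded as a surrogate pair, 'b'
    System.out.println(s.length());                      // 4 UTF-16 units
    System.out.println(s.codePointCount(0, s.length())); // 3 code points
  }
}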
|
|
@ -5,7 +5,7 @@ import org.apache.lucene.queryparser.xml.DOMUtils;
|
||||||
import org.apache.lucene.queryparser.xml.ParserException;
|
import org.apache.lucene.queryparser.xml.ParserException;
|
||||||
import org.apache.lucene.queryparser.xml.QueryBuilder;
|
import org.apache.lucene.queryparser.xml.QueryBuilder;
|
||||||
import org.apache.lucene.sandbox.queries.FuzzyLikeThisQuery;
|
import org.apache.lucene.sandbox.queries.FuzzyLikeThisQuery;
|
||||||
import org.apache.lucene.search.FuzzyQuery;
|
import org.apache.lucene.sandbox.queries.SlowFuzzyQuery;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.Query;
|
||||||
import org.w3c.dom.Element;
|
import org.w3c.dom.Element;
|
||||||
import org.w3c.dom.NodeList;
|
import org.w3c.dom.NodeList;
|
||||||
|
@ -33,7 +33,7 @@ import org.w3c.dom.NodeList;
|
||||||
public class FuzzyLikeThisQueryBuilder implements QueryBuilder {
|
public class FuzzyLikeThisQueryBuilder implements QueryBuilder {
|
||||||
|
|
||||||
private static final int DEFAULT_MAX_NUM_TERMS = 50;
|
private static final int DEFAULT_MAX_NUM_TERMS = 50;
|
||||||
private static final float DEFAULT_MIN_SIMILARITY = FuzzyQuery.defaultMinSimilarity;
|
private static final float DEFAULT_MIN_SIMILARITY = SlowFuzzyQuery.defaultMinSimilarity;
|
||||||
private static final int DEFAULT_PREFIX_LENGTH = 1;
|
private static final int DEFAULT_PREFIX_LENGTH = 1;
|
||||||
private static final boolean DEFAULT_IGNORE_TF = false;
|
private static final boolean DEFAULT_IGNORE_TF = false;
|
||||||
|
|
||||||
|
|
|
@ -59,8 +59,8 @@ public class TestAnalyzingQueryParser extends LuceneTestCase {
|
||||||
fuzzyInput = new String[] { "Übersetzung Übersetzung~0.9",
|
fuzzyInput = new String[] { "Übersetzung Übersetzung~0.9",
|
||||||
"Mötley Crüe Mötley~0.75 Crüe~0.5",
|
"Mötley Crüe Mötley~0.75 Crüe~0.5",
|
||||||
"Renée Zellweger Renée~0.9 Zellweger~" };
|
"Renée Zellweger Renée~0.9 Zellweger~" };
|
||||||
fuzzyExpected = new String[] { "ubersetzung ubersetzung~0.9",
|
fuzzyExpected = new String[] { "ubersetzung ubersetzung~1",
|
||||||
"motley crue motley~0.75 crue~0.5", "renee zellweger renee~0.9 zellweger~2.0" };
|
"motley crue motley~1 crue~2", "renee zellweger renee~0 zellweger~2" };
|
||||||
|
|
||||||
a = new ASCIIAnalyzer();
|
a = new ASCIIAnalyzer();
|
||||||
}
|
}
|
||||||
|
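The new expected strings follow from the length-scaled conversion sketched earlier, worked out per term (lengths are code points of the ASCII-folded forms):

// (1 - 0.9)  * 11 = 1.1 -> 1 edit   ("ubersetzung~0.9" becomes ubersetzung~1)
// (1 - 0.75) * 6  = 1.5 -> 1 edit   ("motley~0.75"     becomes motley~1)
// (1 - 0.5)  * 4  = 2.0 -> 2 edits  ("crue~0.5"        becomes crue~2)
// (1 - 0.9)  * 5  = 0.5 -> 0 edits  ("renee~0.9"       becomes renee~0)
// a bare "~" keeps the default      ("zellweger~"      becomes zellweger~2)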
|
|
@ -85,10 +85,10 @@ public class TestMultiFieldQueryParser extends LuceneTestCase {
|
||||||
assertEquals("((b:one t:one)^2.0) (b:two t:two)", q.toString());
|
assertEquals("((b:one t:one)^2.0) (b:two t:two)", q.toString());
|
||||||
|
|
||||||
q = mfqp.parse("one~ two");
|
q = mfqp.parse("one~ two");
|
||||||
assertEquals("(b:one~2.0 t:one~2.0) (b:two t:two)", q.toString());
|
assertEquals("(b:one~2 t:one~2) (b:two t:two)", q.toString());
|
||||||
|
|
||||||
q = mfqp.parse("one~0.8 two^2");
|
q = mfqp.parse("one~0.8 two^2");
|
||||||
assertEquals("(b:one~0.8 t:one~0.8) ((b:two t:two)^2.0)", q.toString());
|
assertEquals("(b:one~0 t:one~0) ((b:two t:two)^2.0)", q.toString());
|
||||||
|
|
||||||
q = mfqp.parse("one* two*");
|
q = mfqp.parse("one* two*");
|
||||||
assertEquals("(b:one* t:one*) (b:two* t:two*)", q.toString());
|
assertEquals("(b:one* t:one*) (b:two* t:two*)", q.toString());
|
||||||
|
@ -272,7 +272,7 @@ public class TestMultiFieldQueryParser extends LuceneTestCase {
|
||||||
q = parser.parse("bla*");
|
q = parser.parse("bla*");
|
||||||
assertEquals("f1:bla* f2:bla* f3:bla*", q.toString());
|
assertEquals("f1:bla* f2:bla* f3:bla*", q.toString());
|
||||||
q = parser.parse("bla~");
|
q = parser.parse("bla~");
|
||||||
assertEquals("f1:bla~2.0 f2:bla~2.0 f3:bla~2.0", q.toString());
|
assertEquals("f1:bla~2 f2:bla~2 f3:bla~2", q.toString());
|
||||||
q = parser.parse("[a TO c]");
|
q = parser.parse("[a TO c]");
|
||||||
assertEquals("f1:[a TO c] f2:[a TO c] f3:[a TO c]", q.toString());
|
assertEquals("f1:[a TO c] f2:[a TO c] f3:[a TO c]", q.toString());
|
||||||
}
|
}
|
||||||
|
|
|
@ -282,10 +282,10 @@ public class TestPrecedenceQueryParser extends LuceneTestCase {
|
||||||
public void testWildcard() throws Exception {
|
public void testWildcard() throws Exception {
|
||||||
assertQueryEquals("term*", null, "term*");
|
assertQueryEquals("term*", null, "term*");
|
||||||
assertQueryEquals("term*^2", null, "term*^2.0");
|
assertQueryEquals("term*^2", null, "term*^2.0");
|
||||||
assertQueryEquals("term~", null, "term~2.0");
|
assertQueryEquals("term~", null, "term~2");
|
||||||
assertQueryEquals("term~0.7", null, "term~0.7");
|
assertQueryEquals("term~0.7", null, "term~1");
|
||||||
assertQueryEquals("term~^3", null, "term~2.0^3.0");
|
assertQueryEquals("term~^3", null, "term~2^3.0");
|
||||||
assertQueryEquals("term^3~", null, "term~2.0^3.0");
|
assertQueryEquals("term^3~", null, "term~2^3.0");
|
||||||
assertQueryEquals("term*germ", null, "term*germ");
|
assertQueryEquals("term*germ", null, "term*germ");
|
||||||
assertQueryEquals("term*germ^3", null, "term*germ^3.0");
|
assertQueryEquals("term*germ^3", null, "term*germ^3.0");
|
||||||
|
|
||||||
|
@ -294,10 +294,10 @@ public class TestPrecedenceQueryParser extends LuceneTestCase {
|
||||||
assertTrue(getQuery("term~", null) instanceof FuzzyQuery);
|
assertTrue(getQuery("term~", null) instanceof FuzzyQuery);
|
||||||
assertTrue(getQuery("term~0.7", null) instanceof FuzzyQuery);
|
assertTrue(getQuery("term~0.7", null) instanceof FuzzyQuery);
|
||||||
FuzzyQuery fq = (FuzzyQuery) getQuery("term~0.7", null);
|
FuzzyQuery fq = (FuzzyQuery) getQuery("term~0.7", null);
|
||||||
assertEquals(0.7f, fq.getMinSimilarity(), 0.1f);
|
assertEquals(1, fq.getMaxEdits());
|
||||||
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
|
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
|
||||||
fq = (FuzzyQuery) getQuery("term~", null);
|
fq = (FuzzyQuery) getQuery("term~", null);
|
||||||
assertEquals(2.0f, fq.getMinSimilarity(), 0.1f);
|
assertEquals(2, fq.getMaxEdits());
|
||||||
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
|
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
|
||||||
try {
|
try {
|
||||||
getQuery("term~1.1", null); // value > 1, throws exception
|
getQuery("term~1.1", null); // value > 1, throws exception
|
||||||
|
@ -336,9 +336,9 @@ public class TestPrecedenceQueryParser extends LuceneTestCase {
|
||||||
assertWildcardQueryEquals("TE?M", false, "TE?M");
|
assertWildcardQueryEquals("TE?M", false, "TE?M");
|
||||||
assertWildcardQueryEquals("Te?m*gerM", false, "Te?m*gerM");
|
assertWildcardQueryEquals("Te?m*gerM", false, "Te?m*gerM");
|
||||||
// Fuzzy queries:
|
// Fuzzy queries:
|
||||||
assertWildcardQueryEquals("Term~", "term~2.0");
|
assertWildcardQueryEquals("Term~", "term~2");
|
||||||
assertWildcardQueryEquals("Term~", true, "term~2.0");
|
assertWildcardQueryEquals("Term~", true, "term~2");
|
||||||
assertWildcardQueryEquals("Term~", false, "Term~2.0");
|
assertWildcardQueryEquals("Term~", false, "Term~2");
|
||||||
// Range queries:
|
// Range queries:
|
||||||
assertWildcardQueryEquals("[A TO C]", "[a TO c]");
|
assertWildcardQueryEquals("[A TO C]", "[a TO c]");
|
||||||
assertWildcardQueryEquals("[A TO C]", true, "[a TO c]");
|
assertWildcardQueryEquals("[A TO C]", true, "[a TO c]");
|
||||||
|
@ -498,10 +498,10 @@ public class TestPrecedenceQueryParser extends LuceneTestCase {
|
||||||
|
|
||||||
assertQueryEquals("a:b\\\\?c", a, "a:b\\?c");
|
assertQueryEquals("a:b\\\\?c", a, "a:b\\?c");
|
||||||
|
|
||||||
assertQueryEquals("a:b\\-c~", a, "a:b-c~2.0");
|
assertQueryEquals("a:b\\-c~", a, "a:b-c~2");
|
||||||
assertQueryEquals("a:b\\+c~", a, "a:b+c~2.0");
|
assertQueryEquals("a:b\\+c~", a, "a:b+c~2");
|
||||||
assertQueryEquals("a:b\\:c~", a, "a:b:c~2.0");
|
assertQueryEquals("a:b\\:c~", a, "a:b:c~2");
|
||||||
assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2.0");
|
assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2");
|
||||||
|
|
||||||
assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]");
|
assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]");
|
||||||
assertQueryEquals("[ a\\: TO a\\~ ]", null, "[a: TO a~]");
|
assertQueryEquals("[ a\\: TO a\\~ ]", null, "[a: TO a~]");
|
||||||
|
|
|
@ -100,10 +100,10 @@ public class TestMultiFieldQPHelper extends LuceneTestCase {
|
||||||
assertEquals("((b:one t:one)^2.0) (b:two t:two)", q.toString());
|
assertEquals("((b:one t:one)^2.0) (b:two t:two)", q.toString());
|
||||||
|
|
||||||
q = mfqp.parse("one~ two", null);
|
q = mfqp.parse("one~ two", null);
|
||||||
assertEquals("(b:one~2.0 t:one~2.0) (b:two t:two)", q.toString());
|
assertEquals("(b:one~2 t:one~2) (b:two t:two)", q.toString());
|
||||||
|
|
||||||
q = mfqp.parse("one~0.8 two^2", null);
|
q = mfqp.parse("one~0.8 two^2", null);
|
||||||
assertEquals("(b:one~0.8 t:one~0.8) ((b:two t:two)^2.0)", q.toString());
|
assertEquals("(b:one~0 t:one~0) ((b:two t:two)^2.0)", q.toString());
|
||||||
|
|
||||||
q = mfqp.parse("one* two*", null);
|
q = mfqp.parse("one* two*", null);
|
||||||
assertEquals("(b:one* t:one*) (b:two* t:two*)", q.toString());
|
assertEquals("(b:one* t:one*) (b:two* t:two*)", q.toString());
|
||||||
|
@ -311,7 +311,7 @@ public class TestMultiFieldQPHelper extends LuceneTestCase {
|
||||||
q = parser.parse("bla*", null);
|
q = parser.parse("bla*", null);
|
||||||
assertEquals("f1:bla* f2:bla* f3:bla*", q.toString());
|
assertEquals("f1:bla* f2:bla* f3:bla*", q.toString());
|
||||||
q = parser.parse("bla~", null);
|
q = parser.parse("bla~", null);
|
||||||
assertEquals("f1:bla~2.0 f2:bla~2.0 f3:bla~2.0", q.toString());
|
assertEquals("f1:bla~2 f2:bla~2 f3:bla~2", q.toString());
|
||||||
q = parser.parse("[a TO c]", null);
|
q = parser.parse("[a TO c]", null);
|
||||||
assertEquals("f1:[a TO c] f2:[a TO c] f3:[a TO c]", q.toString());
|
assertEquals("f1:[a TO c] f2:[a TO c] f3:[a TO c]", q.toString());
|
||||||
}
|
}
|
||||||
|
|
|
@ -514,12 +514,12 @@ public class TestQPHelper extends LuceneTestCase {
|
||||||
public void testWildcard() throws Exception {
|
public void testWildcard() throws Exception {
|
||||||
assertQueryEquals("term*", null, "term*");
|
assertQueryEquals("term*", null, "term*");
|
||||||
assertQueryEquals("term*^2", null, "term*^2.0");
|
assertQueryEquals("term*^2", null, "term*^2.0");
|
||||||
assertQueryEquals("term~", null, "term~2.0");
|
assertQueryEquals("term~", null, "term~2");
|
||||||
assertQueryEquals("term~0.7", null, "term~0.7");
|
assertQueryEquals("term~0.7", null, "term~1");
|
||||||
|
|
||||||
assertQueryEquals("term~^3", null, "term~2.0^3.0");
|
assertQueryEquals("term~^3", null, "term~2^3.0");
|
||||||
|
|
||||||
assertQueryEquals("term^3~", null, "term~2.0^3.0");
|
assertQueryEquals("term^3~", null, "term~2^3.0");
|
||||||
assertQueryEquals("term*germ", null, "term*germ");
|
assertQueryEquals("term*germ", null, "term*germ");
|
||||||
assertQueryEquals("term*germ^3", null, "term*germ^3.0");
|
assertQueryEquals("term*germ^3", null, "term*germ^3.0");
|
||||||
|
|
||||||
|
@ -528,10 +528,10 @@ public class TestQPHelper extends LuceneTestCase {
|
||||||
assertTrue(getQuery("term~", null) instanceof FuzzyQuery);
|
assertTrue(getQuery("term~", null) instanceof FuzzyQuery);
|
||||||
assertTrue(getQuery("term~0.7", null) instanceof FuzzyQuery);
|
assertTrue(getQuery("term~0.7", null) instanceof FuzzyQuery);
|
||||||
FuzzyQuery fq = (FuzzyQuery) getQuery("term~0.7", null);
|
FuzzyQuery fq = (FuzzyQuery) getQuery("term~0.7", null);
|
||||||
assertEquals(0.7f, fq.getMinSimilarity(), 0.1f);
|
assertEquals(1, fq.getMaxEdits());
|
||||||
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
|
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
|
||||||
fq = (FuzzyQuery) getQuery("term~", null);
|
fq = (FuzzyQuery) getQuery("term~", null);
|
||||||
assertEquals(2.0f, fq.getMinSimilarity(), 0.1f);
|
assertEquals(2, fq.getMaxEdits());
|
||||||
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
|
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
|
||||||
|
|
||||||
assertQueryNodeException("term~1.1"); // value > 1, throws exception
|
assertQueryNodeException("term~1.1"); // value > 1, throws exception
|
||||||
|
@ -567,9 +567,9 @@ public class TestQPHelper extends LuceneTestCase {
|
||||||
assertWildcardQueryEquals("TE?M", false, "TE?M");
|
assertWildcardQueryEquals("TE?M", false, "TE?M");
|
||||||
assertWildcardQueryEquals("Te?m*gerM", false, "Te?m*gerM");
|
assertWildcardQueryEquals("Te?m*gerM", false, "Te?m*gerM");
|
||||||
// Fuzzy queries:
|
// Fuzzy queries:
|
||||||
assertWildcardQueryEquals("Term~", "term~2.0");
|
assertWildcardQueryEquals("Term~", "term~2");
|
||||||
assertWildcardQueryEquals("Term~", true, "term~2.0");
|
assertWildcardQueryEquals("Term~", true, "term~2");
|
||||||
assertWildcardQueryEquals("Term~", false, "Term~2.0");
|
assertWildcardQueryEquals("Term~", false, "Term~2");
|
||||||
// Range queries:
|
// Range queries:
|
||||||
|
|
||||||
// TODO: implement this on QueryParser
|
// TODO: implement this on QueryParser
|
||||||
|
@ -805,10 +805,10 @@ public class TestQPHelper extends LuceneTestCase {
|
||||||
|
|
||||||
assertQueryEquals("a:b\\\\?c", a, "a:b\\?c");
|
assertQueryEquals("a:b\\\\?c", a, "a:b\\?c");
|
||||||
|
|
||||||
assertQueryEquals("a:b\\-c~", a, "a:b-c~2.0");
|
assertQueryEquals("a:b\\-c~", a, "a:b-c~2");
|
||||||
assertQueryEquals("a:b\\+c~", a, "a:b+c~2.0");
|
assertQueryEquals("a:b\\+c~", a, "a:b+c~2");
|
||||||
assertQueryEquals("a:b\\:c~", a, "a:b:c~2.0");
|
assertQueryEquals("a:b\\:c~", a, "a:b:c~2");
|
||||||
assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2.0");
|
assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2");
|
||||||
|
|
||||||
// TODO: implement Range queries on QueryParser
|
// TODO: implement Range queries on QueryParser
|
||||||
assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]");
|
assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]");
|
||||||
|
|
|
@ -420,10 +420,10 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
|
||||||
public void testWildcard() throws Exception {
|
public void testWildcard() throws Exception {
|
||||||
assertQueryEquals("term*", null, "term*");
|
assertQueryEquals("term*", null, "term*");
|
||||||
assertQueryEquals("term*^2", null, "term*^2.0");
|
assertQueryEquals("term*^2", null, "term*^2.0");
|
||||||
assertQueryEquals("term~", null, "term~2.0");
|
assertQueryEquals("term~", null, "term~2");
|
||||||
assertQueryEquals("term~0.7", null, "term~0.7");
|
assertQueryEquals("term~0.7", null, "term~1");
|
||||||
assertQueryEquals("term~^3", null, "term~2.0^3.0");
|
assertQueryEquals("term~^3", null, "term~2^3.0");
|
||||||
assertQueryEquals("term^3~", null, "term~2.0^3.0");
|
assertQueryEquals("term^3~", null, "term~2^3.0");
|
||||||
assertQueryEquals("term*germ", null, "term*germ");
|
assertQueryEquals("term*germ", null, "term*germ");
|
||||||
assertQueryEquals("term*germ^3", null, "term*germ^3.0");
|
assertQueryEquals("term*germ^3", null, "term*germ^3.0");
|
||||||
|
|
||||||
|
@ -432,10 +432,10 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
|
||||||
assertTrue(getQuery("term~", null) instanceof FuzzyQuery);
|
assertTrue(getQuery("term~", null) instanceof FuzzyQuery);
|
||||||
assertTrue(getQuery("term~0.7", null) instanceof FuzzyQuery);
|
assertTrue(getQuery("term~0.7", null) instanceof FuzzyQuery);
|
||||||
FuzzyQuery fq = (FuzzyQuery)getQuery("term~0.7", null);
|
FuzzyQuery fq = (FuzzyQuery)getQuery("term~0.7", null);
|
||||||
assertEquals(0.7f, fq.getMinSimilarity(), 0.1f);
|
assertEquals(1, fq.getMaxEdits());
|
||||||
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
|
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
|
||||||
fq = (FuzzyQuery)getQuery("term~", null);
|
fq = (FuzzyQuery)getQuery("term~", null);
|
||||||
assertEquals(2.0f, fq.getMinSimilarity(), 0.1f);
|
assertEquals(2, fq.getMaxEdits());
|
||||||
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
|
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
|
||||||
|
|
||||||
assertParseException("term~1.1"); // value > 1, throws exception
|
assertParseException("term~1.1"); // value > 1, throws exception
|
||||||
|
@ -470,9 +470,9 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
|
||||||
assertWildcardQueryEquals("TE?M", false, "TE?M");
|
assertWildcardQueryEquals("TE?M", false, "TE?M");
|
||||||
assertWildcardQueryEquals("Te?m*gerM", false, "Te?m*gerM");
|
assertWildcardQueryEquals("Te?m*gerM", false, "Te?m*gerM");
|
||||||
// Fuzzy queries:
|
// Fuzzy queries:
|
||||||
assertWildcardQueryEquals("Term~", "term~2.0");
|
assertWildcardQueryEquals("Term~", "term~2");
|
||||||
assertWildcardQueryEquals("Term~", true, "term~2.0");
|
assertWildcardQueryEquals("Term~", true, "term~2");
|
||||||
assertWildcardQueryEquals("Term~", false, "Term~2.0");
|
assertWildcardQueryEquals("Term~", false, "Term~2");
|
||||||
// Range queries:
|
// Range queries:
|
||||||
assertWildcardQueryEquals("[A TO C]", "[a TO c]");
|
assertWildcardQueryEquals("[A TO C]", "[a TO c]");
|
||||||
assertWildcardQueryEquals("[A TO C]", true, "[a TO c]");
|
assertWildcardQueryEquals("[A TO C]", true, "[a TO c]");
|
||||||
|
@ -693,10 +693,10 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
|
||||||
|
|
||||||
assertQueryEquals("a:b\\\\?c", a, "a:b\\\\?c");
|
assertQueryEquals("a:b\\\\?c", a, "a:b\\\\?c");
|
||||||
|
|
||||||
assertQueryEquals("a:b\\-c~", a, "a:b-c~2.0");
|
assertQueryEquals("a:b\\-c~", a, "a:b-c~2");
|
||||||
assertQueryEquals("a:b\\+c~", a, "a:b+c~2.0");
|
assertQueryEquals("a:b\\+c~", a, "a:b+c~2");
|
||||||
assertQueryEquals("a:b\\:c~", a, "a:b:c~2.0");
|
assertQueryEquals("a:b\\:c~", a, "a:b:c~2");
|
||||||
assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2.0");
|
assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2");
|
||||||
|
|
||||||
assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]");
|
assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]");
|
||||||
assertQueryEquals("[ a\\: TO a\\~ ]", null, "[a: TO a~]");
|
assertQueryEquals("[ a\\: TO a\\~ ]", null, "[a: TO a~]");
|
||||||
|
@ -1271,7 +1271,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
|
||||||
public void testDistanceAsEditsParsing() throws Exception {
|
public void testDistanceAsEditsParsing() throws Exception {
|
||||||
QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockAnalyzer(random()));
|
QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockAnalyzer(random()));
|
||||||
FuzzyQuery q = (FuzzyQuery) qp.parse("foobar~2");
|
FuzzyQuery q = (FuzzyQuery) qp.parse("foobar~2");
|
||||||
assertEquals(2f, q.getMinSimilarity(), 0.0001f);
|
assertEquals(2, q.getMaxEdits());
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testPhraseQueryToString() throws ParseException {
|
public void testPhraseQueryToString() throws ParseException {
|
||||||
|
|
|
@ -211,7 +211,7 @@ public class FuzzyLikeThisQuery extends Query
|
||||||
AttributeSource atts = new AttributeSource();
|
AttributeSource atts = new AttributeSource();
|
||||||
MaxNonCompetitiveBoostAttribute maxBoostAtt =
|
MaxNonCompetitiveBoostAttribute maxBoostAtt =
|
||||||
atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
|
atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
|
||||||
FuzzyTermsEnum fe = new FuzzyTermsEnum(MultiFields.getTerms(reader, startTerm.field()), atts, startTerm, f.minSimilarity, f.prefixLength, false);
|
SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(MultiFields.getTerms(reader, startTerm.field()), atts, startTerm, f.minSimilarity, f.prefixLength);
|
||||||
//store the df so all variants use same idf
|
//store the df so all variants use same idf
|
||||||
int df = reader.docFreq(startTerm);
|
int df = reader.docFreq(startTerm);
|
||||||
int numVariants=0;
|
int numVariants=0;
|
||||||
|
|
|
@ -0,0 +1,204 @@
|
||||||
|
package org.apache.lucene.sandbox.queries;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.SingleTermsEnum;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.index.Terms;
|
||||||
|
import org.apache.lucene.index.TermsEnum;
|
||||||
|
import org.apache.lucene.search.BooleanQuery; // javadocs
|
||||||
|
import org.apache.lucene.search.FuzzyQuery; // javadocs
|
||||||
|
import org.apache.lucene.search.MultiTermQuery;
|
||||||
|
import org.apache.lucene.util.AttributeSource;
|
||||||
|
import org.apache.lucene.util.ToStringUtils;
|
||||||
|
import org.apache.lucene.util.automaton.LevenshteinAutomata;
|
||||||
|
|
||||||
|
/** Implements the classic fuzzy search query. The similarity measurement
|
||||||
|
* is based on the Levenshtein (edit distance) algorithm.
|
||||||
|
* <p>
|
||||||
|
* Note that, unlike {@link FuzzyQuery}, this query will silently allow
|
||||||
|
* for a (possibly huge) number of edit distances in comparisons, and may
|
||||||
|
* be extremely slow (comparing every term in the index).
|
||||||
|
*
|
||||||
|
* @deprecated Use {@link FuzzyQuery} instead.
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
public class SlowFuzzyQuery extends MultiTermQuery {
|
||||||
|
|
||||||
|
public final static float defaultMinSimilarity = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
|
||||||
|
public final static int defaultPrefixLength = 0;
|
||||||
|
public final static int defaultMaxExpansions = 50;
|
||||||
|
|
||||||
|
private float minimumSimilarity;
|
||||||
|
private int prefixLength;
|
||||||
|
private boolean termLongEnough = false;
|
||||||
|
|
||||||
|
protected Term term;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new SlowFuzzyQuery that will match terms with a similarity
|
||||||
|
* of at least <code>minimumSimilarity</code> to <code>term</code>.
|
||||||
|
* If a <code>prefixLength</code> > 0 is specified, a common prefix
|
||||||
|
* of that length is also required.
|
||||||
|
*
|
||||||
|
* @param term the term to search for
|
||||||
|
* @param minimumSimilarity a value between 0 and 1 to set the required similarity
|
||||||
|
* between the query term and the matching terms. For example, for a
|
||||||
|
* <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length
|
||||||
|
* as the query term is considered similar to the query term if the edit distance
|
||||||
|
* between both terms is less than <code>length(term)*0.5</code>
|
||||||
|
* <p>
|
||||||
|
* Alternatively, if <code>minimumSimilarity</code> is >= 1f, it is interpreted
|
||||||
|
* as a pure Levenshtein edit distance. For example, a value of <code>2f</code>
|
||||||
|
* will match all terms within an edit distance of <code>2</code> from the
|
||||||
|
* query term. Edit distances specified in this way may not be fractional.
|
||||||
|
*
|
||||||
|
* @param prefixLength length of common (non-fuzzy) prefix
|
||||||
|
* @param maxExpansions the maximum number of terms to match. If this number is
|
||||||
|
* greater than {@link BooleanQuery#getMaxClauseCount} when the query is rewritten,
|
||||||
|
* then the maxClauseCount will be used instead.
|
||||||
|
* @throws IllegalArgumentException if minimumSimilarity is >= 1 or < 0
|
||||||
|
* or if prefixLength < 0
|
||||||
|
*/
|
||||||
|
public SlowFuzzyQuery(Term term, float minimumSimilarity, int prefixLength,
|
||||||
|
int maxExpansions) {
|
||||||
|
super(term.field());
|
||||||
|
this.term = term;
|
||||||
|
|
||||||
|
if (minimumSimilarity >= 1.0f && minimumSimilarity != (int)minimumSimilarity)
|
||||||
|
throw new IllegalArgumentException("fractional edit distances are not allowed");
|
||||||
|
if (minimumSimilarity < 0.0f)
|
||||||
|
throw new IllegalArgumentException("minimumSimilarity < 0");
|
||||||
|
if (prefixLength < 0)
|
||||||
|
throw new IllegalArgumentException("prefixLength < 0");
|
||||||
|
if (maxExpansions < 0)
|
||||||
|
throw new IllegalArgumentException("maxExpansions < 0");
|
||||||
|
|
||||||
|
setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions));
|
||||||
|
|
||||||
|
String text = term.text();
|
||||||
|
int len = text.codePointCount(0, text.length());
|
||||||
|
if (len > 0 && (minimumSimilarity >= 1f || len > 1.0f / (1.0f - minimumSimilarity))) {
|
||||||
|
this.termLongEnough = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
this.minimumSimilarity = minimumSimilarity;
|
||||||
|
this.prefixLength = prefixLength;
|
||||||
|
}
|
||||||
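The termLongEnough flag computed above decides whether the term can tolerate any edit at all under the requested similarity; if not, the query degrades to an exact single-term match (see getTermsEnum below). The arithmetic, using the constructor's own formula:

// For 0 < minimumSimilarity < 1, a non-exact match requires
//   (1 - minimumSimilarity) * len > 1,  i.e.  len > 1 / (1 - minimumSimilarity)
//   minimumSimilarity = 0.5 -> the term must be longer than 2 code points
//   minimumSimilarity = 0.8 -> the term must be longer than 5 code points
// Values >= 1 are raw edit distances, so any non-empty term qualifies.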
|
|
||||||
|
/**
|
||||||
|
* Calls {@link #SlowFuzzyQuery(Term, float, int, int) SlowFuzzyQuery(term, minimumSimilarity, prefixLength, defaultMaxExpansions)}.
|
||||||
|
*/
|
||||||
|
public SlowFuzzyQuery(Term term, float minimumSimilarity, int prefixLength) {
|
||||||
|
this(term, minimumSimilarity, prefixLength, defaultMaxExpansions);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calls {@link #SlowFuzzyQuery(Term, float, int, int) SlowFuzzyQuery(term, minimumSimilarity, 0, defaultMaxExpansions)}.
|
||||||
|
*/
|
||||||
|
public SlowFuzzyQuery(Term term, float minimumSimilarity) {
|
||||||
|
this(term, minimumSimilarity, defaultPrefixLength, defaultMaxExpansions);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calls {@link #SlowFuzzyQuery(Term, float, int, int) SlowFuzzyQuery(term, defaultMinSimilarity, 0, defaultMaxExpansions)}.
|
||||||
|
*/
|
||||||
|
public SlowFuzzyQuery(Term term) {
|
||||||
|
this(term, defaultMinSimilarity, defaultPrefixLength, defaultMaxExpansions);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the minimum similarity that is required for this query to match.
|
||||||
|
* @return float value between 0.0 and 1.0, or a whole number >= 1 when a raw edit distance was specified
|
||||||
|
*/
|
||||||
|
public float getMinSimilarity() {
|
||||||
|
return minimumSimilarity;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the non-fuzzy prefix length. This is the number of characters at the start
|
||||||
|
* of a term that must be identical (not fuzzy) to the query term if the query
|
||||||
|
* is to match that term.
|
||||||
|
*/
|
||||||
|
public int getPrefixLength() {
|
||||||
|
return prefixLength;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
|
||||||
|
if (!termLongEnough) { // can only match if it's exact
|
||||||
|
return new SingleTermsEnum(terms.iterator(null), term.bytes());
|
||||||
|
}
|
||||||
|
return new SlowFuzzyTermsEnum(terms, atts, getTerm(), minimumSimilarity, prefixLength);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the pattern term.
|
||||||
|
*/
|
||||||
|
public Term getTerm() {
|
||||||
|
return term;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString(String field) {
|
||||||
|
final StringBuilder buffer = new StringBuilder();
|
||||||
|
if (!term.field().equals(field)) {
|
||||||
|
buffer.append(term.field());
|
||||||
|
buffer.append(":");
|
||||||
|
}
|
||||||
|
buffer.append(term.text());
|
||||||
|
buffer.append('~');
|
||||||
|
buffer.append(Float.toString(minimumSimilarity));
|
||||||
|
buffer.append(ToStringUtils.boost(getBoost()));
|
||||||
|
return buffer.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
final int prime = 31;
|
||||||
|
int result = super.hashCode();
|
||||||
|
result = prime * result + Float.floatToIntBits(minimumSimilarity);
|
||||||
|
result = prime * result + prefixLength;
|
||||||
|
result = prime * result + ((term == null) ? 0 : term.hashCode());
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object obj) {
|
||||||
|
if (this == obj)
|
||||||
|
return true;
|
||||||
|
if (!super.equals(obj))
|
||||||
|
return false;
|
||||||
|
if (getClass() != obj.getClass())
|
||||||
|
return false;
|
||||||
|
SlowFuzzyQuery other = (SlowFuzzyQuery) obj;
|
||||||
|
if (Float.floatToIntBits(minimumSimilarity) != Float
|
||||||
|
.floatToIntBits(other.minimumSimilarity))
|
||||||
|
return false;
|
||||||
|
if (prefixLength != other.prefixLength)
|
||||||
|
return false;
|
||||||
|
if (term == null) {
|
||||||
|
if (other.term != null)
|
||||||
|
return false;
|
||||||
|
} else if (!term.equals(other.term))
|
||||||
|
return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
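As a usage sketch: the deprecated class remains the escape hatch for distances beyond 2, at the cost of brute-force term enumeration (the field and term here are illustrative):

import org.apache.lucene.index.Term;
import org.apache.lucene.sandbox.queries.SlowFuzzyQuery;

public class SlowFuzzyExample {
  public static void main(String[] args) {
    // A whole-number value >= 1 is read as a raw Levenshtein distance;
    // anything above 2 cannot use the Levenshtein automaton and will be
    // matched by scanning terms linearly (hence "Slow").
    SlowFuzzyQuery q = new SlowFuzzyQuery(new Term("body", "recognize"), 3f);
    System.out.println(q); // body:recognize~3.0
  }
}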
|
@ -0,0 +1,249 @@
|
||||||
|
package org.apache.lucene.sandbox.queries;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.index.Terms;
|
||||||
|
import org.apache.lucene.index.TermsEnum;
|
||||||
|
import org.apache.lucene.index.FilteredTermsEnum;
|
||||||
|
import org.apache.lucene.search.BoostAttribute;
|
||||||
|
import org.apache.lucene.search.FuzzyTermsEnum;
|
||||||
|
import org.apache.lucene.util.AttributeSource;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.IntsRef;
|
||||||
|
import org.apache.lucene.util.StringHelper;
|
||||||
|
import org.apache.lucene.util.UnicodeUtil;
|
||||||
|
|
||||||
|
/** Classic fuzzy TermsEnum for enumerating all terms that are similar
|
||||||
|
* to the specified filter term.
|
||||||
|
*
|
||||||
|
* <p>Term enumerations are always ordered by
|
||||||
|
* {@link #getComparator}. Each term in the enumeration is
|
||||||
|
* greater than all that precede it.</p>
|
||||||
|
*
|
||||||
|
* @deprecated Use {@link FuzzyTermsEnum} instead.
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
public final class SlowFuzzyTermsEnum extends FuzzyTermsEnum {
|
||||||
|
|
||||||
|
public SlowFuzzyTermsEnum(Terms terms, AttributeSource atts, Term term,
|
||||||
|
float minSimilarity, int prefixLength) throws IOException {
|
||||||
|
super(terms, atts, term, minSimilarity, prefixLength, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void maxEditDistanceChanged(BytesRef lastTerm, int maxEdits, boolean init)
|
||||||
|
throws IOException {
|
||||||
|
TermsEnum newEnum = getAutomatonEnum(maxEdits, lastTerm);
|
||||||
|
if (newEnum != null) {
|
||||||
|
setEnum(newEnum);
|
||||||
|
} else if (init) {
|
||||||
|
setEnum(new LinearFuzzyTermsEnum());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Implement fuzzy enumeration with linear brute force.
|
||||||
|
*/
|
||||||
|
private class LinearFuzzyTermsEnum extends FilteredTermsEnum {
|
||||||
|
/* Allows us to save the time required to create a new array
|
||||||
|
* every time similarity is called.
|
||||||
|
*/
|
||||||
|
private int[] d;
|
||||||
|
private int[] p;
|
||||||
|
|
||||||
|
// this is the text, minus the prefix
|
||||||
|
private final int[] text;
|
||||||
|
|
||||||
|
private final BoostAttribute boostAtt =
|
||||||
|
attributes().addAttribute(BoostAttribute.class);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor for enumeration of all terms which share a prefix of
|
||||||
|
* length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity >
|
||||||
|
* <code>minSimilarity</code>.
|
||||||
|
* <p>
|
||||||
|
* After calling the constructor the enumeration is already pointing to the first
|
||||||
|
* valid term if such a term exists.
|
||||||
|
*
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
public LinearFuzzyTermsEnum() throws IOException {
|
||||||
|
super(terms.iterator(null));
|
||||||
|
|
||||||
|
this.text = new int[termLength - realPrefixLength];
|
||||||
|
System.arraycopy(termText, realPrefixLength, text, 0, text.length);
|
||||||
|
final String prefix = UnicodeUtil.newString(termText, 0, realPrefixLength);
|
||||||
|
prefixBytesRef = new BytesRef(prefix);
|
||||||
|
this.d = new int[this.text.length + 1];
|
||||||
|
this.p = new int[this.text.length + 1];
|
||||||
|
|
||||||
|
setInitialSeekTerm(prefixBytesRef);
|
||||||
|
}
|
||||||
|
|
||||||
|
private final BytesRef prefixBytesRef;
|
||||||
|
// used for unicode conversion from BytesRef byte[] to int[]
|
||||||
|
private final IntsRef utf32 = new IntsRef(20);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The accept method uses Levenshtein distance to
|
||||||
|
* calculate the distance between the given term and the comparing term.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected final AcceptStatus accept(BytesRef term) {
|
||||||
|
if (StringHelper.startsWith(term, prefixBytesRef)) {
|
||||||
|
UnicodeUtil.UTF8toUTF32(term, utf32);
|
||||||
|
final float similarity = similarity(utf32.ints, realPrefixLength, utf32.length - realPrefixLength);
|
||||||
|
if (similarity > minSimilarity) {
|
||||||
|
boostAtt.setBoost((similarity - minSimilarity) * scale_factor);
|
||||||
|
return AcceptStatus.YES;
|
||||||
|
} else return AcceptStatus.NO;
|
||||||
|
} else {
|
||||||
|
return AcceptStatus.END;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/******************************
|
||||||
|
* Compute Levenshtein distance
|
||||||
|
******************************/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* <p>Similarity returns a number that is 1.0f or less (including negative numbers)
|
||||||
|
* based on how similar the Term is compared to a target term. It returns
|
||||||
|
* exactly 0.0f when
|
||||||
|
* <pre>
|
||||||
|
* editDistance > maximumEditDistance</pre>
|
||||||
|
* Otherwise it returns:
|
||||||
|
* <pre>
|
||||||
|
* 1 - (editDistance / length)</pre>
|
||||||
|
* where length is the length of the shortest term (text or target) including a
|
||||||
|
* prefix that is identical, and editDistance is the Levenshtein distance for
|
||||||
|
* the two words.</p>
|
||||||
|
*
|
||||||
|
* <p>Embedded within this algorithm is a fail-fast Levenshtein distance
|
||||||
|
* algorithm. The fail-fast algorithm differs from the standard Levenshtein
|
||||||
|
* distance algorithm in that it is aborted if it is discovered that the
|
||||||
|
* minimum distance between the words is greater than some threshold.
|
||||||
|
*
|
||||||
|
* <p>To calculate the maximum distance threshold we use the following formula:
|
||||||
|
* <pre>
|
||||||
|
* (1 - minimumSimilarity) * length</pre>
|
||||||
|
* where length is the shortest term including any prefix that is not part of the
|
||||||
|
* similarity comparison. This formula was derived by solving for what maximum value
|
||||||
|
* of distance returns false for the following statements:
|
||||||
|
* <pre>
|
||||||
|
* similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
|
||||||
|
* return (similarity > minimumSimilarity);</pre>
|
||||||
|
* where distance is the Levenshtein distance for the two words.
|
||||||
|
* </p>
|
||||||
|
* <p>Levenshtein distance (also known as edit distance) is a measure of similarity
|
||||||
|
* between two strings where the distance is measured as the number of character
|
||||||
|
* deletions, insertions or substitutions required to transform one string to
|
||||||
|
* the other string.
|
||||||
|
* @param target the target word or phrase
|
||||||
|
* @return the similarity, 0.0 or less indicates that it matches less than the required
|
||||||
|
* threshold and 1.0 indicates that the text and target are identical
|
||||||
|
*/
|
||||||
|
private final float similarity(final int[] target, int offset, int length) {
|
||||||
|
final int m = length;
|
||||||
|
final int n = text.length;
|
||||||
|
if (n == 0) {
|
||||||
|
//we don't have anything to compare. That means if we just add
|
||||||
|
//the letters for m we get the new word
|
||||||
|
return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) m / realPrefixLength);
|
||||||
|
}
|
||||||
|
if (m == 0) {
|
||||||
|
return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) n / realPrefixLength);
|
||||||
|
}
|
||||||
|
|
||||||
|
final int maxDistance = calculateMaxDistance(m);
|
||||||
|
|
||||||
|
if (maxDistance < Math.abs(m-n)) {
|
||||||
|
//just adding the characters of m to n or vice-versa results in
|
||||||
|
//too many edits
|
||||||
|
//for example "pre" length is 3 and "prefixes" length is 8. We can see that
|
||||||
|
//given this optimal circumstance, the edit distance cannot be less than 5.
|
||||||
|
//which is 8-3 or more precisely Math.abs(3-8).
|
||||||
|
//if our maximum edit distance is 4, then we can discard this word
|
||||||
|
//without looking at it.
|
||||||
|
return Float.NEGATIVE_INFINITY;
|
||||||
|
}
|
||||||
|
|
||||||
|
// init matrix d
|
||||||
|
for (int i = 0; i <= n; ++i) {
|
||||||
|
p[i] = i;
|
||||||
|
}
|
||||||
|
|
||||||
|
// start computing edit distance
|
||||||
|
for (int j = 1; j <= m; ++j) { // iterates through target
|
||||||
|
int bestPossibleEditDistance = m;
|
||||||
|
final int t_j = target[offset+j-1]; // jth character of t
|
||||||
|
d[0] = j;
|
||||||
|
|
||||||
|
for (int i = 1; i <= n; ++i) { // iterates through text
|
||||||
|
// minimum of cell to the left+1, to the top+1, diagonally left and up +(0|1)
|
||||||
|
if (t_j != text[i-1]) {
|
||||||
|
d[i] = Math.min(Math.min(d[i-1], p[i]), p[i-1]) + 1;
|
||||||
|
} else {
|
||||||
|
d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]);
|
||||||
|
}
|
||||||
|
bestPossibleEditDistance = Math.min(bestPossibleEditDistance, d[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
//After calculating row i, the best possible edit distance
|
||||||
|
//can be found by finding the smallest value in a given column.
|
||||||
|
//If the bestPossibleEditDistance is greater than the max distance, abort.
|
||||||
|
|
||||||
|
if (j > maxDistance && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater
|
||||||
|
//the closest the target can be to the text is just too far away.
|
||||||
|
//this target is leaving the party early.
|
||||||
|
return Float.NEGATIVE_INFINITY;
|
||||||
|
}
|
||||||
|
|
||||||
|
// copy current distance counts to 'previous row' distance counts: swap p and d
|
||||||
|
int[] _d = p;
|
||||||
|
p = d;
|
||||||
|
d = _d;
|
||||||
|
}
|
||||||
|
|
||||||
|
// our last action in the above loop was to switch d and p, so p now
|
||||||
|
// actually has the most recent cost counts
|
||||||
|
|
||||||
|
// this will return less than 0.0 when the edit distance is
|
||||||
|
// greater than the number of characters in the shorter word.
|
||||||
|
// but this was the formula that was previously used in FuzzyTermEnum,
|
||||||
|
// so it has not been changed (even though minimumSimilarity must be
|
||||||
|
// greater than 0.0)
|
||||||
|
return 1.0f - ((float)p[n] / (float) (realPrefixLength + Math.min(n, m)));
|
||||||
|
}
|
||||||
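A worked instance of the returned score, assuming no common prefix (realPrefixLength = 0):

// text = "roam", target = "foam":  distance 1, similarity = 1 - 1/min(4,4) = 0.75
// text = "roam", target = "foams": distance 2, similarity = 1 - 2/min(4,5) = 0.5
// Any distance exceeding the computed maximum short-circuits to
// Float.NEGATIVE_INFINITY before the full matrix is evaluated.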
|
|
||||||
|
/**
|
||||||
|
* The max distance is the maximum Levenshtein distance for the text
|
||||||
|
* compared to some other value that results in a score that is
|
||||||
|
* better than the minimum similarity.
|
||||||
|
* @param m the length of the "other value"
|
||||||
|
* @return the maximum Levenshtein distance that we care about
|
||||||
|
*/
|
||||||
|
private int calculateMaxDistance(int m) {
|
||||||
|
return raw ? maxEdits : Math.min(maxEdits,
|
||||||
|
(int)((1-minSimilarity) * (Math.min(text.length, m) + realPrefixLength)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,468 @@
|
||||||
|
package org.apache.lucene.sandbox.queries;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.TextField;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.MultiReader;
|
||||||
|
import org.apache.lucene.index.RandomIndexWriter;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
import org.apache.lucene.search.MultiTermQuery;
|
||||||
|
import org.apache.lucene.search.Query;
|
||||||
|
import org.apache.lucene.search.ScoreDoc;
|
||||||
|
import org.apache.lucene.search.TopDocs;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests {@link SlowFuzzyQuery}.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class TestSlowFuzzyQuery extends LuceneTestCase {
|
||||||
|
|
||||||
|
public void testFuzziness() throws Exception {
|
||||||
|
Directory directory = newDirectory();
|
||||||
|
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
|
||||||
|
addDoc("aaaaa", writer);
|
||||||
|
addDoc("aaaab", writer);
|
||||||
|
addDoc("aaabb", writer);
|
||||||
|
addDoc("aabbb", writer);
|
||||||
|
addDoc("abbbb", writer);
|
||||||
|
addDoc("bbbbb", writer);
|
||||||
|
addDoc("ddddd", writer);
|
||||||
|
|
||||||
|
IndexReader reader = writer.getReader();
|
||||||
|
IndexSearcher searcher = newSearcher(reader);
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
SlowFuzzyQuery query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 0);
|
||||||
|
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(3, hits.length);
|
||||||
|
|
||||||
|
// same with prefix
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 1);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(3, hits.length);
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 2);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(3, hits.length);
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 3);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(3, hits.length);
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 4);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(2, hits.length);
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 5);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(1, hits.length);
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 6);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(1, hits.length);
|
||||||
|
|
||||||
|
// test scoring
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "bbbbb"), SlowFuzzyQuery.defaultMinSimilarity, 0);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals("3 documents should match", 3, hits.length);
|
||||||
|
List<String> order = Arrays.asList("bbbbb","abbbb","aabbb");
|
||||||
|
for (int i = 0; i < hits.length; i++) {
|
||||||
|
final String term = searcher.doc(hits[i].doc).get("field");
|
||||||
|
//System.out.println(hits[i].score);
|
||||||
|
assertEquals(order.get(i), term);
|
||||||
|
}
|
||||||
|
|
||||||
|
// test pq size by supplying maxExpansions=2
|
||||||
|
// This query would normally return 3 documents, because 3 terms match (see above):
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "bbbbb"), SlowFuzzyQuery.defaultMinSimilarity, 0, 2);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals("only 2 documents should match", 2, hits.length);
|
||||||
|
order = Arrays.asList("bbbbb","abbbb");
|
||||||
|
for (int i = 0; i < hits.length; i++) {
|
||||||
|
final String term = searcher.doc(hits[i].doc).get("field");
|
||||||
|
//System.out.println(hits[i].score);
|
||||||
|
assertEquals(order.get(i), term);
|
||||||
|
}
|
||||||
|
|
||||||
|
// not similar enough:
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "xxxxx"), SlowFuzzyQuery.defaultMinSimilarity, 0);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(0, hits.length);
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "aaccc"), SlowFuzzyQuery.defaultMinSimilarity, 0); // edit distance to "aaaaa" = 3
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(0, hits.length);
|
||||||
|
|
||||||
|
// query identical to a word in the index:
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 0);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(3, hits.length);
|
||||||
|
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
|
||||||
|
// default allows for up to two edits:
|
||||||
|
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
|
||||||
|
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
|
||||||
|
|
||||||
|
// query similar to a word in the index:
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 0);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(3, hits.length);
|
||||||
|
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
|
||||||
|
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
|
||||||
|
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
|
||||||
|
|
||||||
|
// now with prefix
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 1);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(3, hits.length);
|
||||||
|
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
|
||||||
|
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
|
||||||
|
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 2);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(3, hits.length);
|
||||||
|
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
|
||||||
|
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
|
||||||
|
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 3);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(3, hits.length);
|
||||||
|
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
|
||||||
|
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
|
||||||
|
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 4);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(2, hits.length);
|
||||||
|
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
|
||||||
|
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 5);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(0, hits.length);
|
||||||
|
|
||||||
|
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 0);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(1, hits.length);
|
||||||
|
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
|
||||||
|
|
||||||
|
// now with prefix
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 1);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(1, hits.length);
|
||||||
|
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 2);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(1, hits.length);
|
||||||
|
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 3);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(1, hits.length);
|
||||||
|
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 4);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(1, hits.length);
|
||||||
|
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 5);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(0, hits.length);
|
||||||
|
|
||||||
|
|
||||||
|
// different field = no match:
|
||||||
|
query = new SlowFuzzyQuery(new Term("anotherfield", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 0);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(0, hits.length);
|
||||||
|
|
||||||
|
reader.close();
|
||||||
|
directory.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testFuzzinessLong() throws Exception {
|
||||||
|
Directory directory = newDirectory();
|
||||||
|
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
|
||||||
|
addDoc("aaaaaaa", writer);
|
||||||
|
addDoc("segment", writer);
|
||||||
|
|
||||||
|
IndexReader reader = writer.getReader();
|
||||||
|
IndexSearcher searcher = newSearcher(reader);
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
SlowFuzzyQuery query;
|
||||||
|
// not similar enough:
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "xxxxx"), 0.5f, 0);
|
||||||
|
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(0, hits.length);
|
||||||
|
// edit distance to "aaaaaaa" = 3, this matches because the string is longer than
|
||||||
|
// in testDefaultFuzziness so a bigger difference is allowed:
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 0);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(1, hits.length);
|
||||||
|
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa"));
|
||||||
|
|
||||||
|
// now with prefix
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 1);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(1, hits.length);
|
||||||
|
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa"));
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 4);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(1, hits.length);
|
||||||
|
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa"));
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 5);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(0, hits.length);
|
||||||
|
|
||||||
|
// no match, more than half of the characters are wrong:
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "aaacccc"), 0.5f, 0);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(0, hits.length);
|
||||||
|
|
||||||
|
// now with prefix
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "aaacccc"), 0.5f, 2);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(0, hits.length);
|
||||||
|
|
||||||
|
// "student" and "stellent" are indeed similar to "segment" by default:
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "student"), 0.5f, 0);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(1, hits.length);
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "stellent"), 0.5f, 0);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(1, hits.length);
|
||||||
|
|
||||||
|
// now with prefix
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "student"), 0.5f, 1);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(1, hits.length);
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "stellent"), 0.5f, 1);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(1, hits.length);
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "student"), 0.5f, 2);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(0, hits.length);
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "stellent"), 0.5f, 2);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(0, hits.length);
|
||||||
|
|
||||||
|
// "student" doesn't match anymore thanks to increased minimum similarity:
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "student"), 0.6f, 0);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(0, hits.length);
|
||||||
|
|
||||||
|
try {
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "student"), 1.1f);
|
||||||
|
fail("Expected IllegalArgumentException");
|
||||||
|
} catch (IllegalArgumentException e) {
|
||||||
|
// expecting exception
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "student"), -0.1f);
|
||||||
|
fail("Expected IllegalArgumentException");
|
||||||
|
} catch (IllegalArgumentException e) {
|
||||||
|
// expecting exception
|
||||||
|
}
|
||||||
|
|
||||||
|
reader.close();
|
||||||
|
directory.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
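
  // [Illustrative sketch, not part of the original test] The assertions above rely on
  // SlowFuzzyQuery's length-scaled threshold, commonly described as
  // similarity = 1 - editDistance / min(|query|, |term|); treat the exact formula as an
  // assumption here. Worked through: "aaaaccc" vs "aaaaaaa" has distance 3, so
  // 1 - 3/7 ~= 0.571 >= 0.5 (match), while "aaacccc" has distance 4, so
  // 1 - 4/7 ~= 0.429 < 0.5 (no match); "student" vs "segment" has distance 3, which
  // clears the 0.5 floor but not the 0.6 floor used at the end.
  private static float sketchScaledSimilarity(int editDistance, String query, String term) {
    return 1f - (float) editDistance / Math.min(query.length(), term.length());
  }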
  /**
   * MultiTermQuery provides (via attribute) information about which values
   * must be competitive to enter the priority queue.
   *
   * SlowFuzzyQuery optimizes itself around this information; if the attribute
   * is not implemented correctly, there will be problems!
   */
  public void testTieBreaker() throws Exception {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
    addDoc("a123456", writer);
    addDoc("c123456", writer);
    addDoc("d123456", writer);
    addDoc("e123456", writer);

    Directory directory2 = newDirectory();
    RandomIndexWriter writer2 = new RandomIndexWriter(random(), directory2);
    addDoc("a123456", writer2);
    addDoc("b123456", writer2);
    addDoc("b123456", writer2);
    addDoc("b123456", writer2);
    addDoc("c123456", writer2);
    addDoc("f123456", writer2);

    IndexReader ir1 = writer.getReader();
    IndexReader ir2 = writer2.getReader();

    MultiReader mr = new MultiReader(ir1, ir2);
    IndexSearcher searcher = newSearcher(mr);
    SlowFuzzyQuery fq = new SlowFuzzyQuery(new Term("field", "z123456"), 1f, 0, 2);
    TopDocs docs = searcher.search(fq, 2);
    assertEquals(5, docs.totalHits); // 5 docs, from the a's and b's
    mr.close();
    ir1.close();
    ir2.close();
    writer.close();
    writer2.close();
    directory.close();
    directory2.close();
  }
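
  // [Illustrative sketch, not part of the original test] The attribute the javadoc above
  // refers to is MaxNonCompetitiveBoostAttribute (org.apache.lucene.search). A top-terms
  // rewrite publishes the boost of the weakest entry in its priority queue through it, so
  // a cooperating fuzzy terms enumeration can skip terms that cannot possibly compete.
  // The helper below sketches the assumed shape of that handshake; it is not the rewrite
  // method's own implementation:
  private static void sketchCompetitiveFeedback(org.apache.lucene.index.TermsEnum termsEnum,
      float worstBoostInQueue) {
    org.apache.lucene.search.MaxNonCompetitiveBoostAttribute maxBoostAtt = termsEnum
        .attributes().addAttribute(org.apache.lucene.search.MaxNonCompetitiveBoostAttribute.class);
    // anything at or below this boost can never enter the queue, so the enum may skip it:
    maxBoostAtt.setMaxNonCompetitiveBoost(worstBoostInQueue);
  }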
  public void testTokenLengthOpt() throws IOException {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
    addDoc("12345678911", writer);
    addDoc("segment", writer);

    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);
    writer.close();

    Query query;
    // term not over 10 chars, so the optimization shortcuts
    query = new SlowFuzzyQuery(new Term("field", "1234569"), 0.9f);
    ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(0, hits.length);

    // 10 chars, so no optimization
    query = new SlowFuzzyQuery(new Term("field", "1234567891"), 0.9f);
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(0, hits.length);

    // over 10 chars, so no optimization
    query = new SlowFuzzyQuery(new Term("field", "12345678911"), 0.9f);
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(1, hits.length);

    // over 10 chars, no match
    query = new SlowFuzzyQuery(new Term("field", "sdfsdfsdfsdf"), 0.9f);
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(0, hits.length);

    reader.close();
    directory.close();
  }
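
  // [Illustrative, assumed formula] Why 10 characters is the cutoff above: under the
  // length-scaled threshold, a single edit at length L yields similarity 1 - 1/L, so a
  // non-identical term can only approach a 0.9 floor when min(|query|, |term|) reaches 10
  // (1 - 1/9 ~= 0.889 falls short). Shorter query terms can therefore shortcut to
  // exact-match-only enumeration; whether the boundary comparison is strict is treated as
  // an implementation detail here. A hypothetical helper making the arithmetic explicit:
  private static int sketchMinLengthForOneEdit(float minSimilarity) {
    // smallest length at which one edit can still meet the similarity floor; 0.9f -> 10
    return (int) Math.ceil(1f / (1f - minSimilarity));
  }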
  /** Test the TopTermsBoostOnlyBooleanQueryRewrite rewrite method. */
  public void testBoostOnlyRewrite() throws Exception {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
    addDoc("Lucene", writer);
    addDoc("Lucene", writer);
    addDoc("Lucenne", writer);

    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);
    writer.close();

    SlowFuzzyQuery query = new SlowFuzzyQuery(new Term("field", "lucene"));
    query.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(50));
    ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(3, hits.length);
    // normally, 'Lucenne' would be the first result as IDF will skew the score.
    assertEquals("Lucene", reader.document(hits[0].doc).get("field"));
    assertEquals("Lucene", reader.document(hits[1].doc).get("field"));
    assertEquals("Lucenne", reader.document(hits[2].doc).get("field"));
    reader.close();
    directory.close();
  }
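
  // [Sketch, not part of the original test] The boost-only rewrite used above differs
  // from the default top-terms scoring rewrite in that each collected term is scored
  // only by its constant fuzzy-match boost, so the rarer 'Lucenne' cannot ride its
  // higher IDF past the closer 'Lucene' matches. A hypothetical helper contrasting
  // the two setups:
  private static SlowFuzzyQuery sketchWithRewrite(Term term, boolean boostOnly, int pqSize) {
    SlowFuzzyQuery q = new SlowFuzzyQuery(term);
    q.setRewriteMethod(boostOnly
        ? new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(pqSize)
        : new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(pqSize));
    return q;
  }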
  public void testGiga() throws Exception {
    MockAnalyzer analyzer = new MockAnalyzer(random());
    Directory index = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), index);

    addDoc("Lucene in Action", w);
    addDoc("Lucene for Dummies", w);

    //addDoc("Giga", w);
    addDoc("Giga byte", w);

    addDoc("ManagingGigabytesManagingGigabyte", w);
    addDoc("ManagingGigabytesManagingGigabytes", w);

    addDoc("The Art of Computer Science", w);
    addDoc("J. K. Rowling", w);
    addDoc("JK Rowling", w);
    addDoc("Joanne K Roling", w);
    addDoc("Bruce Willis", w);
    addDoc("Willis bruce", w);
    addDoc("Brute willis", w);
    addDoc("B. willis", w);
    IndexReader r = w.getReader();
    w.close();

    Query q = new SlowFuzzyQuery(new Term("field", "giga"), 0.9f);

    // 3. search
    IndexSearcher searcher = newSearcher(r);
    ScoreDoc[] hits = searcher.search(q, 10).scoreDocs;
    assertEquals(1, hits.length);
    assertEquals("Giga byte", searcher.doc(hits[0].doc).get("field"));
    r.close();
    index.close();
  }
  public void testDistanceAsEditsSearching() throws Exception {
    Directory index = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), index);
    addDoc("foobar", w);
    addDoc("test", w);
    addDoc("working", w);
    IndexReader reader = w.getReader();
    IndexSearcher searcher = newSearcher(reader);
    w.close();

    SlowFuzzyQuery q = new SlowFuzzyQuery(new Term("field", "fouba"), 2);
    ScoreDoc[] hits = searcher.search(q, 10).scoreDocs;
    assertEquals(1, hits.length);
    assertEquals("foobar", searcher.doc(hits[0].doc).get("field"));

    q = new SlowFuzzyQuery(new Term("field", "foubara"), 2);
    hits = searcher.search(q, 10).scoreDocs;
    assertEquals(1, hits.length);
    assertEquals("foobar", searcher.doc(hits[0].doc).get("field"));

    q = new SlowFuzzyQuery(new Term("field", "t"), 3);
    hits = searcher.search(q, 10).scoreDocs;
    assertEquals(1, hits.length);
    assertEquals("test", searcher.doc(hits[0].doc).get("field"));

    q = new SlowFuzzyQuery(new Term("field", "a"), 4f, 0, 50);
    hits = searcher.search(q, 10).scoreDocs;
    assertEquals(1, hits.length);
    assertEquals("test", searcher.doc(hits[0].doc).get("field"));

    q = new SlowFuzzyQuery(new Term("field", "a"), 6f, 0, 50);
    hits = searcher.search(q, 10).scoreDocs;
    assertEquals(2, hits.length);
    assertEquals("test", searcher.doc(hits[0].doc).get("field"));
    assertEquals("foobar", searcher.doc(hits[1].doc).get("field"));

    reader.close();
    index.close();
  }
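
  // [Illustrative helper, not part of the original test] A plain two-row Levenshtein
  // implementation to double-check the raw distances the assertions above rely on:
  // dist("fouba","foobar") = 2, dist("foubara","foobar") = 2, dist("t","test") = 3,
  // dist("a","test") = 4, and dist("a","foobar") = 5 (hence 4f misses it but 6f finds it).
  private static int sketchLevenshtein(String s, String t) {
    int[] prev = new int[t.length() + 1];
    int[] cur = new int[t.length() + 1];
    for (int j = 0; j <= t.length(); j++) prev[j] = j; // distance from the empty prefix
    for (int i = 1; i <= s.length(); i++) {
      cur[0] = i;
      for (int j = 1; j <= t.length(); j++) {
        int cost = s.charAt(i - 1) == t.charAt(j - 1) ? 0 : 1;
        cur[j] = Math.min(Math.min(cur[j - 1] + 1,   // insertion
                                   prev[j] + 1),     // deletion
                          prev[j - 1] + cost);       // substitution / match
      }
      int[] tmp = prev; prev = cur; cur = tmp;
    }
    return prev[t.length()];
  }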
  private void addDoc(String text, RandomIndexWriter writer) throws IOException {
    Document doc = new Document();
    doc.add(newField("field", text, TextField.TYPE_STORED));
    writer.addDocument(doc);
  }
}
@@ -1,4 +1,4 @@
-package org.apache.lucene.search;
+package org.apache.lucene.sandbox.queries;

 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -29,6 +29,9 @@ import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.similarities.DefaultSimilarity;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
@@ -55,7 +58,7 @@ import org.apache.lucene.util.LuceneTestCase;
  *
  * results line: comma-separated docID, score pair
  **/
-public class TestFuzzyQuery2 extends LuceneTestCase {
+public class TestSlowFuzzyQuery2 extends LuceneTestCase {
   /** epsilon for score comparisons */
   static final float epsilon = 0.00001f;

@@ -115,7 +118,7 @@ public class TestFuzzyQuery2 extends LuceneTestCase {
     int prefix = Integer.parseInt(params[1]);
     int pqSize = Integer.parseInt(params[2]);
     float minScore = Float.parseFloat(params[3]);
-    FuzzyQuery q = new FuzzyQuery(new Term("field", query), minScore, prefix);
+    SlowFuzzyQuery q = new SlowFuzzyQuery(new Term("field", query), minScore, prefix);
     q.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(pqSize));
     int expectedResults = Integer.parseInt(reader.readLine());
     TopDocs docs = searcher.search(q, expectedResults);