LUCENE-4024: FuzzyQuery should never do edit distance > 2

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1334819 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-05-06 23:08:12 +00:00
parent 4c151d54e0
commit 8f7c1da3ba
21 changed files with 1134 additions and 503 deletions

View File

@ -28,7 +28,7 @@ import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.automaton.LevenshteinAutomata; import org.apache.lucene.util.automaton.LevenshteinAutomata;
/** Implements the fuzzy search query. The similarity measurement /** Implements the fuzzy search query. The similarity measurement
* is based on the Levenshtein (edit distance) algorithm. * is based on the Damerau-Levenshtein (optimal string alignment) algorithm.
* *
* <p>This query uses {@link MultiTermQuery.TopTermsScoringBooleanQueryRewrite} * <p>This query uses {@link MultiTermQuery.TopTermsScoringBooleanQueryRewrite}
* as default. So terms will be collected and scored according to their * as default. So terms will be collected and scored according to their
@ -37,94 +37,81 @@ import org.apache.lucene.util.automaton.LevenshteinAutomata;
*/ */
public class FuzzyQuery extends MultiTermQuery { public class FuzzyQuery extends MultiTermQuery {
public final static float defaultMinSimilarity = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE; public final static int defaultMaxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
public final static int defaultPrefixLength = 0; public final static int defaultPrefixLength = 0;
public final static int defaultMaxExpansions = 50; public final static int defaultMaxExpansions = 50;
public final static boolean defaultTranspositions = true;
private float minimumSimilarity; private final int maxEdits;
private int prefixLength; private final int maxExpansions;
private boolean termLongEnough = false; private final boolean transpositions;
private final int prefixLength;
protected Term term; private final Term term;
/** /**
* Create a new FuzzyQuery that will match terms with a similarity * Create a new FuzzyQuery that will match terms with an edit distance
* of at least <code>minimumSimilarity</code> to <code>term</code>. * of at most <code>maxEdits</code> to <code>term</code>.
* If a <code>prefixLength</code> &gt; 0 is specified, a common prefix * If a <code>prefixLength</code> &gt; 0 is specified, a common prefix
* of that length is also required. * of that length is also required.
* *
* @param term the term to search for * @param term the term to search for
* @param minimumSimilarity a value between 0 and 1 to set the required similarity * @param maxEdits must be >= 0 and <= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}.
* between the query term and the matching terms. For example, for a
* <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length
* as the query term is considered similar to the query term if the edit distance
* between both terms is less than <code>length(term)*0.5</code>
* <p>
* Alternatively, if <code>minimumSimilarity</code> is >= 1f, it is interpreted
* as a pure Levenshtein edit distance. For example, a value of <code>2f</code>
* will match all terms within an edit distance of <code>2</code> from the
* query term. Edit distances specified in this way may not be fractional.
*
* @param prefixLength length of common (non-fuzzy) prefix * @param prefixLength length of common (non-fuzzy) prefix
* @param maxExpansions the maximum number of terms to match. If this number is * @param maxExpansions the maximum number of terms to match. If this number is
* greater than {@link BooleanQuery#getMaxClauseCount} when the query is rewritten, * greater than {@link BooleanQuery#getMaxClauseCount} when the query is rewritten,
* then the maxClauseCount will be used instead. * then the maxClauseCount will be used instead.
* @throws IllegalArgumentException if minimumSimilarity is &gt;= 1 or &lt; 0 * @param transpositions true if transpositions should be treated as a primitive
* or if prefixLength &lt; 0 * edit operation. If this is false, comparisons will implement the classic
* Levenshtein algorithm.
*/ */
public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength, public FuzzyQuery(Term term, int maxEdits, int prefixLength, int maxExpansions, boolean transpositions) {
int maxExpansions) {
super(term.field()); super(term.field());
this.term = term;
if (minimumSimilarity >= 1.0f && minimumSimilarity != (int)minimumSimilarity) if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
throw new IllegalArgumentException("fractional edit distances are not allowed"); throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
if (minimumSimilarity < 0.0f) }
throw new IllegalArgumentException("minimumSimilarity < 0"); if (prefixLength < 0) {
if (prefixLength < 0) throw new IllegalArgumentException("prefixLength cannot be negative.");
throw new IllegalArgumentException("prefixLength < 0"); }
if (maxExpansions < 0) if (maxExpansions < 0) {
throw new IllegalArgumentException("maxExpansions < 0"); throw new IllegalArgumentException("maxExpansions cannot be negative.");
setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions));
String text = term.text();
int len = text.codePointCount(0, text.length());
if (len > 0 && (minimumSimilarity >= 1f || len > 1.0f / (1.0f - minimumSimilarity))) {
this.termLongEnough = true;
} }
this.minimumSimilarity = minimumSimilarity; this.term = term;
this.maxEdits = maxEdits;
this.prefixLength = prefixLength; this.prefixLength = prefixLength;
this.transpositions = transpositions;
this.maxExpansions = maxExpansions;
setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions));
} }
/** /**
* Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, prefixLength, defaultMaxExpansions)}. * Calls {@link #FuzzyQuery(Term, int, int, int, boolean)
* FuzzyQuery(term, maxEdits, prefixLength, defaultMaxExpansions, defaultTranspositions)}.
*/ */
public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) { public FuzzyQuery(Term term, int maxEdits, int prefixLength) {
this(term, minimumSimilarity, prefixLength, defaultMaxExpansions); this(term, maxEdits, prefixLength, defaultMaxExpansions, defaultTranspositions);
} }
/** /**
* Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0, defaultMaxExpansions)}. * Calls {@link #FuzzyQuery(Term, int, int) FuzzyQuery(term, maxEdits, defaultPrefixLength)}.
*/ */
public FuzzyQuery(Term term, float minimumSimilarity) { public FuzzyQuery(Term term, int maxEdits) {
this(term, minimumSimilarity, defaultPrefixLength, defaultMaxExpansions); this(term, maxEdits, defaultPrefixLength);
} }
/** /**
* Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, defaultMinSimilarity, 0, defaultMaxExpansions)}. * Calls {@link #FuzzyQuery(Term, int) FuzzyQuery(term, defaultMaxEdits)}.
*/ */
public FuzzyQuery(Term term) { public FuzzyQuery(Term term) {
this(term, defaultMinSimilarity, defaultPrefixLength, defaultMaxExpansions); this(term, defaultMaxEdits);
} }
/** /**
* Returns the minimum similarity that is required for this query to match. * @return the maximum edit distance allowed for this query to match.
* @return float value between 0.0 and 1.0
*/ */
public float getMinSimilarity() { public int getMaxEdits() {
return minimumSimilarity; return maxEdits;
} }
/** /**
@ -138,13 +125,10 @@ public class FuzzyQuery extends MultiTermQuery {
@Override @Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
if (!termLongEnough) { // can only match if it's exact if (maxEdits == 0 || prefixLength >= term.text().length()) { // can only match if it's exact
return new SingleTermsEnum(terms.iterator(null), term.bytes()); return new SingleTermsEnum(terms.iterator(null), term.bytes());
} }
// TODO: should we expose the transpositions option to this query? return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits, prefixLength, transpositions);
// maybe move the old/slowish stuff (lev without transpositions, n > 2, etc) all to contrib,
// deprecate it, and just have a faster/simpler/better one in core?
return new FuzzyTermsEnum(terms, atts, getTerm(), minimumSimilarity, prefixLength, false);
} }
/** /**
@ -163,7 +147,7 @@ public class FuzzyQuery extends MultiTermQuery {
} }
buffer.append(term.text()); buffer.append(term.text());
buffer.append('~'); buffer.append('~');
buffer.append(Float.toString(minimumSimilarity)); buffer.append(Integer.toString(maxEdits));
buffer.append(ToStringUtils.boost(getBoost())); buffer.append(ToStringUtils.boost(getBoost()));
return buffer.toString(); return buffer.toString();
} }
@ -172,8 +156,10 @@ public class FuzzyQuery extends MultiTermQuery {
public int hashCode() { public int hashCode() {
final int prime = 31; final int prime = 31;
int result = super.hashCode(); int result = super.hashCode();
result = prime * result + Float.floatToIntBits(minimumSimilarity); result = prime * result + maxEdits;
result = prime * result + prefixLength; result = prime * result + prefixLength;
result = prime * result + maxExpansions;
result = prime * result + (transpositions ? 0 : 1);
result = prime * result + ((term == null) ? 0 : term.hashCode()); result = prime * result + ((term == null) ? 0 : term.hashCode());
return result; return result;
} }
@ -187,11 +173,14 @@ public class FuzzyQuery extends MultiTermQuery {
if (getClass() != obj.getClass()) if (getClass() != obj.getClass())
return false; return false;
FuzzyQuery other = (FuzzyQuery) obj; FuzzyQuery other = (FuzzyQuery) obj;
if (Float.floatToIntBits(minimumSimilarity) != Float if (maxEdits != other.maxEdits)
.floatToIntBits(other.minimumSimilarity))
return false; return false;
if (prefixLength != other.prefixLength) if (prefixLength != other.prefixLength)
return false; return false;
if (maxExpansions != other.maxExpansions)
return false;
if (transpositions != other.transpositions)
return false;
if (term == null) { if (term == null) {
if (other.term != null) if (other.term != null)
return false; return false;
@ -199,6 +188,31 @@ public class FuzzyQuery extends MultiTermQuery {
return false; return false;
return true; return true;
} }
/**
* @deprecated pass integer edit distances instead.
*/
@Deprecated
public final static float defaultMinSimilarity = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
/**
 * Helper function to convert from deprecated "minimumSimilarity" fractions
 * to raw edit distances.
 * <p>
 * The conversion follows the legacy <code>minimumSimilarity</code> contract:
 * <ul>
 *   <li>a value &gt;= 1 is interpreted as a raw edit distance; it is truncated
 *       to an integer and capped at
 *       {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}. In particular
 *       <code>1f</code> means one edit, not zero.</li>
 *   <li>a value of exactly <code>0</code> yields <code>0</code> edits
 *       (exact match), not an unbounded number of edits.</li>
 *   <li>any other value is scaled by the term length as
 *       <code>(1 - minimumSimilarity) * termLen</code>, truncated to an
 *       integer and capped at
 *       {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}.</li>
 * </ul>
 *
 * @param minimumSimilarity scaled similarity
 * @param termLen length (in unicode codepoints) of the term.
 * @return equivalent number of maxEdits
 * @deprecated pass integer edit distances instead.
 */
@Deprecated
public static int floatToEdits(float minimumSimilarity, int termLen) {
  // ">=" (not ">"): 1f is a raw edit distance of one. With a strict comparison it
  // would fall into the fractional branch and collapse to (1 - 1) * termLen == 0.
  if (minimumSimilarity >= 1f) {
    return (int) Math.min(minimumSimilarity, LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
  } else if (minimumSimilarity == 0.0f) {
    return 0; // 0 means exact, not infinite # of edits!
  } else {
    return Math.min((int) ((1D-minimumSimilarity) * termLen),
        LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
  }
}
} }

View File

@ -34,8 +34,6 @@ import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata; import org.apache.lucene.util.automaton.BasicAutomata;
@ -51,7 +49,7 @@ import org.apache.lucene.util.automaton.LevenshteinAutomata;
* {@link #getComparator}. Each term in the enumeration is * {@link #getComparator}. Each term in the enumeration is
* greater than all that precede it.</p> * greater than all that precede it.</p>
*/ */
public final class FuzzyTermsEnum extends TermsEnum { public class FuzzyTermsEnum extends TermsEnum {
private TermsEnum actualEnum; private TermsEnum actualEnum;
private BoostAttribute actualBoostAtt; private BoostAttribute actualBoostAtt;
@ -67,18 +65,18 @@ public final class FuzzyTermsEnum extends TermsEnum {
// TODO: chicken-and-egg // TODO: chicken-and-egg
private final Comparator<BytesRef> termComparator = BytesRef.getUTF8SortedAsUnicodeComparator(); private final Comparator<BytesRef> termComparator = BytesRef.getUTF8SortedAsUnicodeComparator();
private final float minSimilarity; protected final float minSimilarity;
private final float scale_factor; protected final float scale_factor;
private final int termLength; protected final int termLength;
private int maxEdits; protected int maxEdits;
private final boolean raw; protected final boolean raw;
private final Terms terms; protected final Terms terms;
private final Term term; private final Term term;
private final int termText[]; protected final int termText[];
private final int realPrefixLength; protected final int realPrefixLength;
private final boolean transpositions; private final boolean transpositions;
@ -95,7 +93,8 @@ public final class FuzzyTermsEnum extends TermsEnum {
* that contains information about competitive boosts during rewrite. It is also used * that contains information about competitive boosts during rewrite. It is also used
* to cache DFAs between segment transitions. * to cache DFAs between segment transitions.
* @param term Pattern term. * @param term Pattern term.
* @param minSimilarity Minimum required similarity for terms from the reader. * @param minSimilarity Minimum required similarity for terms from the reader. Pass an integer value
* representing edit distance. Passing a fraction is deprecated.
* @param prefixLength Length of required common prefix. Default value is 0. * @param prefixLength Length of required common prefix. Default value is 0.
* @throws IOException * @throws IOException
*/ */
@ -149,7 +148,7 @@ public final class FuzzyTermsEnum extends TermsEnum {
* return an automata-based enum for matching up to editDistance from * return an automata-based enum for matching up to editDistance from
* lastTerm, if possible * lastTerm, if possible
*/ */
private TermsEnum getAutomatonEnum(int editDistance, BytesRef lastTerm) protected TermsEnum getAutomatonEnum(int editDistance, BytesRef lastTerm)
throws IOException { throws IOException {
final List<CompiledAutomaton> runAutomata = initAutomata(editDistance); final List<CompiledAutomaton> runAutomata = initAutomata(editDistance);
if (editDistance < runAutomata.size()) { if (editDistance < runAutomata.size()) {
@ -187,7 +186,7 @@ public final class FuzzyTermsEnum extends TermsEnum {
} }
/** swap in a new actual enum to proxy to */ /** swap in a new actual enum to proxy to */
private void setEnum(TermsEnum actualEnum) { protected void setEnum(TermsEnum actualEnum) {
this.actualEnum = actualEnum; this.actualEnum = actualEnum;
this.actualBoostAtt = actualEnum.attributes().addAttribute(BoostAttribute.class); this.actualBoostAtt = actualEnum.attributes().addAttribute(BoostAttribute.class);
} }
@ -209,14 +208,21 @@ public final class FuzzyTermsEnum extends TermsEnum {
maxEdits--; maxEdits--;
if (oldMaxEdits != maxEdits || init) { // the maximum n has changed if (oldMaxEdits != maxEdits || init) { // the maximum n has changed
TermsEnum newEnum = getAutomatonEnum(maxEdits, lastTerm); maxEditDistanceChanged(lastTerm, maxEdits, init);
if (newEnum != null) {
setEnum(newEnum);
} else if (init) {
setEnum(new LinearFuzzyTermsEnum());
}
} }
} }
/**
 * Swaps in a new automaton-based enum for the given edit distance.
 * <p>
 * Asks {@link #getAutomatonEnum} for an enum matching up to
 * <code>maxEdits</code> from <code>lastTerm</code> and installs it via
 * {@link #setEnum}. If no such enum is returned, the request is rejected.
 *
 * @param lastTerm term passed through to {@link #getAutomatonEnum}
 * @param maxEdits the new maximum edit distance
 * @param init not consulted by this implementation
 * @throws IllegalArgumentException if no automaton-based enum is available
 *         for <code>maxEdits</code>
 * @throws IOException propagated from {@link #getAutomatonEnum}
 */
protected void maxEditDistanceChanged(BytesRef lastTerm, int maxEdits, boolean init)
    throws IOException {
  final TermsEnum automatonEnum = getAutomatonEnum(maxEdits, lastTerm);
  if (automatonEnum != null) {
    setEnum(automatonEnum);
    return;
  }
  // A hard check rather than a bare assert, in case someone uses this enum
  // directly with an unsupported distance.
  assert maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
  throw new IllegalArgumentException("maxEdits cannot be > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE");
}
// for some raw min similarity and input term length, the maximum # of edits // for some raw min similarity and input term length, the maximum # of edits
private int initialMaxDistance(float minimumSimilarity, int termLen) { private int initialMaxDistance(float minimumSimilarity, int termLen) {
@ -383,194 +389,6 @@ public final class FuzzyTermsEnum extends TermsEnum {
} }
} }
/**
* Implement fuzzy enumeration with linear brute force.
*/
private class LinearFuzzyTermsEnum extends FilteredTermsEnum {
/* Allows us save time required to create a new array
* every time similarity is called.
*/
private int[] d;
private int[] p;
// this is the text, minus the prefix
private final int[] text;
private final BoostAttribute boostAtt =
attributes().addAttribute(BoostAttribute.class);
/**
* Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
* length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity &gt;
* <code>minSimilarity</code>.
* <p>
* After calling the constructor the enumeration is already pointing to the first
* valid term if such a term exists.
*
* @throws IOException
*/
public LinearFuzzyTermsEnum() throws IOException {
super(terms.iterator(null));
this.text = new int[termLength - realPrefixLength];
System.arraycopy(termText, realPrefixLength, text, 0, text.length);
final String prefix = UnicodeUtil.newString(termText, 0, realPrefixLength);
prefixBytesRef = new BytesRef(prefix);
this.d = new int[this.text.length + 1];
this.p = new int[this.text.length + 1];
setInitialSeekTerm(prefixBytesRef);
}
private final BytesRef prefixBytesRef;
// used for unicode conversion from BytesRef byte[] to int[]
private final IntsRef utf32 = new IntsRef(20);
/**
* The termCompare method in FuzzyTermEnum uses Levenshtein distance to
* calculate the distance between the given term and the comparing term.
*/
@Override
protected final AcceptStatus accept(BytesRef term) {
if (StringHelper.startsWith(term, prefixBytesRef)) {
UnicodeUtil.UTF8toUTF32(term, utf32);
final float similarity = similarity(utf32.ints, realPrefixLength, utf32.length - realPrefixLength);
if (similarity > minSimilarity) {
boostAtt.setBoost((similarity - minSimilarity) * scale_factor);
return AcceptStatus.YES;
} else return AcceptStatus.NO;
} else {
return AcceptStatus.END;
}
}
/******************************
* Compute Levenshtein distance
******************************/
/**
* <p>Similarity returns a number that is 1.0f or less (including negative numbers)
* based on how similar the Term is compared to a target term. It returns
* exactly 0.0f when
* <pre>
* editDistance &gt; maximumEditDistance</pre>
* Otherwise it returns:
* <pre>
* 1 - (editDistance / length)</pre>
* where length is the length of the shortest term (text or target) including a
* prefix that are identical and editDistance is the Levenshtein distance for
* the two words.</p>
*
* <p>Embedded within this algorithm is a fail-fast Levenshtein distance
* algorithm. The fail-fast algorithm differs from the standard Levenshtein
* distance algorithm in that it is aborted if it is discovered that the
* minimum distance between the words is greater than some threshold.
*
* <p>To calculate the maximum distance threshold we use the following formula:
* <pre>
* (1 - minimumSimilarity) * length</pre>
* where length is the shortest term including any prefix that is not part of the
* similarity comparison. This formula was derived by solving for what maximum value
* of distance returns false for the following statements:
* <pre>
* similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
* return (similarity > minimumSimilarity);</pre>
* where distance is the Levenshtein distance for the two words.
* </p>
* <p>Levenshtein distance (also known as edit distance) is a measure of similarity
* between two strings where the distance is measured as the number of character
* deletions, insertions or substitutions required to transform one string to
* the other string.
* @param target the target word or phrase
* @return the similarity, 0.0 or less indicates that it matches less than the required
* threshold and 1.0 indicates that the text and target are identical
*/
private final float similarity(final int[] target, int offset, int length) {
final int m = length;
final int n = text.length;
if (n == 0) {
//we don't have anything to compare. That means if we just add
//the letters for m we get the new word
return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) m / realPrefixLength);
}
if (m == 0) {
return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) n / realPrefixLength);
}
final int maxDistance = calculateMaxDistance(m);
if (maxDistance < Math.abs(m-n)) {
//just adding the characters of m to n or vice-versa results in
//too many edits
//for example "pre" length is 3 and "prefixes" length is 8. We can see that
//given this optimal circumstance, the edit distance cannot be less than 5.
//which is 8-3 or more precisely Math.abs(3-8).
//if our maximum edit distance is 4, then we can discard this word
//without looking at it.
return Float.NEGATIVE_INFINITY;
}
// init matrix d
for (int i = 0; i <=n; ++i) {
p[i] = i;
}
// start computing edit distance
for (int j = 1; j<=m; ++j) { // iterates through target
int bestPossibleEditDistance = m;
final int t_j = target[offset+j-1]; // jth character of t
d[0] = j;
for (int i=1; i<=n; ++i) { // iterates through text
// minimum of cell to the left+1, to the top+1, diagonally left and up +(0|1)
if (t_j != text[i-1]) {
d[i] = Math.min(Math.min(d[i-1], p[i]), p[i-1]) + 1;
} else {
d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]);
}
bestPossibleEditDistance = Math.min(bestPossibleEditDistance, d[i]);
}
//After calculating row i, the best possible edit distance
//can be found by found by finding the smallest value in a given column.
//If the bestPossibleEditDistance is greater than the max distance, abort.
if (j > maxDistance && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater
//the closest the target can be to the text is just too far away.
//this target is leaving the party early.
return Float.NEGATIVE_INFINITY;
}
// copy current distance counts to 'previous row' distance counts: swap p and d
int _d[] = p;
p = d;
d = _d;
}
// our last action in the above loop was to switch d and p, so p now
// actually has the most recent cost counts
// this will return less than 0.0 when the edit distance is
// greater than the number of characters in the shorter word.
// but this was the formula that was previously used in FuzzyTermEnum,
// so it has not been changed (even though minimumSimilarity must be
// greater than 0.0)
return 1.0f - ((float)p[n] / (float) (realPrefixLength + Math.min(n, m)));
}
/**
* The max Distance is the maximum Levenshtein distance for the text
* compared to some other value that results in score that is
* better than the minimum similarity.
* @param m the length of the "other value"
* @return the maximum levenshtein distance that we care about
*/
private int calculateMaxDistance(int m) {
return raw ? maxEdits : Math.min(maxEdits,
(int)((1-minSimilarity) * (Math.min(text.length, m) + realPrefixLength)));
}
}
/** @lucene.internal */ /** @lucene.internal */
public float getMinSimilarity() { public float getMinSimilarity() {
return minSimilarity; return minSimilarity;

View File

@ -52,32 +52,32 @@ public class TestFuzzyQuery extends LuceneTestCase {
IndexSearcher searcher = newSearcher(reader); IndexSearcher searcher = newSearcher(reader);
writer.close(); writer.close();
FuzzyQuery query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 0); FuzzyQuery query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 0);
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length); assertEquals(3, hits.length);
// same with prefix // same with prefix
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 1); query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 1);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length); assertEquals(3, hits.length);
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 2); query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 2);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length); assertEquals(3, hits.length);
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 3); query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 3);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length); assertEquals(3, hits.length);
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 4); query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 4);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(2, hits.length); assertEquals(2, hits.length);
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 5); query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 5);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(1, hits.length); assertEquals(1, hits.length);
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 6); query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 6);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(1, hits.length); assertEquals(1, hits.length);
// test scoring // test scoring
query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0); query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMaxEdits, 0);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals("3 documents should match", 3, hits.length); assertEquals("3 documents should match", 3, hits.length);
List<String> order = Arrays.asList("bbbbb","abbbb","aabbb"); List<String> order = Arrays.asList("bbbbb","abbbb","aabbb");
@ -89,7 +89,7 @@ public class TestFuzzyQuery extends LuceneTestCase {
// test pq size by supplying maxExpansions=2 // test pq size by supplying maxExpansions=2
// This query would normally return 3 documents, because 3 terms match (see above): // This query would normally return 3 documents, because 3 terms match (see above):
query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0, 2); query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMaxEdits, 0, 2, false);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals("only 2 documents should match", 2, hits.length); assertEquals("only 2 documents should match", 2, hits.length);
order = Arrays.asList("bbbbb","abbbb"); order = Arrays.asList("bbbbb","abbbb");
@ -100,15 +100,15 @@ public class TestFuzzyQuery extends LuceneTestCase {
} }
// not similar enough: // not similar enough:
query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0); query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMaxEdits, 0);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(0, hits.length); assertEquals(0, hits.length);
query = new FuzzyQuery(new Term("field", "aaccc"), FuzzyQuery.defaultMinSimilarity, 0); // edit distance to "aaaaa" = 3 query = new FuzzyQuery(new Term("field", "aaccc"), FuzzyQuery.defaultMaxEdits, 0); // edit distance to "aaaaa" = 3
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(0, hits.length); assertEquals(0, hits.length);
// query identical to a word in the index: // query identical to a word in the index:
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 0); query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 0);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length); assertEquals(3, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa")); assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
@ -117,7 +117,7 @@ public class TestFuzzyQuery extends LuceneTestCase {
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb")); assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
// query similar to a word in the index: // query similar to a word in the index:
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 0); query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 0);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length); assertEquals(3, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa")); assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
@ -125,158 +125,69 @@ public class TestFuzzyQuery extends LuceneTestCase {
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb")); assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
// now with prefix // now with prefix
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 1); query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 1);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length); assertEquals(3, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa")); assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab")); assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb")); assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 2); query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 2);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length); assertEquals(3, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa")); assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab")); assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb")); assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 3); query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 3);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length); assertEquals(3, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa")); assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab")); assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb")); assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 4); query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 4);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(2, hits.length); assertEquals(2, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa")); assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab")); assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 5); query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 5);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(0, hits.length); assertEquals(0, hits.length);
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 0); query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 0);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(1, hits.length); assertEquals(1, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd")); assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
// now with prefix // now with prefix
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 1); query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 1);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(1, hits.length); assertEquals(1, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd")); assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 2); query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 2);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(1, hits.length); assertEquals(1, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd")); assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 3); query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 3);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(1, hits.length); assertEquals(1, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd")); assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 4); query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 4);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(1, hits.length); assertEquals(1, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd")); assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 5); query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 5);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(0, hits.length); assertEquals(0, hits.length);
// different field = no match: // different field = no match:
query = new FuzzyQuery(new Term("anotherfield", "ddddX"), FuzzyQuery.defaultMinSimilarity, 0); query = new FuzzyQuery(new Term("anotherfield", "ddddX"), FuzzyQuery.defaultMaxEdits, 0);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(0, hits.length); assertEquals(0, hits.length);
reader.close(); reader.close();
directory.close(); directory.close();
} }
/**
 * Checks float-similarity fuzzy matching against an index that holds the two
 * terms "aaaaaaa" and "segment", using several prefix lengths, and verifies
 * that the constructor rejects the similarity values 1.1 and -0.1.
 *
 * @throws Exception if indexing or searching fails
 */
public void testFuzzinessLong() throws Exception {
  Directory directory = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
  addDoc("aaaaaaa", writer);
  addDoc("segment", writer);
  IndexReader reader = writer.getReader();
  IndexSearcher searcher = newSearcher(reader);
  writer.close();

  FuzzyQuery query;
  // not similar enough:
  query = new FuzzyQuery(new Term("field", "xxxxx"), 0.5f, 0);
  ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(0, hits.length);

  // edit distance to "aaaaaaa" = 3, this matches because the string is longer than
  // in testDefaultFuzziness so a bigger difference is allowed:
  query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 0);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(1, hits.length);
  // expected value first, so a failure message labels the two values correctly
  assertEquals("aaaaaaa", searcher.doc(hits[0].doc).get("field"));

  // now with prefix
  query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 1);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(1, hits.length);
  assertEquals("aaaaaaa", searcher.doc(hits[0].doc).get("field"));
  query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 4);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(1, hits.length);
  assertEquals("aaaaaaa", searcher.doc(hits[0].doc).get("field"));
  query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 5);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(0, hits.length);

  // no match, more than half of the characters are wrong:
  query = new FuzzyQuery(new Term("field", "aaacccc"), 0.5f, 0);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(0, hits.length);

  // now with prefix
  query = new FuzzyQuery(new Term("field", "aaacccc"), 0.5f, 2);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(0, hits.length);

  // "student" and "stellent" are indeed similar to "segment" by default:
  query = new FuzzyQuery(new Term("field", "student"), 0.5f, 0);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(1, hits.length);
  query = new FuzzyQuery(new Term("field", "stellent"), 0.5f, 0);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(1, hits.length);

  // now with prefix
  query = new FuzzyQuery(new Term("field", "student"), 0.5f, 1);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(1, hits.length);
  query = new FuzzyQuery(new Term("field", "stellent"), 0.5f, 1);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(1, hits.length);
  query = new FuzzyQuery(new Term("field", "student"), 0.5f, 2);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(0, hits.length);
  query = new FuzzyQuery(new Term("field", "stellent"), 0.5f, 2);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(0, hits.length);

  // "student" doesn't match anymore thanks to increased minimum similarity:
  query = new FuzzyQuery(new Term("field", "student"), 0.6f, 0);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(0, hits.length);

  // a similarity of 1.1 must be rejected by the constructor
  try {
    query = new FuzzyQuery(new Term("field", "student"), 1.1f);
    fail("Expected IllegalArgumentException");
  } catch (IllegalArgumentException expected) {
    // expecting exception
  }

  // a similarity of -0.1 must be rejected by the constructor
  try {
    query = new FuzzyQuery(new Term("field", "student"), -0.1f);
    fail("Expected IllegalArgumentException");
  } catch (IllegalArgumentException expected) {
    // expecting exception
  }

  reader.close();
  directory.close();
}
/** /**
* MultiTermQuery provides (via attribute) information about which values * MultiTermQuery provides (via attribute) information about which values
@ -307,7 +218,7 @@ public class TestFuzzyQuery extends LuceneTestCase {
MultiReader mr = new MultiReader(ir1, ir2); MultiReader mr = new MultiReader(ir1, ir2);
IndexSearcher searcher = newSearcher(mr); IndexSearcher searcher = newSearcher(mr);
FuzzyQuery fq = new FuzzyQuery(new Term("field", "z123456"), 1f, 0, 2); FuzzyQuery fq = new FuzzyQuery(new Term("field", "z123456"), 1, 0, 2, false);
TopDocs docs = searcher.search(fq, 2); TopDocs docs = searcher.search(fq, 2);
assertEquals(5, docs.totalHits); // 5 docs, from the a and b's assertEquals(5, docs.totalHits); // 5 docs, from the a and b's
mr.close(); mr.close();
@ -319,41 +230,6 @@ public class TestFuzzyQuery extends LuceneTestCase {
directory2.close(); directory2.close();
} }
/**
 * Runs four fuzzy queries, each built with the value 0.9, against an index
 * holding "12345678911" and "segment", and checks the hit count of each one.
 *
 * @throws IOException if indexing or searching fails
 */
public void testTokenLengthOpt() throws IOException {
  Directory dir = newDirectory();
  RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir);
  addDoc("12345678911", indexWriter);
  addDoc("segment", indexWriter);

  IndexReader indexReader = indexWriter.getReader();
  IndexSearcher indexSearcher = newSearcher(indexReader);
  indexWriter.close();

  // Query terms and the number of hits expected for each, in execution order.
  final String[] terms = {
    "1234569",      // term not over 10 chars, so optimization shortcuts
    "1234567891",   // 10 chars, so no optimization
    "12345678911",  // over 10 chars, so no optimization
    "sdfsdfsdfsdf"  // over 10 chars, no match
  };
  final int[] expectedHits = {0, 0, 1, 0};

  for (int i = 0; i < terms.length; i++) {
    Query fuzzy = new FuzzyQuery(new Term("field", terms[i]), 0.9f);
    ScoreDoc[] found = indexSearcher.search(fuzzy, null, 1000).scoreDocs;
    assertEquals(expectedHits[i], found.length);
  }

  indexReader.close();
  dir.close();
}
/** Test the TopTermsBoostOnlyBooleanQueryRewrite rewrite method. */ /** Test the TopTermsBoostOnlyBooleanQueryRewrite rewrite method. */
public void testBoostOnlyRewrite() throws Exception { public void testBoostOnlyRewrite() throws Exception {
Directory directory = newDirectory(); Directory directory = newDirectory();
@ -404,7 +280,7 @@ public class TestFuzzyQuery extends LuceneTestCase {
IndexReader r = w.getReader(); IndexReader r = w.getReader();
w.close(); w.close();
Query q = new FuzzyQuery(new Term("field", "giga"), 0.9f); Query q = new FuzzyQuery(new Term("field", "giga"), 0);
// 3. search // 3. search
IndexSearcher searcher = newSearcher(r); IndexSearcher searcher = newSearcher(r);
@ -435,26 +311,17 @@ public class TestFuzzyQuery extends LuceneTestCase {
assertEquals(1, hits.length); assertEquals(1, hits.length);
assertEquals("foobar", searcher.doc(hits[0].doc).get("field")); assertEquals("foobar", searcher.doc(hits[0].doc).get("field"));
q = new FuzzyQuery(new Term("field", "t"), 3); try {
hits = searcher.search(q, 10).scoreDocs; q = new FuzzyQuery(new Term("field", "t"), 3);
assertEquals(1, hits.length); fail();
assertEquals("test", searcher.doc(hits[0].doc).get("field")); } catch (IllegalArgumentException expected) {
// expected
q = new FuzzyQuery(new Term("field", "a"), 4f, 0, 50); }
hits = searcher.search(q, 10).scoreDocs;
assertEquals(1, hits.length);
assertEquals("test", searcher.doc(hits[0].doc).get("field"));
q = new FuzzyQuery(new Term("field", "a"), 6f, 0, 50);
hits = searcher.search(q, 10).scoreDocs;
assertEquals(2, hits.length);
assertEquals("test", searcher.doc(hits[0].doc).get("field"));
assertEquals("foobar", searcher.doc(hits[1].doc).get("field"));
reader.close(); reader.close();
index.close(); index.close();
} }
private void addDoc(String text, RandomIndexWriter writer) throws IOException { private void addDoc(String text, RandomIndexWriter writer) throws IOException {
Document doc = new Document(); Document doc = new Document();
doc.add(newField("field", text, TextField.TYPE_STORED)); doc.add(newField("field", text, TextField.TYPE_STORED));

View File

@ -90,7 +90,7 @@ public class TestSpanMultiTermQueryWrapper extends LuceneTestCase {
public void testFuzzy2() throws Exception { public void testFuzzy2() throws Exception {
// maximum of 1 term expansion // maximum of 1 term expansion
FuzzyQuery fq = new FuzzyQuery(new Term("field", "broan"), 1f, 0, 1); FuzzyQuery fq = new FuzzyQuery(new Term("field", "broan"), 1, 0, 1, false);
SpanQuery sfq = new SpanMultiTermQueryWrapper<FuzzyQuery>(fq); SpanQuery sfq = new SpanMultiTermQueryWrapper<FuzzyQuery>(fq);
// will only match jumps over lazy broun dog // will only match jumps over lazy broun dog
SpanPositionRangeQuery sprq = new SpanPositionRangeQuery(sfq, 0, 100); SpanPositionRangeQuery sprq = new SpanPositionRangeQuery(sfq, 0, 100);

View File

@ -669,12 +669,12 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
@Override @Override
public void run() throws Exception { public void run() throws Exception {
numHighlights = 0; numHighlights = 0;
FuzzyQuery fuzzyQuery = new FuzzyQuery(new Term(FIELD_NAME, "kinnedy"), 0.5f); FuzzyQuery fuzzyQuery = new FuzzyQuery(new Term(FIELD_NAME, "kinnedy"), 2);
fuzzyQuery.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); fuzzyQuery.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
doSearching(fuzzyQuery); doSearching(fuzzyQuery);
doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this, true); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this, true);
assertTrue("Failed to find correct number of highlights " + numHighlights + " found", assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
numHighlights == 5); numHighlights == 4);
} }
}; };

View File

@ -774,7 +774,10 @@ public abstract class QueryParserBase {
*/ */
protected Query newFuzzyQuery(Term term, float minimumSimilarity, int prefixLength) { protected Query newFuzzyQuery(Term term, float minimumSimilarity, int prefixLength) {
// FuzzyQuery doesn't yet allow constant score rewrite // FuzzyQuery doesn't yet allow constant score rewrite
return new FuzzyQuery(term,minimumSimilarity,prefixLength); String text = term.text();
int numEdits = FuzzyQuery.floatToEdits(minimumSimilarity,
text.codePointCount(0, text.length()));
return new FuzzyQuery(term,numEdits,prefixLength);
} }
// TODO: Should this be protected instead? // TODO: Should this be protected instead?

View File

@ -191,12 +191,13 @@ enabling substantial customization to how a query is created.
<p>Note: You cannot use a * or ? symbol as the first character of a search.</p> <p>Note: You cannot use a * or ? symbol as the first character of a search.</p>
<a name="N1009B"></a><a name="Fuzzy_Searches"></a> <a name="N1009B"></a><a name="Fuzzy_Searches"></a>
<h3 class="boxed">Fuzzy Searches</h3> <h3 class="boxed">Fuzzy Searches</h3>
<p>Lucene supports fuzzy searches based on the Levenshtein Distance, or Edit Distance algorithm. To do a fuzzy search use the tilde, "~", symbol at the end of a Single word Term. For example to search for a term similar in spelling to "roam" use the fuzzy search: </p> <p>Lucene supports fuzzy searches based on Damerau-Levenshtein Distance. To do a fuzzy search use the tilde, "~", symbol at the end of a Single word Term. For example to search for a term similar in spelling to "roam" use the fuzzy search: </p>
<pre class="code">roam~</pre> <pre class="code">roam~</pre>
<p>This search will find terms like foam and roams.</p> <p>This search will find terms like foam and roams.</p>
<p>Starting with Lucene 1.9 an additional (optional) parameter can specify the required similarity. The value is between 0 and 1, with a value closer to 1 only terms with a higher similarity will be matched. For example:</p> <p>An additional (optional) parameter can specify the maximum number of edits allowed. The value is between 0 and 2. For example:</p>
<pre class="code">roam~0.8</pre> <pre class="code">roam~1</pre>
<p>The default that is used if the parameter is not given is 0.5.</p> <p>The default that is used if the parameter is not given is an edit distance of 2.</p>
<p>Previously, a floating point value was allowed here. This syntax is considered deprecated and will be removed in Lucene 5.0</p>
<a name="N100B4"></a><a name="Proximity_Searches"></a> <a name="N100B4"></a><a name="Proximity_Searches"></a>
<h3 class="boxed">Proximity Searches</h3> <h3 class="boxed">Proximity Searches</h3>
<p>Lucene supports finding words that are within a specific distance away. To do a proximity search use the tilde, "~", symbol at the end of a Phrase. For example to search for "apache" and "jakarta" within 10 words of each other in a document use the search: </p> <p>Lucene supports finding words that are within a specific distance away. To do a proximity search use the tilde, "~", symbol at the end of a Phrase. For example to search for "apache" and "jakarta" within 10 words of each other in a document use the search: </p>

View File

@ -34,9 +34,13 @@ public class FuzzyQueryNodeBuilder implements StandardQueryBuilder {
public FuzzyQuery build(QueryNode queryNode) throws QueryNodeException { public FuzzyQuery build(QueryNode queryNode) throws QueryNodeException {
FuzzyQueryNode fuzzyNode = (FuzzyQueryNode) queryNode; FuzzyQueryNode fuzzyNode = (FuzzyQueryNode) queryNode;
String text = fuzzyNode.getTextAsString();
int numEdits = FuzzyQuery.floatToEdits(fuzzyNode.getSimilarity(),
text.codePointCount(0, text.length()));
return new FuzzyQuery(new Term(fuzzyNode.getFieldAsString(), fuzzyNode return new FuzzyQuery(new Term(fuzzyNode.getFieldAsString(), fuzzyNode
.getTextAsString()), fuzzyNode.getSimilarity(), fuzzyNode .getTextAsString()), numEdits, fuzzyNode
.getPrefixLength()); .getPrefixLength());
} }

View File

@ -5,7 +5,7 @@ import org.apache.lucene.queryparser.xml.DOMUtils;
import org.apache.lucene.queryparser.xml.ParserException; import org.apache.lucene.queryparser.xml.ParserException;
import org.apache.lucene.queryparser.xml.QueryBuilder; import org.apache.lucene.queryparser.xml.QueryBuilder;
import org.apache.lucene.sandbox.queries.FuzzyLikeThisQuery; import org.apache.lucene.sandbox.queries.FuzzyLikeThisQuery;
import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.sandbox.queries.SlowFuzzyQuery;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.w3c.dom.Element; import org.w3c.dom.Element;
import org.w3c.dom.NodeList; import org.w3c.dom.NodeList;
@ -33,7 +33,7 @@ import org.w3c.dom.NodeList;
public class FuzzyLikeThisQueryBuilder implements QueryBuilder { public class FuzzyLikeThisQueryBuilder implements QueryBuilder {
private static final int DEFAULT_MAX_NUM_TERMS = 50; private static final int DEFAULT_MAX_NUM_TERMS = 50;
private static final float DEFAULT_MIN_SIMILARITY = FuzzyQuery.defaultMinSimilarity; private static final float DEFAULT_MIN_SIMILARITY = SlowFuzzyQuery.defaultMinSimilarity;
private static final int DEFAULT_PREFIX_LENGTH = 1; private static final int DEFAULT_PREFIX_LENGTH = 1;
private static final boolean DEFAULT_IGNORE_TF = false; private static final boolean DEFAULT_IGNORE_TF = false;

View File

@ -59,8 +59,8 @@ public class TestAnalyzingQueryParser extends LuceneTestCase {
fuzzyInput = new String[] { "Übersetzung Übersetzung~0.9", fuzzyInput = new String[] { "Übersetzung Übersetzung~0.9",
"Mötley Crüe Mötley~0.75 Crüe~0.5", "Mötley Crüe Mötley~0.75 Crüe~0.5",
"Renée Zellweger Renée~0.9 Zellweger~" }; "Renée Zellweger Renée~0.9 Zellweger~" };
fuzzyExpected = new String[] { "ubersetzung ubersetzung~0.9", fuzzyExpected = new String[] { "ubersetzung ubersetzung~1",
"motley crue motley~0.75 crue~0.5", "renee zellweger renee~0.9 zellweger~2.0" }; "motley crue motley~1 crue~2", "renee zellweger renee~0 zellweger~2" };
a = new ASCIIAnalyzer(); a = new ASCIIAnalyzer();
} }

View File

@ -85,10 +85,10 @@ public class TestMultiFieldQueryParser extends LuceneTestCase {
assertEquals("((b:one t:one)^2.0) (b:two t:two)", q.toString()); assertEquals("((b:one t:one)^2.0) (b:two t:two)", q.toString());
q = mfqp.parse("one~ two"); q = mfqp.parse("one~ two");
assertEquals("(b:one~2.0 t:one~2.0) (b:two t:two)", q.toString()); assertEquals("(b:one~2 t:one~2) (b:two t:two)", q.toString());
q = mfqp.parse("one~0.8 two^2"); q = mfqp.parse("one~0.8 two^2");
assertEquals("(b:one~0.8 t:one~0.8) ((b:two t:two)^2.0)", q.toString()); assertEquals("(b:one~0 t:one~0) ((b:two t:two)^2.0)", q.toString());
q = mfqp.parse("one* two*"); q = mfqp.parse("one* two*");
assertEquals("(b:one* t:one*) (b:two* t:two*)", q.toString()); assertEquals("(b:one* t:one*) (b:two* t:two*)", q.toString());
@ -272,7 +272,7 @@ public class TestMultiFieldQueryParser extends LuceneTestCase {
q = parser.parse("bla*"); q = parser.parse("bla*");
assertEquals("f1:bla* f2:bla* f3:bla*", q.toString()); assertEquals("f1:bla* f2:bla* f3:bla*", q.toString());
q = parser.parse("bla~"); q = parser.parse("bla~");
assertEquals("f1:bla~2.0 f2:bla~2.0 f3:bla~2.0", q.toString()); assertEquals("f1:bla~2 f2:bla~2 f3:bla~2", q.toString());
q = parser.parse("[a TO c]"); q = parser.parse("[a TO c]");
assertEquals("f1:[a TO c] f2:[a TO c] f3:[a TO c]", q.toString()); assertEquals("f1:[a TO c] f2:[a TO c] f3:[a TO c]", q.toString());
} }

View File

@ -282,10 +282,10 @@ public class TestPrecedenceQueryParser extends LuceneTestCase {
public void testWildcard() throws Exception { public void testWildcard() throws Exception {
assertQueryEquals("term*", null, "term*"); assertQueryEquals("term*", null, "term*");
assertQueryEquals("term*^2", null, "term*^2.0"); assertQueryEquals("term*^2", null, "term*^2.0");
assertQueryEquals("term~", null, "term~2.0"); assertQueryEquals("term~", null, "term~2");
assertQueryEquals("term~0.7", null, "term~0.7"); assertQueryEquals("term~0.7", null, "term~1");
assertQueryEquals("term~^3", null, "term~2.0^3.0"); assertQueryEquals("term~^3", null, "term~2^3.0");
assertQueryEquals("term^3~", null, "term~2.0^3.0"); assertQueryEquals("term^3~", null, "term~2^3.0");
assertQueryEquals("term*germ", null, "term*germ"); assertQueryEquals("term*germ", null, "term*germ");
assertQueryEquals("term*germ^3", null, "term*germ^3.0"); assertQueryEquals("term*germ^3", null, "term*germ^3.0");
@ -294,10 +294,10 @@ public class TestPrecedenceQueryParser extends LuceneTestCase {
assertTrue(getQuery("term~", null) instanceof FuzzyQuery); assertTrue(getQuery("term~", null) instanceof FuzzyQuery);
assertTrue(getQuery("term~0.7", null) instanceof FuzzyQuery); assertTrue(getQuery("term~0.7", null) instanceof FuzzyQuery);
FuzzyQuery fq = (FuzzyQuery) getQuery("term~0.7", null); FuzzyQuery fq = (FuzzyQuery) getQuery("term~0.7", null);
assertEquals(0.7f, fq.getMinSimilarity(), 0.1f); assertEquals(1, fq.getMaxEdits());
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength()); assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
fq = (FuzzyQuery) getQuery("term~", null); fq = (FuzzyQuery) getQuery("term~", null);
assertEquals(2.0f, fq.getMinSimilarity(), 0.1f); assertEquals(2, fq.getMaxEdits());
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength()); assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
try { try {
getQuery("term~1.1", null); // value > 1, throws exception getQuery("term~1.1", null); // value > 1, throws exception
@ -336,9 +336,9 @@ public class TestPrecedenceQueryParser extends LuceneTestCase {
assertWildcardQueryEquals("TE?M", false, "TE?M"); assertWildcardQueryEquals("TE?M", false, "TE?M");
assertWildcardQueryEquals("Te?m*gerM", false, "Te?m*gerM"); assertWildcardQueryEquals("Te?m*gerM", false, "Te?m*gerM");
// Fuzzy queries: // Fuzzy queries:
assertWildcardQueryEquals("Term~", "term~2.0"); assertWildcardQueryEquals("Term~", "term~2");
assertWildcardQueryEquals("Term~", true, "term~2.0"); assertWildcardQueryEquals("Term~", true, "term~2");
assertWildcardQueryEquals("Term~", false, "Term~2.0"); assertWildcardQueryEquals("Term~", false, "Term~2");
// Range queries: // Range queries:
assertWildcardQueryEquals("[A TO C]", "[a TO c]"); assertWildcardQueryEquals("[A TO C]", "[a TO c]");
assertWildcardQueryEquals("[A TO C]", true, "[a TO c]"); assertWildcardQueryEquals("[A TO C]", true, "[a TO c]");
@ -498,10 +498,10 @@ public class TestPrecedenceQueryParser extends LuceneTestCase {
assertQueryEquals("a:b\\\\?c", a, "a:b\\?c"); assertQueryEquals("a:b\\\\?c", a, "a:b\\?c");
assertQueryEquals("a:b\\-c~", a, "a:b-c~2.0"); assertQueryEquals("a:b\\-c~", a, "a:b-c~2");
assertQueryEquals("a:b\\+c~", a, "a:b+c~2.0"); assertQueryEquals("a:b\\+c~", a, "a:b+c~2");
assertQueryEquals("a:b\\:c~", a, "a:b:c~2.0"); assertQueryEquals("a:b\\:c~", a, "a:b:c~2");
assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2.0"); assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2");
assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]"); assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]");
assertQueryEquals("[ a\\: TO a\\~ ]", null, "[a: TO a~]"); assertQueryEquals("[ a\\: TO a\\~ ]", null, "[a: TO a~]");

View File

@ -100,10 +100,10 @@ public class TestMultiFieldQPHelper extends LuceneTestCase {
assertEquals("((b:one t:one)^2.0) (b:two t:two)", q.toString()); assertEquals("((b:one t:one)^2.0) (b:two t:two)", q.toString());
q = mfqp.parse("one~ two", null); q = mfqp.parse("one~ two", null);
assertEquals("(b:one~2.0 t:one~2.0) (b:two t:two)", q.toString()); assertEquals("(b:one~2 t:one~2) (b:two t:two)", q.toString());
q = mfqp.parse("one~0.8 two^2", null); q = mfqp.parse("one~0.8 two^2", null);
assertEquals("(b:one~0.8 t:one~0.8) ((b:two t:two)^2.0)", q.toString()); assertEquals("(b:one~0 t:one~0) ((b:two t:two)^2.0)", q.toString());
q = mfqp.parse("one* two*", null); q = mfqp.parse("one* two*", null);
assertEquals("(b:one* t:one*) (b:two* t:two*)", q.toString()); assertEquals("(b:one* t:one*) (b:two* t:two*)", q.toString());
@ -311,7 +311,7 @@ public class TestMultiFieldQPHelper extends LuceneTestCase {
q = parser.parse("bla*", null); q = parser.parse("bla*", null);
assertEquals("f1:bla* f2:bla* f3:bla*", q.toString()); assertEquals("f1:bla* f2:bla* f3:bla*", q.toString());
q = parser.parse("bla~", null); q = parser.parse("bla~", null);
assertEquals("f1:bla~2.0 f2:bla~2.0 f3:bla~2.0", q.toString()); assertEquals("f1:bla~2 f2:bla~2 f3:bla~2", q.toString());
q = parser.parse("[a TO c]", null); q = parser.parse("[a TO c]", null);
assertEquals("f1:[a TO c] f2:[a TO c] f3:[a TO c]", q.toString()); assertEquals("f1:[a TO c] f2:[a TO c] f3:[a TO c]", q.toString());
} }

View File

@ -514,12 +514,12 @@ public class TestQPHelper extends LuceneTestCase {
public void testWildcard() throws Exception { public void testWildcard() throws Exception {
assertQueryEquals("term*", null, "term*"); assertQueryEquals("term*", null, "term*");
assertQueryEquals("term*^2", null, "term*^2.0"); assertQueryEquals("term*^2", null, "term*^2.0");
assertQueryEquals("term~", null, "term~2.0"); assertQueryEquals("term~", null, "term~2");
assertQueryEquals("term~0.7", null, "term~0.7"); assertQueryEquals("term~0.7", null, "term~1");
assertQueryEquals("term~^3", null, "term~2.0^3.0"); assertQueryEquals("term~^3", null, "term~2^3.0");
assertQueryEquals("term^3~", null, "term~2.0^3.0"); assertQueryEquals("term^3~", null, "term~2^3.0");
assertQueryEquals("term*germ", null, "term*germ"); assertQueryEquals("term*germ", null, "term*germ");
assertQueryEquals("term*germ^3", null, "term*germ^3.0"); assertQueryEquals("term*germ^3", null, "term*germ^3.0");
@ -528,10 +528,10 @@ public class TestQPHelper extends LuceneTestCase {
assertTrue(getQuery("term~", null) instanceof FuzzyQuery); assertTrue(getQuery("term~", null) instanceof FuzzyQuery);
assertTrue(getQuery("term~0.7", null) instanceof FuzzyQuery); assertTrue(getQuery("term~0.7", null) instanceof FuzzyQuery);
FuzzyQuery fq = (FuzzyQuery) getQuery("term~0.7", null); FuzzyQuery fq = (FuzzyQuery) getQuery("term~0.7", null);
assertEquals(0.7f, fq.getMinSimilarity(), 0.1f); assertEquals(1, fq.getMaxEdits());
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength()); assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
fq = (FuzzyQuery) getQuery("term~", null); fq = (FuzzyQuery) getQuery("term~", null);
assertEquals(2.0f, fq.getMinSimilarity(), 0.1f); assertEquals(2, fq.getMaxEdits());
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength()); assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
assertQueryNodeException("term~1.1"); // value > 1, throws exception assertQueryNodeException("term~1.1"); // value > 1, throws exception
@ -567,9 +567,9 @@ public class TestQPHelper extends LuceneTestCase {
assertWildcardQueryEquals("TE?M", false, "TE?M"); assertWildcardQueryEquals("TE?M", false, "TE?M");
assertWildcardQueryEquals("Te?m*gerM", false, "Te?m*gerM"); assertWildcardQueryEquals("Te?m*gerM", false, "Te?m*gerM");
// Fuzzy queries: // Fuzzy queries:
assertWildcardQueryEquals("Term~", "term~2.0"); assertWildcardQueryEquals("Term~", "term~2");
assertWildcardQueryEquals("Term~", true, "term~2.0"); assertWildcardQueryEquals("Term~", true, "term~2");
assertWildcardQueryEquals("Term~", false, "Term~2.0"); assertWildcardQueryEquals("Term~", false, "Term~2");
// Range queries: // Range queries:
// TODO: implement this on QueryParser // TODO: implement this on QueryParser
@ -805,10 +805,10 @@ public class TestQPHelper extends LuceneTestCase {
assertQueryEquals("a:b\\\\?c", a, "a:b\\?c"); assertQueryEquals("a:b\\\\?c", a, "a:b\\?c");
assertQueryEquals("a:b\\-c~", a, "a:b-c~2.0"); assertQueryEquals("a:b\\-c~", a, "a:b-c~2");
assertQueryEquals("a:b\\+c~", a, "a:b+c~2.0"); assertQueryEquals("a:b\\+c~", a, "a:b+c~2");
assertQueryEquals("a:b\\:c~", a, "a:b:c~2.0"); assertQueryEquals("a:b\\:c~", a, "a:b:c~2");
assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2.0"); assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2");
// TODO: implement Range queries on QueryParser // TODO: implement Range queries on QueryParser
assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]"); assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]");

View File

@ -420,10 +420,10 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
public void testWildcard() throws Exception { public void testWildcard() throws Exception {
assertQueryEquals("term*", null, "term*"); assertQueryEquals("term*", null, "term*");
assertQueryEquals("term*^2", null, "term*^2.0"); assertQueryEquals("term*^2", null, "term*^2.0");
assertQueryEquals("term~", null, "term~2.0"); assertQueryEquals("term~", null, "term~2");
assertQueryEquals("term~0.7", null, "term~0.7"); assertQueryEquals("term~0.7", null, "term~1");
assertQueryEquals("term~^3", null, "term~2.0^3.0"); assertQueryEquals("term~^3", null, "term~2^3.0");
assertQueryEquals("term^3~", null, "term~2.0^3.0"); assertQueryEquals("term^3~", null, "term~2^3.0");
assertQueryEquals("term*germ", null, "term*germ"); assertQueryEquals("term*germ", null, "term*germ");
assertQueryEquals("term*germ^3", null, "term*germ^3.0"); assertQueryEquals("term*germ^3", null, "term*germ^3.0");
@ -432,10 +432,10 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
assertTrue(getQuery("term~", null) instanceof FuzzyQuery); assertTrue(getQuery("term~", null) instanceof FuzzyQuery);
assertTrue(getQuery("term~0.7", null) instanceof FuzzyQuery); assertTrue(getQuery("term~0.7", null) instanceof FuzzyQuery);
FuzzyQuery fq = (FuzzyQuery)getQuery("term~0.7", null); FuzzyQuery fq = (FuzzyQuery)getQuery("term~0.7", null);
assertEquals(0.7f, fq.getMinSimilarity(), 0.1f); assertEquals(1, fq.getMaxEdits());
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength()); assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
fq = (FuzzyQuery)getQuery("term~", null); fq = (FuzzyQuery)getQuery("term~", null);
assertEquals(2.0f, fq.getMinSimilarity(), 0.1f); assertEquals(2, fq.getMaxEdits());
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength()); assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
assertParseException("term~1.1"); // value > 1, throws exception assertParseException("term~1.1"); // value > 1, throws exception
@ -470,9 +470,9 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
assertWildcardQueryEquals("TE?M", false, "TE?M"); assertWildcardQueryEquals("TE?M", false, "TE?M");
assertWildcardQueryEquals("Te?m*gerM", false, "Te?m*gerM"); assertWildcardQueryEquals("Te?m*gerM", false, "Te?m*gerM");
// Fuzzy queries: // Fuzzy queries:
assertWildcardQueryEquals("Term~", "term~2.0"); assertWildcardQueryEquals("Term~", "term~2");
assertWildcardQueryEquals("Term~", true, "term~2.0"); assertWildcardQueryEquals("Term~", true, "term~2");
assertWildcardQueryEquals("Term~", false, "Term~2.0"); assertWildcardQueryEquals("Term~", false, "Term~2");
// Range queries: // Range queries:
assertWildcardQueryEquals("[A TO C]", "[a TO c]"); assertWildcardQueryEquals("[A TO C]", "[a TO c]");
assertWildcardQueryEquals("[A TO C]", true, "[a TO c]"); assertWildcardQueryEquals("[A TO C]", true, "[a TO c]");
@ -693,10 +693,10 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
assertQueryEquals("a:b\\\\?c", a, "a:b\\\\?c"); assertQueryEquals("a:b\\\\?c", a, "a:b\\\\?c");
assertQueryEquals("a:b\\-c~", a, "a:b-c~2.0"); assertQueryEquals("a:b\\-c~", a, "a:b-c~2");
assertQueryEquals("a:b\\+c~", a, "a:b+c~2.0"); assertQueryEquals("a:b\\+c~", a, "a:b+c~2");
assertQueryEquals("a:b\\:c~", a, "a:b:c~2.0"); assertQueryEquals("a:b\\:c~", a, "a:b:c~2");
assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2.0"); assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2");
assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]"); assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]");
assertQueryEquals("[ a\\: TO a\\~ ]", null, "[a: TO a~]"); assertQueryEquals("[ a\\: TO a\\~ ]", null, "[a: TO a~]");
@ -1271,7 +1271,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
public void testDistanceAsEditsParsing() throws Exception { public void testDistanceAsEditsParsing() throws Exception {
QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockAnalyzer(random())); QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockAnalyzer(random()));
FuzzyQuery q = (FuzzyQuery) qp.parse("foobar~2"); FuzzyQuery q = (FuzzyQuery) qp.parse("foobar~2");
assertEquals(2f, q.getMinSimilarity(), 0.0001f); assertEquals(2, q.getMaxEdits());
} }
public void testPhraseQueryToString() throws ParseException { public void testPhraseQueryToString() throws ParseException {

View File

@ -211,7 +211,7 @@ public class FuzzyLikeThisQuery extends Query
AttributeSource atts = new AttributeSource(); AttributeSource atts = new AttributeSource();
MaxNonCompetitiveBoostAttribute maxBoostAtt = MaxNonCompetitiveBoostAttribute maxBoostAtt =
atts.addAttribute(MaxNonCompetitiveBoostAttribute.class); atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
FuzzyTermsEnum fe = new FuzzyTermsEnum(MultiFields.getTerms(reader, startTerm.field()), atts, startTerm, f.minSimilarity, f.prefixLength, false); SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(MultiFields.getTerms(reader, startTerm.field()), atts, startTerm, f.minSimilarity, f.prefixLength);
//store the df so all variants use same idf //store the df so all variants use same idf
int df = reader.docFreq(startTerm); int df = reader.docFreq(startTerm);
int numVariants=0; int numVariants=0;

View File

@ -0,0 +1,204 @@
package org.apache.lucene.sandbox.queries;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.SingleTermsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanQuery; // javadocs
import org.apache.lucene.search.FuzzyQuery; // javadocs
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
/**
 * Implements the classic fuzzy search query. The similarity measurement
 * is based on the Levenshtein (edit distance) algorithm.
 * <p>
 * Note that, unlike {@link FuzzyQuery}, this query will silently allow
 * for a (possibly huge) number of edit distances in comparisons, and may
 * be extremely slow (comparing every term in the index).
 *
 * @deprecated Use {@link FuzzyQuery} instead.
 */
@Deprecated
public class SlowFuzzyQuery extends MultiTermQuery {

  public final static float defaultMinSimilarity = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
  public final static int defaultPrefixLength = 0;
  public final static int defaultMaxExpansions = 50;

  private final float minimumSimilarity;
  private final int prefixLength;

  /**
   * Computed once in the constructor from the term length and the similarity.
   * When false, {@link #getTermsEnum} only looks up the exact term.
   */
  private final boolean termLongEnough;

  protected Term term;

  /**
   * Create a new SlowFuzzyQuery that will match terms with a similarity
   * of at least <code>minimumSimilarity</code> to <code>term</code>.
   * If a <code>prefixLength</code> &gt; 0 is specified, a common prefix
   * of that length is also required.
   *
   * @param term the term to search for
   * @param minimumSimilarity a value between 0 and 1 to set the required similarity
   *  between the query term and the matching terms. For example, for a
   *  <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length
   *  as the query term is considered similar to the query term if the edit distance
   *  between both terms is less than <code>length(term)*0.5</code>
   *  <p>
   *  Alternatively, if <code>minimumSimilarity</code> is &gt;= 1f, it is interpreted
   *  as a pure Levenshtein edit distance. For example, a value of <code>2f</code>
   *  will match all terms within an edit distance of <code>2</code> from the
   *  query term. Edit distances specified in this way may not be fractional.
   *
   * @param prefixLength length of common (non-fuzzy) prefix
   * @param maxExpansions the maximum number of terms to match. If this number is
   *  greater than {@link BooleanQuery#getMaxClauseCount} when the query is rewritten,
   *  then the maxClauseCount will be used instead.
   * @throws IllegalArgumentException if minimumSimilarity is NaN, is &lt; 0, or is
   *  &gt;= 1 with a fractional part; or if prefixLength &lt; 0 or maxExpansions &lt; 0
   */
  public SlowFuzzyQuery(Term term, float minimumSimilarity, int prefixLength,
      int maxExpansions) {
    super(term.field());
    this.term = term;

    // NaN fails every comparison below, so without this check it would be
    // accepted and quietly degrade the query to an exact-term lookup.
    if (Float.isNaN(minimumSimilarity)) {
      throw new IllegalArgumentException("minimumSimilarity is NaN");
    }
    if (minimumSimilarity >= 1.0f && minimumSimilarity != (int) minimumSimilarity) {
      throw new IllegalArgumentException("fractional edit distances are not allowed");
    }
    if (minimumSimilarity < 0.0f) {
      throw new IllegalArgumentException("minimumSimilarity < 0");
    }
    if (prefixLength < 0) {
      throw new IllegalArgumentException("prefixLength < 0");
    }
    if (maxExpansions < 0) {
      throw new IllegalArgumentException("maxExpansions < 0");
    }

    setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions));

    // Length is measured in Unicode code points, not UTF-16 chars.
    String text = term.text();
    int len = text.codePointCount(0, text.length());

    // Whole-number edit distances (>= 1) always allow fuzzy matching of a
    // non-empty term; for fractional similarities the term must be longer
    // than 1 / (1 - minimumSimilarity), otherwise only the exact term is tried.
    this.termLongEnough =
        len > 0 && (minimumSimilarity >= 1f || len > 1.0f / (1.0f - minimumSimilarity));

    this.minimumSimilarity = minimumSimilarity;
    this.prefixLength = prefixLength;
  }

  /**
   * Calls {@link #SlowFuzzyQuery(Term, float, int, int) SlowFuzzyQuery(term, minimumSimilarity, prefixLength, defaultMaxExpansions)}.
   */
  public SlowFuzzyQuery(Term term, float minimumSimilarity, int prefixLength) {
    this(term, minimumSimilarity, prefixLength, defaultMaxExpansions);
  }

  /**
   * Calls {@link #SlowFuzzyQuery(Term, float, int, int) SlowFuzzyQuery(term, minimumSimilarity, 0, defaultMaxExpansions)}.
   */
  public SlowFuzzyQuery(Term term, float minimumSimilarity) {
    this(term, minimumSimilarity, defaultPrefixLength, defaultMaxExpansions);
  }

  /**
   * Calls {@link #SlowFuzzyQuery(Term, float, int, int) SlowFuzzyQuery(term, defaultMinSimilarity, 0, defaultMaxExpansions)}.
   */
  public SlowFuzzyQuery(Term term) {
    this(term, defaultMinSimilarity, defaultPrefixLength, defaultMaxExpansions);
  }

  /**
   * Returns the minimum similarity that is required for this query to match,
   * exactly as it was passed to the constructor.
   * @return either a fractional similarity in the range [0, 1), or a
   *  whole-number edit distance when the value is &gt;= 1
   */
  public float getMinSimilarity() {
    return minimumSimilarity;
  }

  /**
   * Returns the non-fuzzy prefix length. This is the number of characters at the start
   * of a term that must be identical (not fuzzy) to the query term if the query
   * is to match that term.
   */
  public int getPrefixLength() {
    return prefixLength;
  }

  @Override
  protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
    if (!termLongEnough) {  // can only match if it's exact
      return new SingleTermsEnum(terms.iterator(null), term.bytes());
    }
    return new SlowFuzzyTermsEnum(terms, atts, getTerm(), minimumSimilarity, prefixLength);
  }

  /**
   * Returns the pattern term.
   */
  public Term getTerm() {
    return term;
  }

  /**
   * Renders the query as <code>[field:]text~minimumSimilarity[boost]</code>;
   * the field is omitted when it equals the supplied default field.
   */
  @Override
  public String toString(String field) {
    final StringBuilder buffer = new StringBuilder();
    if (!term.field().equals(field)) {
      buffer.append(term.field());
      buffer.append(":");
    }
    buffer.append(term.text());
    buffer.append('~');
    buffer.append(Float.toString(minimumSimilarity));
    buffer.append(ToStringUtils.boost(getBoost()));
    return buffer.toString();
  }

  @Override
  public int hashCode() {
    final int prime = 31;
    int result = super.hashCode();
    result = prime * result + Float.floatToIntBits(minimumSimilarity);
    result = prime * result + prefixLength;
    result = prime * result + ((term == null) ? 0 : term.hashCode());
    return result;
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!super.equals(obj)) {
      return false;
    }
    if (getClass() != obj.getClass()) {
      return false;
    }
    SlowFuzzyQuery other = (SlowFuzzyQuery) obj;
    // Compare by bit pattern so the result stays consistent with hashCode().
    if (Float.floatToIntBits(minimumSimilarity) != Float.floatToIntBits(other.minimumSimilarity)) {
      return false;
    }
    if (prefixLength != other.prefixLength) {
      return false;
    }
    if (term == null) {
      return other.term == null;
    }
    return term.equals(other.term);
  }
}

View File

@ -0,0 +1,249 @@
package org.apache.lucene.sandbox.queries;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.search.FuzzyTermsEnum;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.UnicodeUtil;
/**
 * Classic fuzzy TermsEnum for enumerating all terms that are similar
 * to the specified filter term.
 *
 * <p>Term enumerations are always ordered by
 * {@link #getComparator}.  Each term in the enumeration is
 * greater than all that precede it.</p>
 *
 * @deprecated Use {@link FuzzyTermsEnum} instead.
 */
@Deprecated
public final class SlowFuzzyTermsEnum extends FuzzyTermsEnum {

  /**
   * Delegates to the {@link FuzzyTermsEnum} constructor, always passing
   * <code>false</code> as its last argument (presumably the transpositions
   * flag; confirm against FuzzyTermsEnum).
   */
  public SlowFuzzyTermsEnum(Terms terms, AttributeSource atts, Term term,
      float minSimilarity, int prefixLength) throws IOException {
    super(terms, atts, term, minSimilarity, prefixLength, false);
  }

  /**
   * Switches to the enum returned by <code>getAutomatonEnum</code> when there
   * is one; when it returns null and this is the initial call, falls back to
   * the brute-force {@link LinearFuzzyTermsEnum}. Otherwise the current enum
   * is left in place.
   */
  @Override
  protected void maxEditDistanceChanged(BytesRef lastTerm, int maxEdits, boolean init)
      throws IOException {
    final TermsEnum automatonEnum = getAutomatonEnum(maxEdits, lastTerm);
    if (automatonEnum != null) {
      setEnum(automatonEnum);
      return;
    }
    if (init) {
      setEnum(new LinearFuzzyTermsEnum());
    }
  }

  /**
   * Implement fuzzy enumeration with linear brute force.
   */
  private class LinearFuzzyTermsEnum extends FilteredTermsEnum {

    /* Two rows of the edit-distance matrix, kept as fields so that no array
     * has to be allocated each time similarity is called.
     */
    private int[] currentRow;
    private int[] previousRow;

    // the query text (as code points), minus the prefix
    private final int[] suffix;

    // UTF-8 form of the required prefix; candidate terms must start with it
    private final BytesRef prefixBytesRef;

    private final BoostAttribute boostAtt =
      attributes().addAttribute(BoostAttribute.class);

    // used for unicode conversion from BytesRef byte[] to int[]
    private final IntsRef utf32 = new IntsRef(20);

    /**
     * Sets up a linear scan over the enclosing enum's terms: copies the
     * non-prefix part of the query text, sizes the two matrix rows to match,
     * and sets the prefix as the initial seek term.
     *
     * @throws IOException
     */
    public LinearFuzzyTermsEnum() throws IOException {
      super(terms.iterator(null));

      this.suffix = new int[termLength - realPrefixLength];
      System.arraycopy(termText, realPrefixLength, suffix, 0, suffix.length);

      final String prefix = UnicodeUtil.newString(termText, 0, realPrefixLength);
      prefixBytesRef = new BytesRef(prefix);

      final int rowSize = this.suffix.length + 1;
      this.currentRow = new int[rowSize];
      this.previousRow = new int[rowSize];

      setInitialSeekTerm(prefixBytesRef);
    }

    /**
     * Ends the enumeration at the first term that does not start with the
     * prefix. Terms that do are accepted only when their similarity is
     * strictly greater than <code>minSimilarity</code>; the boost of an
     * accepted term is <code>(similarity - minSimilarity) * scale_factor</code>.
     */
    @Override
    protected final AcceptStatus accept(BytesRef term) {
      if (!StringHelper.startsWith(term, prefixBytesRef)) {
        return AcceptStatus.END;
      }

      UnicodeUtil.UTF8toUTF32(term, utf32);
      final float score =
          similarity(utf32.ints, realPrefixLength, utf32.length - realPrefixLength);
      if (score > minSimilarity) {
        boostAtt.setBoost((score - minSimilarity) * scale_factor);
        return AcceptStatus.YES;
      }
      return AcceptStatus.NO;
    }

    /******************************
     * Compute Levenshtein distance
     ******************************/

    /**
     * <p>Computes how similar a candidate term is to the query text, both
     * taken without the shared prefix. The result is 1.0f or less and may be
     * negative.</p>
     *
     * <p>When the edit distance can be shown to exceed the maximum distance
     * from {@link #calculateMaxDistance}, the computation is abandoned and
     * <code>Float.NEGATIVE_INFINITY</code> is returned. Otherwise the result is
     * <pre>
     *    1 - (editDistance / length)</pre>
     * where length is <code>realPrefixLength</code> plus the length of the
     * shorter of the two compared texts, and editDistance is the Levenshtein
     * distance: the number of character deletions, insertions or
     * substitutions required to transform one text into the other.</p>
     *
     * <p>If either compared text is empty, the distance is the length of the
     * other one and the result is <code>1 - (thatLength / realPrefixLength)</code>,
     * or 0.0f when there is no prefix either.</p>
     *
     * @param target code points of the candidate term
     * @param offset index in <code>target</code> at which the compared part starts
     * @param length number of code points of <code>target</code> to compare
     * @return the similarity, or <code>Float.NEGATIVE_INFINITY</code> when the
     *  candidate is certainly too far away
     */
    private final float similarity(final int[] target, int offset, int length) {
      final int targetLen = length;
      final int suffixLen = suffix.length;

      if (suffixLen == 0 || targetLen == 0) {
        // One side is empty, so the distance is simply the length of the other side.
        final int otherLen = (suffixLen == 0) ? targetLen : suffixLen;
        if (realPrefixLength == 0) {
          return 0.0f;
        }
        return 1.0f - ((float) otherLen / realPrefixLength);
      }

      final int maxDistance = calculateMaxDistance(targetLen);

      if (Math.abs(targetLen - suffixLen) > maxDistance) {
        // The length difference alone is a lower bound on the edit distance:
        // "pre" (3) versus "prefixes" (8) needs at least 5 edits. If that
        // already exceeds the limit the candidate can be discarded unseen.
        return Float.NEGATIVE_INFINITY;
      }

      // first row: distance from the empty target to each prefix of the text
      for (int col = 0; col <= suffixLen; ++col) {
        previousRow[col] = col;
      }

      for (int row = 1; row <= targetLen; ++row) { // one row per target character
        final int targetChar = target[offset + row - 1];
        int rowMinimum = targetLen;
        currentRow[0] = row;

        for (int col = 1; col <= suffixLen; ++col) { // one column per text character
          // insertion/deletion: cell to the left or above, plus one
          final int viaGap = Math.min(currentRow[col - 1], previousRow[col]) + 1;
          // diagonal: free when the characters agree, otherwise a substitution
          final int viaDiagonal =
              previousRow[col - 1] + (targetChar == suffix[col - 1] ? 0 : 1);
          currentRow[col] = Math.min(viaGap, viaDiagonal);
          rowMinimum = Math.min(rowMinimum, currentRow[col]);
        }

        // The smallest value in the finished row is the best this candidate can
        // still achieve. Equal to the limit is okay, greater is not.
        if (row > maxDistance && rowMinimum > maxDistance) {
          return Float.NEGATIVE_INFINITY;
        }

        // the row just computed becomes the 'previous row' of the next pass
        final int[] swap = previousRow;
        previousRow = currentRow;
        currentRow = swap;
      }

      // Because of the swap at the end of the loop, previousRow holds the most
      // recent costs. The result is negative when the edit distance is larger
      // than the compared length.
      final int comparedLength = realPrefixLength + Math.min(suffixLen, targetLen);
      return 1.0f - ((float) previousRow[suffixLen] / (float) comparedLength);
    }

    /**
     * The max Distance is the maximum Levenshtein distance for the text
     * compared to some other value that results in score that is
     * better than the minimum similarity.
     * @param m the length of the "other value"
     * @return <code>maxEdits</code> when <code>raw</code> is set; otherwise the
     *  smaller of <code>maxEdits</code> and
     *  <code>(1 - minSimilarity) * (shorter length + realPrefixLength)</code>
     *  truncated to an int
     */
    private int calculateMaxDistance(int m) {
      if (raw) {
        return maxEdits;
      }
      final int comparedLength = Math.min(suffix.length, m) + realPrefixLength;
      final int similarityBound = (int) ((1 - minSimilarity) * comparedLength);
      return Math.min(maxEdits, similarityBound);
    }
  }
}

View File

@ -0,0 +1,468 @@
package org.apache.lucene.sandbox.queries;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.List;
import java.util.Arrays;
import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
/**
* Tests {@link SlowFuzzyQuery}.
*
*/
public class TestSlowFuzzyQuery extends LuceneTestCase {
/**
 * Checks hit counts and ranking of {@link SlowFuzzyQuery} with the default
 * similarity over a small index of five-letter terms, for several query
 * terms and required prefix lengths.
 */
public void testFuzziness() throws Exception {
  Directory directory = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
  final String[] indexedTerms = {"aaaaa", "aaaab", "aaabb", "aabbb", "abbbb", "bbbbb", "ddddd"};
  for (String indexedTerm : indexedTerms) {
    addDoc(indexedTerm, writer);
  }

  IndexReader reader = writer.getReader();
  IndexSearcher searcher = newSearcher(reader);
  writer.close();

  SlowFuzzyQuery query;
  ScoreDoc[] hits;

  // "aaaaa" with a growing required prefix (index = prefix length)
  final int[] hitCountsForAaaaa = {3, 3, 3, 3, 2, 1, 1};
  for (int prefix = 0; prefix < hitCountsForAaaaa.length; prefix++) {
    query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, prefix);
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals("prefixLength=" + prefix, hitCountsForAaaaa[prefix], hits.length);
  }

  // test scoring
  query = new SlowFuzzyQuery(new Term("field", "bbbbb"), SlowFuzzyQuery.defaultMinSimilarity, 0);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals("3 documents should match", 3, hits.length);
  String[] order = {"bbbbb", "abbbb", "aabbb"};
  for (int i = 0; i < hits.length; i++) {
    assertEquals(order[i], searcher.doc(hits[i].doc).get("field"));
  }

  // test pq size by supplying maxExpansions=2
  // This query would normally return 3 documents, because 3 terms match (see above):
  query = new SlowFuzzyQuery(new Term("field", "bbbbb"), SlowFuzzyQuery.defaultMinSimilarity, 0, 2);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals("only 2 documents should match", 2, hits.length);
  order = new String[] {"bbbbb", "abbbb"};
  for (int i = 0; i < hits.length; i++) {
    assertEquals(order[i], searcher.doc(hits[i].doc).get("field"));
  }

  // not similar enough:
  query = new SlowFuzzyQuery(new Term("field", "xxxxx"), SlowFuzzyQuery.defaultMinSimilarity, 0);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(0, hits.length);
  query = new SlowFuzzyQuery(new Term("field", "aaccc"), SlowFuzzyQuery.defaultMinSimilarity, 0); // edit distance to "aaaaa" = 3
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(0, hits.length);

  // default allows for up to two edits:
  final String[] withinTwoEdits = {"aaaaa", "aaaab", "aaabb"};

  // query identical to a word in the index:
  query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 0);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(withinTwoEdits.length, hits.length);
  for (int i = 0; i < withinTwoEdits.length; i++) {
    assertEquals(withinTwoEdits[i], searcher.doc(hits[i].doc).get("field"));
  }

  // query similar to a word in the index, without and then with prefix
  // (index = prefix length, value = expected terms in rank order)
  final String[][] expectedForAaaac = {
    withinTwoEdits, withinTwoEdits, withinTwoEdits, withinTwoEdits, {"aaaaa", "aaaab"}, {}
  };
  for (int prefix = 0; prefix < expectedForAaaac.length; prefix++) {
    final String[] expected = expectedForAaaac[prefix];
    query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, prefix);
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals("prefixLength=" + prefix, expected.length, hits.length);
    for (int i = 0; i < expected.length; i++) {
      assertEquals(expected[i], searcher.doc(hits[i].doc).get("field"));
    }
  }

  // same for "ddddX", which is one edit away from "ddddd"
  final String[] onlyDdddd = {"ddddd"};
  final String[][] expectedForDdddX = {
    onlyDdddd, onlyDdddd, onlyDdddd, onlyDdddd, onlyDdddd, {}
  };
  for (int prefix = 0; prefix < expectedForDdddX.length; prefix++) {
    final String[] expected = expectedForDdddX[prefix];
    query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, prefix);
    hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals("prefixLength=" + prefix, expected.length, hits.length);
    for (int i = 0; i < expected.length; i++) {
      assertEquals(expected[i], searcher.doc(hits[i].doc).get("field"));
    }
  }

  // different field = no match:
  query = new SlowFuzzyQuery(new Term("anotherfield", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 0);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(0, hits.length);

  reader.close();
  directory.close();
}
/**
 * Exercises fractional minimum similarities (0.5 and 0.6) on seven-letter
 * terms, and checks that out-of-range similarities are rejected by the
 * constructor.
 */
public void testFuzzinessLong() throws Exception {
  Directory directory = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
  addDoc("aaaaaaa", writer);
  addDoc("segment", writer);

  IndexReader reader = writer.getReader();
  IndexSearcher searcher = newSearcher(reader);
  writer.close();

  SlowFuzzyQuery query;
  // not similar enough:
  query = new SlowFuzzyQuery(new Term("field", "xxxxx"), 0.5f, 0);
  ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(0, hits.length);
  // edit distance to "aaaaaaa" = 3, this matches because the string is longer than
  // in testFuzziness so a bigger difference is allowed:
  query = new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 0);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(1, hits.length);
  assertEquals("aaaaaaa", searcher.doc(hits[0].doc).get("field"));

  // now with prefix
  query = new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 1);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(1, hits.length);
  assertEquals("aaaaaaa", searcher.doc(hits[0].doc).get("field"));
  query = new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 4);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(1, hits.length);
  assertEquals("aaaaaaa", searcher.doc(hits[0].doc).get("field"));
  query = new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 5);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(0, hits.length);

  // no match, more than half of the characters is wrong:
  query = new SlowFuzzyQuery(new Term("field", "aaacccc"), 0.5f, 0);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(0, hits.length);

  // now with prefix
  query = new SlowFuzzyQuery(new Term("field", "aaacccc"), 0.5f, 2);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(0, hits.length);

  // "student" and "stellent" are indeed similar to "segment" by default:
  query = new SlowFuzzyQuery(new Term("field", "student"), 0.5f, 0);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(1, hits.length);
  query = new SlowFuzzyQuery(new Term("field", "stellent"), 0.5f, 0);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(1, hits.length);

  // now with prefix
  query = new SlowFuzzyQuery(new Term("field", "student"), 0.5f, 1);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(1, hits.length);
  query = new SlowFuzzyQuery(new Term("field", "stellent"), 0.5f, 1);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(1, hits.length);
  query = new SlowFuzzyQuery(new Term("field", "student"), 0.5f, 2);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(0, hits.length);
  query = new SlowFuzzyQuery(new Term("field", "stellent"), 0.5f, 2);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(0, hits.length);

  // "student" doesn't match anymore thanks to increased minimum similarity:
  query = new SlowFuzzyQuery(new Term("field", "student"), 0.6f, 0);
  hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals(0, hits.length);

  // a value >= 1 is an edit distance and must be a whole number
  try {
    query = new SlowFuzzyQuery(new Term("field", "student"), 1.1f);
    fail("Expected IllegalArgumentException");
  } catch (IllegalArgumentException expected) {
    // expecting exception
  }
  // negative similarities are never valid
  try {
    query = new SlowFuzzyQuery(new Term("field", "student"), -0.1f);
    fail("Expected IllegalArgumentException");
  } catch (IllegalArgumentException expected) {
    // expecting exception
  }

  reader.close();
  directory.close();
}
/**
 * MultiTermQuery exposes, through an attribute, which values have to be
 * competitive in order to enter the priority queue.
 *
 * SlowFuzzyQuery optimizes itself around that information, so a wrongly
 * implemented attribute shows up as wrong results here. Two indexes with
 * terms that are all one edit away from the query are searched through a
 * MultiReader, with maxExpansions limited to 2.
 */
public void testTieBreaker() throws Exception {
  Directory firstDir = newDirectory();
  RandomIndexWriter firstWriter = new RandomIndexWriter(random(), firstDir);
  for (String value : new String[] {"a123456", "c123456", "d123456", "e123456"}) {
    addDoc(value, firstWriter);
  }

  Directory secondDir = newDirectory();
  RandomIndexWriter secondWriter = new RandomIndexWriter(random(), secondDir);
  for (String value : new String[] {"a123456", "b123456", "b123456", "b123456", "c123456", "f123456"}) {
    addDoc(value, secondWriter);
  }

  IndexReader firstReader = firstWriter.getReader();
  IndexReader secondReader = secondWriter.getReader();

  MultiReader combined = new MultiReader(firstReader, secondReader);
  IndexSearcher searcher = newSearcher(combined);

  SlowFuzzyQuery fuzzy = new SlowFuzzyQuery(new Term("field", "z123456"), 1f, 0, 2);
  TopDocs top = searcher.search(fuzzy, 2);
  assertEquals(5, top.totalHits); // 5 docs, from the a and b's

  combined.close();
  firstReader.close();
  secondReader.close();
  firstWriter.close();
  secondWriter.close();
  firstDir.close();
  secondDir.close();
}
/**
 * Runs 0.9-similarity queries with terms below, at and above ten characters
 * and checks the number of hits for each.
 */
public void testTokenLengthOpt() throws IOException {
  Directory directory = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
  addDoc("12345678911", writer);
  addDoc("segment", writer);

  IndexReader reader = writer.getReader();
  IndexSearcher searcher = newSearcher(reader);
  writer.close();

  final String[] queryTexts = {
    "1234569",      // term not over 10 chars, so optimization shortcuts
    "1234567891",   // 10 chars, so no optimization
    "12345678911",  // over 10 chars, so no optimization
    "sdfsdfsdfsdf", // over 10 chars, no match
  };
  final int[] expectedHits = {0, 0, 1, 0};

  for (int i = 0; i < queryTexts.length; i++) {
    Query query = new SlowFuzzyQuery(new Term("field", queryTexts[i]), 0.9f);
    ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(expectedHits[i], hits.length);
  }

  reader.close();
  directory.close();
}
/**
 * Exercises the TopTermsBoostOnlyBooleanQueryRewrite rewrite method: the two
 * "Lucene" documents must be returned ahead of the single "Lucenne" one.
 */
public void testBoostOnlyRewrite() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir);
  for (String text : new String[] {"Lucene", "Lucene", "Lucenne"}) {
    addDoc(text, indexWriter);
  }

  IndexReader indexReader = indexWriter.getReader();
  IndexSearcher indexSearcher = newSearcher(indexReader);
  indexWriter.close();

  SlowFuzzyQuery fuzzy = new SlowFuzzyQuery(new Term("field", "lucene"));
  fuzzy.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(50));
  ScoreDoc[] ranked = indexSearcher.search(fuzzy, null, 1000).scoreDocs;
  assertEquals(3, ranked.length);

  // normally, 'Lucenne' would be the first result as IDF will skew the score.
  String[] expectedOrder = {"Lucene", "Lucene", "Lucenne"};
  for (int i = 0; i < expectedOrder.length; i++) {
    assertEquals(expectedOrder[i], indexReader.document(ranked[i].doc).get("field"));
  }

  indexReader.close();
  dir.close();
}
/**
 * Searches a small index of multi-word titles for the term "giga" with a
 * minimum similarity of 0.9 and expects "Giga byte" to be the only hit
 * among the top ten results.
 */
public void testGiga() throws Exception {
  Directory index = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), index);

  addDoc("Lucene in Action", w);
  addDoc("Lucene for Dummies", w);
  addDoc("Giga byte", w);
  addDoc("ManagingGigabytesManagingGigabyte", w);
  addDoc("ManagingGigabytesManagingGigabytes", w);
  addDoc("The Art of Computer Science", w);
  addDoc("J. K. Rowling", w);
  addDoc("JK Rowling", w);
  addDoc("Joanne K Roling", w);
  addDoc("Bruce Willis", w);
  addDoc("Willis bruce", w);
  addDoc("Brute willis", w);
  addDoc("B. willis", w);

  IndexReader r = w.getReader();
  w.close();

  Query q = new SlowFuzzyQuery(new Term("field", "giga"), 0.9f);

  // Run the search and verify the single expected match.
  IndexSearcher searcher = newSearcher(r);
  ScoreDoc[] hits = searcher.search(q, 10).scoreDocs;
  assertEquals(1, hits.length);
  assertEquals("Giga byte", searcher.doc(hits[0].doc).get("field"));

  r.close();
  index.close();
}
/**
 * Searches an index of "foobar", "test" and "working" using similarity
 * arguments of 2, 3, 4 and 6 and checks which documents come back.
 *
 * NOTE(review): values of 1 or more are presumably interpreted by
 * SlowFuzzyQuery as a maximum number of edits -- confirm against the query.
 */
public void testDistanceAsEditsSearching() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir);
  for (String text : new String[] {"foobar", "test", "working"}) {
    addDoc(text, indexWriter);
  }
  IndexReader indexReader = indexWriter.getReader();
  IndexSearcher indexSearcher = newSearcher(indexReader);
  indexWriter.close();

  SlowFuzzyQuery fuzzy = new SlowFuzzyQuery(new Term("field", "fouba"), 2);
  ScoreDoc[] found = indexSearcher.search(fuzzy, 10).scoreDocs;
  assertEquals(1, found.length);
  assertEquals("foobar", indexSearcher.doc(found[0].doc).get("field"));

  fuzzy = new SlowFuzzyQuery(new Term("field", "foubara"), 2);
  found = indexSearcher.search(fuzzy, 10).scoreDocs;
  assertEquals(1, found.length);
  assertEquals("foobar", indexSearcher.doc(found[0].doc).get("field"));

  fuzzy = new SlowFuzzyQuery(new Term("field", "t"), 3);
  found = indexSearcher.search(fuzzy, 10).scoreDocs;
  assertEquals(1, found.length);
  assertEquals("test", indexSearcher.doc(found[0].doc).get("field"));

  fuzzy = new SlowFuzzyQuery(new Term("field", "a"), 4f, 0, 50);
  found = indexSearcher.search(fuzzy, 10).scoreDocs;
  assertEquals(1, found.length);
  assertEquals("test", indexSearcher.doc(found[0].doc).get("field"));

  // With the largest argument both "test" and "foobar" match, in that order.
  fuzzy = new SlowFuzzyQuery(new Term("field", "a"), 6f, 0, 50);
  found = indexSearcher.search(fuzzy, 10).scoreDocs;
  assertEquals(2, found.length);
  assertEquals("test", indexSearcher.doc(found[0].doc).get("field"));
  assertEquals("foobar", indexSearcher.doc(found[1].doc).get("field"));

  indexReader.close();
  dir.close();
}
/**
 * Indexes one document whose only field, named "field", holds the given
 * text as a stored text field.
 *
 * @param text   value to place in the "field" field
 * @param writer index writer that receives the new document
 * @throws IOException if the writer fails to add the document
 */
private void addDoc(String text, RandomIndexWriter writer) throws IOException {
  final Document single = new Document();
  single.add(newField("field", text, TextField.TYPE_STORED));
  writer.addDocument(single);
}
}

View File

@ -1,4 +1,4 @@
package org.apache.lucene.search; package org.apache.lucene.sandbox.queries;
/** /**
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
@ -29,6 +29,9 @@ import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
@ -55,7 +58,7 @@ import org.apache.lucene.util.LuceneTestCase;
* *
* results line: comma-separated docID, score pair * results line: comma-separated docID, score pair
**/ **/
public class TestFuzzyQuery2 extends LuceneTestCase { public class TestSlowFuzzyQuery2 extends LuceneTestCase {
/** epsilon for score comparisons */ /** epsilon for score comparisons */
static final float epsilon = 0.00001f; static final float epsilon = 0.00001f;
@ -115,7 +118,7 @@ public class TestFuzzyQuery2 extends LuceneTestCase {
int prefix = Integer.parseInt(params[1]); int prefix = Integer.parseInt(params[1]);
int pqSize = Integer.parseInt(params[2]); int pqSize = Integer.parseInt(params[2]);
float minScore = Float.parseFloat(params[3]); float minScore = Float.parseFloat(params[3]);
FuzzyQuery q = new FuzzyQuery(new Term("field", query), minScore, prefix); SlowFuzzyQuery q = new SlowFuzzyQuery(new Term("field", query), minScore, prefix);
q.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(pqSize)); q.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(pqSize));
int expectedResults = Integer.parseInt(reader.readLine()); int expectedResults = Integer.parseInt(reader.readLine());
TopDocs docs = searcher.search(q, expectedResults); TopDocs docs = searcher.search(q, expectedResults);