mirror of https://github.com/apache/lucene.git
LUCENE-5033: SlowFuzzyQuery was accepting too many documents when the provided minSimilarity is an int > 1
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1491088 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
92ec6ec337
commit
b9dc8ef8f6
|
@ -138,6 +138,9 @@ Bug Fixes
|
|||
* LUCENE-4933: SweetSpotSimilarity didn't apply its tf function to some
|
||||
queries (SloppyPhraseQuery, SpanQueries). (Robert Muir)
|
||||
|
||||
* LUCENE-5033: SlowFuzzyQuery was accepting too many terms (documents) when
|
||||
the provided minSimilarity is an int > 1 (Tim Allison via Mike McCandless)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-4936: Improve numeric doc values compression in case all values share
|
||||
|
|
|
@ -31,9 +31,12 @@ import org.apache.lucene.util.IntsRef;
|
|||
import org.apache.lucene.util.StringHelper;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
|
||||
/** Classic fuzzy TermsEnum for enumerating all terms that are similar
|
||||
/** Potentially slow fuzzy TermsEnum for enumerating all terms that are similar
|
||||
* to the specified filter term.
|
||||
*
|
||||
* <p> If the minSimilarity or maxEdits is greater than the Automaton's
|
||||
* allowable range, this backs off to the classic (brute force)
|
||||
* fuzzy terms enum method by calling FuzzyTermsEnum's getAutomatonEnum.
|
||||
* </p>
|
||||
* <p>Term enumerations are always ordered by
|
||||
* {@link #getComparator}. Each term in the enumeration is
|
||||
* greater than all that precede it.</p>
|
||||
|
@ -103,18 +106,43 @@ public final class SlowFuzzyTermsEnum extends FuzzyTermsEnum {
|
|||
private final IntsRef utf32 = new IntsRef(20);
|
||||
|
||||
/**
|
||||
* The termCompare method in FuzzyTermEnum uses Levenshtein distance to
|
||||
* <p>The termCompare method in FuzzyTermEnum uses Levenshtein distance to
|
||||
* calculate the distance between the given term and the comparing term.
|
||||
* </p>
|
||||
* <p>If the minSimilarity is &gt;= 1.0, this uses the maxEdits as the comparison.
|
||||
* Otherwise, this method uses the following logic to calculate similarity.
|
||||
* <pre>
|
||||
* similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
|
||||
* </pre>
|
||||
* where distance is the Levenshtein distance for the two words.
|
||||
* </p>
|
||||
*
|
||||
*/
|
||||
@Override
|
||||
protected final AcceptStatus accept(BytesRef term) {
|
||||
if (StringHelper.startsWith(term, prefixBytesRef)) {
|
||||
UnicodeUtil.UTF8toUTF32(term, utf32);
|
||||
final float similarity = similarity(utf32.ints, realPrefixLength, utf32.length - realPrefixLength);
|
||||
if (similarity > minSimilarity) {
|
||||
final int distance = calcDistance(utf32.ints, realPrefixLength, utf32.length - realPrefixLength);
|
||||
|
||||
//Integer.MIN_VALUE is the sentinel that Levenshtein stopped early
|
||||
if (distance == Integer.MIN_VALUE){
|
||||
return AcceptStatus.NO;
|
||||
}
|
||||
//no need to calc similarity, if raw is true and distance > maxEdits
|
||||
if (raw == true && distance > maxEdits){
|
||||
return AcceptStatus.NO;
|
||||
}
|
||||
final float similarity = calcSimilarity(distance, (utf32.length - realPrefixLength), text.length);
|
||||
|
||||
//if raw is true, then distance must also be <= maxEdits by now
|
||||
//given the previous if statement
|
||||
if (raw == true ||
|
||||
(raw == false && similarity > minSimilarity)) {
|
||||
boostAtt.setBoost((similarity - minSimilarity) * scale_factor);
|
||||
return AcceptStatus.YES;
|
||||
} else return AcceptStatus.NO;
|
||||
} else {
|
||||
return AcceptStatus.NO;
|
||||
}
|
||||
} else {
|
||||
return AcceptStatus.END;
|
||||
}
|
||||
|
@ -125,52 +153,34 @@ public final class SlowFuzzyTermsEnum extends FuzzyTermsEnum {
|
|||
******************************/
|
||||
|
||||
/**
|
||||
* <p>Similarity returns a number that is 1.0f or less (including negative numbers)
|
||||
* based on how similar the Term is compared to a target term. It returns
|
||||
* exactly 0.0f when
|
||||
* <pre>
|
||||
* editDistance > maximumEditDistance</pre>
|
||||
* Otherwise it returns:
|
||||
* <pre>
|
||||
* 1 - (editDistance / length)</pre>
|
||||
* where length is the length of the shortest term (text or target) including a
|
||||
* prefix that are identical and editDistance is the Levenshtein distance for
|
||||
* the two words.</p>
|
||||
*
|
||||
* <p>calcDistance returns the Levenshtein distance between the query term
|
||||
* and the target term.</p>
|
||||
*
|
||||
* <p>Embedded within this algorithm is a fail-fast Levenshtein distance
|
||||
* algorithm. The fail-fast algorithm differs from the standard Levenshtein
|
||||
* distance algorithm in that it is aborted if it is discovered that the
|
||||
* minimum distance between the words is greater than some threshold.
|
||||
*
|
||||
* <p>To calculate the maximum distance threshold we use the following formula:
|
||||
* <pre>
|
||||
* (1 - minimumSimilarity) * length</pre>
|
||||
* where length is the shortest term including any prefix that is not part of the
|
||||
* similarity comparison. This formula was derived by solving for what maximum value
|
||||
* of distance returns false for the following statements:
|
||||
* <pre>
|
||||
* similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
|
||||
* return (similarity > minimumSimilarity);</pre>
|
||||
* where distance is the Levenshtein distance for the two words.
|
||||
* </p>
|
||||
|
||||
* <p>Levenshtein distance (also known as edit distance) is a measure of similarity
|
||||
* between two strings where the distance is measured as the number of character
|
||||
* deletions, insertions or substitutions required to transform one string to
|
||||
* the other string.
|
||||
* @param target the target word or phrase
|
||||
* @return the similarity, 0.0 or less indicates that it matches less than the required
|
||||
* threshold and 1.0 indicates that the text and target are identical
|
||||
* @param offset the offset at which to start the comparison
|
||||
* @param length the length of what's left of the string to compare
|
||||
* @return the number of edits or Integer.MIN_VALUE if the edit distance is
|
||||
* greater than maxDistance.
|
||||
*/
|
||||
private final float similarity(final int[] target, int offset, int length) {
|
||||
private final int calcDistance(final int[] target, int offset, int length) {
|
||||
final int m = length;
|
||||
final int n = text.length;
|
||||
if (n == 0) {
|
||||
//we don't have anything to compare. That means if we just add
|
||||
//the letters for m we get the new word
|
||||
return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) m / realPrefixLength);
|
||||
return m;
|
||||
}
|
||||
if (m == 0) {
|
||||
return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) n / realPrefixLength);
|
||||
return n;
|
||||
}
|
||||
|
||||
final int maxDistance = calculateMaxDistance(m);
|
||||
|
@ -183,7 +193,7 @@ public final class SlowFuzzyTermsEnum extends FuzzyTermsEnum {
|
|||
//which is 8-3 or more precisely Math.abs(3-8).
|
||||
//if our maximum edit distance is 4, then we can discard this word
|
||||
//without looking at it.
|
||||
return Float.NEGATIVE_INFINITY;
|
||||
return Integer.MIN_VALUE;
|
||||
}
|
||||
|
||||
// init matrix d
|
||||
|
@ -214,7 +224,7 @@ public final class SlowFuzzyTermsEnum extends FuzzyTermsEnum {
|
|||
if (j > maxDistance && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater
|
||||
//the closest the target can be to the text is just too far away.
|
||||
//this target is leaving the party early.
|
||||
return Float.NEGATIVE_INFINITY;
|
||||
return Integer.MIN_VALUE;
|
||||
}
|
||||
|
||||
// copy current distance counts to 'previous row' distance counts: swap p and d
|
||||
|
@ -226,12 +236,17 @@ public final class SlowFuzzyTermsEnum extends FuzzyTermsEnum {
|
|||
// our last action in the above loop was to switch d and p, so p now
|
||||
// actually has the most recent cost counts
|
||||
|
||||
return p[n];
|
||||
}
|
||||
|
||||
private float calcSimilarity(int edits, int m, int n){
|
||||
// this will return less than 0.0 when the edit distance is
|
||||
// greater than the number of characters in the shorter word.
|
||||
// but this was the formula that was previously used in FuzzyTermEnum,
|
||||
// so it has not been changed (even though minimumSimilarity must be
|
||||
// greater than 0.0)
|
||||
return 1.0f - ((float)p[n] / (float) (realPrefixLength + Math.min(n, m)));
|
||||
|
||||
return 1.0f - ((float)edits / (float) (realPrefixLength + Math.min(n, m)));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -43,6 +43,9 @@ import org.apache.lucene.util.LuceneTestCase;
|
|||
public class TestSlowFuzzyQuery extends LuceneTestCase {
|
||||
|
||||
public void testFuzziness() throws Exception {
|
||||
//every test with SlowFuzzyQuery.defaultMinSimilarity
|
||||
//is exercising the Automaton, not the brute force linear method
|
||||
|
||||
Directory directory = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
|
||||
addDoc("aaaaa", writer);
|
||||
|
@ -194,6 +197,30 @@ public class TestSlowFuzzyQuery extends LuceneTestCase {
|
|||
directory.close();
|
||||
}
|
||||
|
||||
public void testFuzzinessLong2() throws Exception {
|
||||
//LUCENE-5033: a minSimilarity >= 1 is a maximum edit count; "abcxxxx" is 4 edits from "abcdef"
|
||||
Directory directory = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
|
||||
addDoc("abcdef", writer);
|
||||
addDoc("segment", writer);
|
||||
|
||||
IndexReader reader = writer.getReader();
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
writer.close();
|
||||
|
||||
SlowFuzzyQuery query;
|
||||
|
||||
query = new SlowFuzzyQuery(new Term("field", "abcxxxx"), 3f, 0);
|
||||
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
|
||||
assertEquals(0, hits.length);
|
||||
|
||||
query = new SlowFuzzyQuery(new Term("field", "abcxxxx"), 4f, 0);
|
||||
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||
assertEquals(1, hits.length);
|
||||
reader.close();
|
||||
directory.close();
|
||||
}
|
||||
|
||||
public void testFuzzinessLong() throws Exception {
|
||||
Directory directory = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
|
||||
|
@ -385,7 +412,6 @@ public class TestSlowFuzzyQuery extends LuceneTestCase {
|
|||
|
||||
public void testGiga() throws Exception {
|
||||
|
||||
MockAnalyzer analyzer = new MockAnalyzer(random());
|
||||
Directory index = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random(), index);
|
||||
|
||||
|
@ -440,25 +466,21 @@ public class TestSlowFuzzyQuery extends LuceneTestCase {
|
|||
assertEquals(1, hits.length);
|
||||
assertEquals("foobar", searcher.doc(hits[0].doc).get("field"));
|
||||
|
||||
// TODO: cannot really be supported given the legacy scoring
|
||||
// system which scores negative, if the distance > min term len,
|
||||
// so such matches were always impossible with lucene 3.x, etc
|
||||
//
|
||||
//q = new SlowFuzzyQuery(new Term("field", "t"), 3);
|
||||
//hits = searcher.search(q, 10).scoreDocs;
|
||||
//assertEquals(1, hits.length);
|
||||
//assertEquals("test", searcher.doc(hits[0].doc).get("field"));
|
||||
q = new SlowFuzzyQuery(new Term("field", "t"), 3);
|
||||
hits = searcher.search(q, 10).scoreDocs;
|
||||
assertEquals(1, hits.length);
|
||||
assertEquals("test", searcher.doc(hits[0].doc).get("field"));
|
||||
|
||||
// q = new SlowFuzzyQuery(new Term("field", "a"), 4f, 0, 50);
|
||||
// hits = searcher.search(q, 10).scoreDocs;
|
||||
// assertEquals(1, hits.length);
|
||||
// assertEquals("test", searcher.doc(hits[0].doc).get("field"));
|
||||
q = new SlowFuzzyQuery(new Term("field", "a"), 4f, 0, 50);
|
||||
hits = searcher.search(q, 10).scoreDocs;
|
||||
assertEquals(1, hits.length);
|
||||
assertEquals("test", searcher.doc(hits[0].doc).get("field"));
|
||||
|
||||
// q = new SlowFuzzyQuery(new Term("field", "a"), 6f, 0, 50);
|
||||
// hits = searcher.search(q, 10).scoreDocs;
|
||||
// assertEquals(2, hits.length);
|
||||
// assertEquals("test", searcher.doc(hits[0].doc).get("field"));
|
||||
// assertEquals("foobar", searcher.doc(hits[1].doc).get("field"));
|
||||
q = new SlowFuzzyQuery(new Term("field", "a"), 6f, 0, 50);
|
||||
hits = searcher.search(q, 10).scoreDocs;
|
||||
assertEquals(2, hits.length);
|
||||
assertEquals("test", searcher.doc(hits[0].doc).get("field"));
|
||||
assertEquals("foobar", searcher.doc(hits[1].doc).get("field"));
|
||||
|
||||
reader.close();
|
||||
index.close();
|
||||
|
|
Loading…
Reference in New Issue