mirror of https://github.com/apache/lucene.git
LUCENE-5033: SlowFuzzyQuery was accepting too many documents when provided minSimilarity is an int > 1
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1491088 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
92ec6ec337
commit
b9dc8ef8f6
|
@ -138,6 +138,9 @@ Bug Fixes
|
||||||
* LUCENE-4933: SweetSpotSimilarity didn't apply its tf function to some
|
* LUCENE-4933: SweetSpotSimilarity didn't apply its tf function to some
|
||||||
queries (SloppyPhraseQuery, SpanQueries). (Robert Muir)
|
queries (SloppyPhraseQuery, SpanQueries). (Robert Muir)
|
||||||
|
|
||||||
|
* LUCENE-5033: SlowFuzzyQuery was accepting too many terms (documents) when
|
||||||
|
provided minSimilarity is an int > 1 (Tim Allison via Mike McCandless)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
* LUCENE-4936: Improve numeric doc values compression in case all values share
|
* LUCENE-4936: Improve numeric doc values compression in case all values share
|
||||||
|
|
|
@ -31,9 +31,12 @@ import org.apache.lucene.util.IntsRef;
|
||||||
import org.apache.lucene.util.StringHelper;
|
import org.apache.lucene.util.StringHelper;
|
||||||
import org.apache.lucene.util.UnicodeUtil;
|
import org.apache.lucene.util.UnicodeUtil;
|
||||||
|
|
||||||
/** Classic fuzzy TermsEnum for enumerating all terms that are similar
|
/** Potentially slow fuzzy TermsEnum for enumerating all terms that are similar
|
||||||
* to the specified filter term.
|
* to the specified filter term.
|
||||||
*
|
* <p> If the minSimilarity or maxEdits is greater than the Automaton's
|
||||||
|
* allowable range, this backs off to the classic (brute force)
|
||||||
|
* fuzzy terms enum method by calling FuzzyTermsEnum's getAutomatonEnum.
|
||||||
|
* </p>
|
||||||
* <p>Term enumerations are always ordered by
|
* <p>Term enumerations are always ordered by
|
||||||
* {@link #getComparator}. Each term in the enumeration is
|
* {@link #getComparator}. Each term in the enumeration is
|
||||||
* greater than all that precede it.</p>
|
* greater than all that precede it.</p>
|
||||||
|
@ -103,18 +106,43 @@ public final class SlowFuzzyTermsEnum extends FuzzyTermsEnum {
|
||||||
private final IntsRef utf32 = new IntsRef(20);
|
private final IntsRef utf32 = new IntsRef(20);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The termCompare method in FuzzyTermEnum uses Levenshtein distance to
|
* <p>This method uses Levenshtein distance to
|
||||||
* calculate the distance between the given term and the comparing term.
|
* calculate the distance between the given term and the comparing term.
|
||||||
|
* </p>
|
||||||
|
* <p>If the minSimilarity is &gt;= 1.0, the edit distance is compared directly against maxEdits.
|
||||||
|
* Otherwise, this method uses the following logic to calculate similarity.
|
||||||
|
* <pre>
|
||||||
|
* similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
|
||||||
|
* </pre>
|
||||||
|
* where distance is the Levenshtein distance for the two words.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
protected final AcceptStatus accept(BytesRef term) {
|
protected final AcceptStatus accept(BytesRef term) {
|
||||||
if (StringHelper.startsWith(term, prefixBytesRef)) {
|
if (StringHelper.startsWith(term, prefixBytesRef)) {
|
||||||
UnicodeUtil.UTF8toUTF32(term, utf32);
|
UnicodeUtil.UTF8toUTF32(term, utf32);
|
||||||
final float similarity = similarity(utf32.ints, realPrefixLength, utf32.length - realPrefixLength);
|
final int distance = calcDistance(utf32.ints, realPrefixLength, utf32.length - realPrefixLength);
|
||||||
if (similarity > minSimilarity) {
|
|
||||||
|
//Integer.MIN_VALUE is the sentinel indicating that the Levenshtein calculation stopped early
|
||||||
|
if (distance == Integer.MIN_VALUE){
|
||||||
|
return AcceptStatus.NO;
|
||||||
|
}
|
||||||
|
//no need to calc similarity, if raw is true and distance > maxEdits
|
||||||
|
if (raw == true && distance > maxEdits){
|
||||||
|
return AcceptStatus.NO;
|
||||||
|
}
|
||||||
|
final float similarity = calcSimilarity(distance, (utf32.length - realPrefixLength), text.length);
|
||||||
|
|
||||||
|
//if raw is true, then distance must also be <= maxEdits by now
|
||||||
|
//given the previous if statement
|
||||||
|
if (raw == true ||
|
||||||
|
(raw == false && similarity > minSimilarity)) {
|
||||||
boostAtt.setBoost((similarity - minSimilarity) * scale_factor);
|
boostAtt.setBoost((similarity - minSimilarity) * scale_factor);
|
||||||
return AcceptStatus.YES;
|
return AcceptStatus.YES;
|
||||||
} else return AcceptStatus.NO;
|
} else {
|
||||||
|
return AcceptStatus.NO;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
return AcceptStatus.END;
|
return AcceptStatus.END;
|
||||||
}
|
}
|
||||||
|
@ -125,52 +153,34 @@ public final class SlowFuzzyTermsEnum extends FuzzyTermsEnum {
|
||||||
******************************/
|
******************************/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p>Similarity returns a number that is 1.0f or less (including negative numbers)
|
* <p>calcDistance returns the Levenshtein distance between the query term
|
||||||
* based on how similar the Term is compared to a target term. It returns
|
* and the target term.</p>
|
||||||
* exactly 0.0f when
|
|
||||||
* <pre>
|
|
||||||
* editDistance > maximumEditDistance</pre>
|
|
||||||
* Otherwise it returns:
|
|
||||||
* <pre>
|
|
||||||
* 1 - (editDistance / length)</pre>
|
|
||||||
* where length is the length of the shortest term (text or target) including a
|
|
||||||
* prefix that are identical and editDistance is the Levenshtein distance for
|
|
||||||
* the two words.</p>
|
|
||||||
*
|
*
|
||||||
* <p>Embedded within this algorithm is a fail-fast Levenshtein distance
|
* <p>Embedded within this algorithm is a fail-fast Levenshtein distance
|
||||||
* algorithm. The fail-fast algorithm differs from the standard Levenshtein
|
* algorithm. The fail-fast algorithm differs from the standard Levenshtein
|
||||||
* distance algorithm in that it is aborted if it is discovered that the
|
* distance algorithm in that it is aborted if it is discovered that the
|
||||||
* minimum distance between the words is greater than some threshold.
|
* minimum distance between the words is greater than some threshold.
|
||||||
*
|
|
||||||
* <p>To calculate the maximum distance threshold we use the following formula:
|
|
||||||
* <pre>
|
|
||||||
* (1 - minimumSimilarity) * length</pre>
|
|
||||||
* where length is the shortest term including any prefix that is not part of the
|
|
||||||
* similarity comparison. This formula was derived by solving for what maximum value
|
|
||||||
* of distance returns false for the following statements:
|
|
||||||
* <pre>
|
|
||||||
* similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
|
|
||||||
* return (similarity > minimumSimilarity);</pre>
|
|
||||||
* where distance is the Levenshtein distance for the two words.
|
|
||||||
* </p>
|
|
||||||
* <p>Levenshtein distance (also known as edit distance) is a measure of similarity
|
* <p>Levenshtein distance (also known as edit distance) is a measure of similarity
|
||||||
* between two strings where the distance is measured as the number of character
|
* between two strings where the distance is measured as the number of character
|
||||||
* deletions, insertions or substitutions required to transform one string to
|
* deletions, insertions or substitutions required to transform one string to
|
||||||
* the other string.
|
* the other string.
|
||||||
* @param target the target word or phrase
|
* @param target the target word or phrase
|
||||||
* @return the similarity, 0.0 or less indicates that it matches less than the required
|
* @param offset the offset at which to start the comparison
|
||||||
* threshold and 1.0 indicates that the text and target are identical
|
* @param length the length of what's left of the string to compare
|
||||||
|
* @return the number of edits or Integer.MIN_VALUE if the edit distance is
|
||||||
|
* greater than maxDistance.
|
||||||
*/
|
*/
|
||||||
private final float similarity(final int[] target, int offset, int length) {
|
private final int calcDistance(final int[] target, int offset, int length) {
|
||||||
final int m = length;
|
final int m = length;
|
||||||
final int n = text.length;
|
final int n = text.length;
|
||||||
if (n == 0) {
|
if (n == 0) {
|
||||||
//we don't have anything to compare. That means if we just add
|
//we don't have anything to compare. That means if we just add
|
||||||
//the letters for m we get the new word
|
//the letters for m we get the new word
|
||||||
return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) m / realPrefixLength);
|
return m;
|
||||||
}
|
}
|
||||||
if (m == 0) {
|
if (m == 0) {
|
||||||
return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) n / realPrefixLength);
|
return n;
|
||||||
}
|
}
|
||||||
|
|
||||||
final int maxDistance = calculateMaxDistance(m);
|
final int maxDistance = calculateMaxDistance(m);
|
||||||
|
@ -183,7 +193,7 @@ public final class SlowFuzzyTermsEnum extends FuzzyTermsEnum {
|
||||||
//which is 8-3 or more precisely Math.abs(3-8).
|
//which is 8-3 or more precisely Math.abs(3-8).
|
||||||
//if our maximum edit distance is 4, then we can discard this word
|
//if our maximum edit distance is 4, then we can discard this word
|
||||||
//without looking at it.
|
//without looking at it.
|
||||||
return Float.NEGATIVE_INFINITY;
|
return Integer.MIN_VALUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
// init matrix d
|
// init matrix d
|
||||||
|
@ -214,7 +224,7 @@ public final class SlowFuzzyTermsEnum extends FuzzyTermsEnum {
|
||||||
if (j > maxDistance && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater
|
if (j > maxDistance && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater
|
||||||
//the closest the target can be to the text is just too far away.
|
//the closest the target can be to the text is just too far away.
|
||||||
//this target is leaving the party early.
|
//this target is leaving the party early.
|
||||||
return Float.NEGATIVE_INFINITY;
|
return Integer.MIN_VALUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
// copy current distance counts to 'previous row' distance counts: swap p and d
|
// copy current distance counts to 'previous row' distance counts: swap p and d
|
||||||
|
@ -226,12 +236,17 @@ public final class SlowFuzzyTermsEnum extends FuzzyTermsEnum {
|
||||||
// our last action in the above loop was to switch d and p, so p now
|
// our last action in the above loop was to switch d and p, so p now
|
||||||
// actually has the most recent cost counts
|
// actually has the most recent cost counts
|
||||||
|
|
||||||
|
return p[n];
|
||||||
|
}
|
||||||
|
|
||||||
|
private float calcSimilarity(int edits, int m, int n){
|
||||||
// this will return less than 0.0 when the edit distance is
|
// this will return less than 0.0 when the edit distance is
|
||||||
// greater than the number of characters in the shorter word.
|
// greater than the number of characters in the shorter word.
|
||||||
// but this was the formula that was previously used in FuzzyTermEnum,
|
// but this was the formula that was previously used in FuzzyTermEnum,
|
||||||
// so it has not been changed (even though minimumSimilarity must be
|
// so it has not been changed (even though minimumSimilarity must be
|
||||||
// greater than 0.0)
|
// greater than 0.0)
|
||||||
return 1.0f - ((float)p[n] / (float) (realPrefixLength + Math.min(n, m)));
|
|
||||||
|
return 1.0f - ((float)edits / (float) (realPrefixLength + Math.min(n, m)));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -43,6 +43,9 @@ import org.apache.lucene.util.LuceneTestCase;
|
||||||
public class TestSlowFuzzyQuery extends LuceneTestCase {
|
public class TestSlowFuzzyQuery extends LuceneTestCase {
|
||||||
|
|
||||||
public void testFuzziness() throws Exception {
|
public void testFuzziness() throws Exception {
|
||||||
|
//every test with SlowFuzzyQuery.defaultMinSimilarity
|
||||||
|
//is exercising the Automaton, not the brute force linear method
|
||||||
|
|
||||||
Directory directory = newDirectory();
|
Directory directory = newDirectory();
|
||||||
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
|
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
|
||||||
addDoc("aaaaa", writer);
|
addDoc("aaaaa", writer);
|
||||||
|
@ -194,6 +197,30 @@ public class TestSlowFuzzyQuery extends LuceneTestCase {
|
||||||
directory.close();
|
directory.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testFuzzinessLong2() throws Exception {
|
||||||
|
//LUCENE-5033
|
||||||
|
Directory directory = newDirectory();
|
||||||
|
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
|
||||||
|
addDoc("abcdef", writer);
|
||||||
|
addDoc("segment", writer);
|
||||||
|
|
||||||
|
IndexReader reader = writer.getReader();
|
||||||
|
IndexSearcher searcher = newSearcher(reader);
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
SlowFuzzyQuery query;
|
||||||
|
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "abcxxxx"), 3f, 0);
|
||||||
|
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(0, hits.length);
|
||||||
|
|
||||||
|
query = new SlowFuzzyQuery(new Term("field", "abcxxxx"), 4f, 0);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals(1, hits.length);
|
||||||
|
reader.close();
|
||||||
|
directory.close();
|
||||||
|
}
|
||||||
|
|
||||||
public void testFuzzinessLong() throws Exception {
|
public void testFuzzinessLong() throws Exception {
|
||||||
Directory directory = newDirectory();
|
Directory directory = newDirectory();
|
||||||
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
|
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
|
||||||
|
@ -385,7 +412,6 @@ public class TestSlowFuzzyQuery extends LuceneTestCase {
|
||||||
|
|
||||||
public void testGiga() throws Exception {
|
public void testGiga() throws Exception {
|
||||||
|
|
||||||
MockAnalyzer analyzer = new MockAnalyzer(random());
|
|
||||||
Directory index = newDirectory();
|
Directory index = newDirectory();
|
||||||
RandomIndexWriter w = new RandomIndexWriter(random(), index);
|
RandomIndexWriter w = new RandomIndexWriter(random(), index);
|
||||||
|
|
||||||
|
@ -440,25 +466,21 @@ public class TestSlowFuzzyQuery extends LuceneTestCase {
|
||||||
assertEquals(1, hits.length);
|
assertEquals(1, hits.length);
|
||||||
assertEquals("foobar", searcher.doc(hits[0].doc).get("field"));
|
assertEquals("foobar", searcher.doc(hits[0].doc).get("field"));
|
||||||
|
|
||||||
// TODO: cannot really be supported given the legacy scoring
|
q = new SlowFuzzyQuery(new Term("field", "t"), 3);
|
||||||
// system which scores negative, if the distance > min term len,
|
hits = searcher.search(q, 10).scoreDocs;
|
||||||
// so such matches were always impossible with lucene 3.x, etc
|
assertEquals(1, hits.length);
|
||||||
//
|
assertEquals("test", searcher.doc(hits[0].doc).get("field"));
|
||||||
//q = new SlowFuzzyQuery(new Term("field", "t"), 3);
|
|
||||||
//hits = searcher.search(q, 10).scoreDocs;
|
|
||||||
//assertEquals(1, hits.length);
|
|
||||||
//assertEquals("test", searcher.doc(hits[0].doc).get("field"));
|
|
||||||
|
|
||||||
// q = new SlowFuzzyQuery(new Term("field", "a"), 4f, 0, 50);
|
q = new SlowFuzzyQuery(new Term("field", "a"), 4f, 0, 50);
|
||||||
// hits = searcher.search(q, 10).scoreDocs;
|
hits = searcher.search(q, 10).scoreDocs;
|
||||||
// assertEquals(1, hits.length);
|
assertEquals(1, hits.length);
|
||||||
// assertEquals("test", searcher.doc(hits[0].doc).get("field"));
|
assertEquals("test", searcher.doc(hits[0].doc).get("field"));
|
||||||
|
|
||||||
// q = new SlowFuzzyQuery(new Term("field", "a"), 6f, 0, 50);
|
q = new SlowFuzzyQuery(new Term("field", "a"), 6f, 0, 50);
|
||||||
// hits = searcher.search(q, 10).scoreDocs;
|
hits = searcher.search(q, 10).scoreDocs;
|
||||||
// assertEquals(2, hits.length);
|
assertEquals(2, hits.length);
|
||||||
// assertEquals("test", searcher.doc(hits[0].doc).get("field"));
|
assertEquals("test", searcher.doc(hits[0].doc).get("field"));
|
||||||
// assertEquals("foobar", searcher.doc(hits[1].doc).get("field"));
|
assertEquals("foobar", searcher.doc(hits[1].doc).get("field"));
|
||||||
|
|
||||||
reader.close();
|
reader.close();
|
||||||
index.close();
|
index.close();
|
||||||
|
|
Loading…
Reference in New Issue