LUCENE-329: Fix FuzzyQuery defaults to rank exact matches highest

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1680522 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Harwood 2015-05-20 11:08:32 +00:00
parent 4fbbdf465d
commit 463d453abf
6 changed files with 219 additions and 7 deletions

View File

@ -135,6 +135,8 @@ Optimizations
like a disjunction. (Adrien Grand)
Bug Fixes
* LUCENE-329: Fix FuzzyQuery defaults to rank exact matches highest.
(Mark Harwood, Adrien Grand)
* LUCENE-6378: Fix all RuntimeExceptions to throw the underlying root cause.
(Varun Thacker, Adrien Grand, Mike McCandless)

View File

@ -117,16 +117,31 @@ public final class TermContext {
* should be derived from a {@link IndexReaderContext}'s leaf ord.
*/
public void register(TermState state, final int ord, final int docFreq, final long totalTermFreq) {
register(state, ord);
accumulateStatistics(docFreq, totalTermFreq);
}
/**
* Expert: Registers and associates a {@link TermState} with an leaf ordinal. The
* leaf ordinal should be derived from a {@link IndexReaderContext}'s leaf ord.
* On the contrary to {@link #register(TermState, int, int, long)} this method
* does NOT update term statistics.
*/
public void register(TermState state, final int ord) {
assert state != null : "state must not be null";
assert ord >= 0 && ord < states.length;
assert states[ord] == null : "state for ord: " + ord
+ " already registered";
states[ord] = state;
}
/** Expert: Accumulate term statistics. */
public void accumulateStatistics(final int docFreq, final long totalTermFreq) {
this.docFreq += docFreq;
if (this.totalTermFreq >= 0 && totalTermFreq >= 0)
this.totalTermFreq += totalTermFreq;
else
this.totalTermFreq = -1;
states[ord] = state;
}
/**

View File

@ -98,7 +98,7 @@ public class FuzzyQuery extends MultiTermQuery {
this.prefixLength = prefixLength;
this.transpositions = transpositions;
this.maxExpansions = maxExpansions;
setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions));
setRewriteMethod(new MultiTermQuery.TopTermsBlendedFreqScoringRewrite(maxExpansions));
}
/**

View File

@ -18,13 +18,16 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
import java.util.List;
import java.util.Objects;
import org.apache.lucene.index.FilteredTermsEnum; // javadocs
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SingleTermsEnum; // javadocs
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.AttributeSource;
@ -113,7 +116,7 @@ public abstract class MultiTermQuery extends Query {
*
* @see #setRewriteMethod */
public final static RewriteMethod SCORING_BOOLEAN_REWRITE = ScoringRewrite.SCORING_BOOLEAN_REWRITE;
/** Like {@link #SCORING_BOOLEAN_REWRITE} except
* scores are not computed. Instead, each matching
* document receives a constant score equal to the
@ -169,6 +172,106 @@ public abstract class MultiTermQuery extends Query {
}
}
/**
* A rewrite method that first translates each term into
* {@link BooleanClause.Occur#SHOULD} clause in a BooleanQuery, but adjusts
* the frequencies used for scoring to be blended across the terms, otherwise
* the rarest term typically ranks highest (often not useful eg in the set of
* expanded terms in a FuzzyQuery).
*
* <p>
* This rewrite method only uses the top scoring terms so it will not overflow
* the boolean max clause count.
*
* @see #setRewriteMethod
*/
public static final class TopTermsBlendedFreqScoringRewrite extends
TopTermsRewrite<BooleanQuery> {
/**
* Create a TopTermsBlendedScoringBooleanQueryRewrite for at most
* <code>size</code> terms.
* <p>
* NOTE: if {@link BooleanQuery#getMaxClauseCount} is smaller than
* <code>size</code>, then it will be used instead.
*/
public TopTermsBlendedFreqScoringRewrite(int size) {
super(size);
}
@Override
protected int getMaxSize() {
return BooleanQuery.getMaxClauseCount();
}
@Override
protected BooleanQuery getTopLevelQuery() {
return new BooleanQuery(true);
}
@Override
protected void addClause(BooleanQuery topLevel, Term term, int docCount,
float boost, TermContext states) {
final TermQuery tq = new TermQuery(term, states);
tq.setBoost(boost);
topLevel.add(tq, BooleanClause.Occur.SHOULD);
}
@Override
void adjustScoreTerms(IndexReader reader, ScoreTerm[] scoreTerms) {
if (scoreTerms.length <= 1) {
return;
}
int maxDoc = reader.maxDoc();
int maxDf = 0;
long maxTtf = 0;
for (ScoreTerm scoreTerm : scoreTerms) {
TermContext ctx = scoreTerm.termState;
int df = ctx.docFreq();
maxDf = Math.max(df, maxDf);
long ttf = ctx.totalTermFreq();
maxTtf = ttf == -1 || maxTtf == -1 ? -1 : Math.max(ttf, maxTtf);
}
assert maxDf >= 0 : "DF must be >= 0";
if (maxDf == 0) {
return; // we are done that term doesn't exist at all
}
assert (maxTtf == -1) || (maxTtf >= maxDf);
for (int i = 0; i < scoreTerms.length; i++) {
TermContext ctx = scoreTerms[i].termState;
ctx = adjustFrequencies(ctx, maxDf, maxTtf);
ScoreTerm adjustedScoreTerm = new ScoreTerm(ctx);
adjustedScoreTerm.boost = scoreTerms[i].boost;
adjustedScoreTerm.bytes.copyBytes(scoreTerms[i].bytes);
scoreTerms[i] = adjustedScoreTerm;
}
}
}
private static TermContext adjustFrequencies(TermContext ctx, int artificialDf,
long artificialTtf) {
List<LeafReaderContext> leaves = ctx.topReaderContext.leaves();
final int len;
if (leaves == null) {
len = 1;
} else {
len = leaves.size();
}
TermContext newCtx = new TermContext(ctx.topReaderContext);
for (int i = 0; i < len; ++i) {
TermState termState = ctx.get(i);
if (termState == null) {
continue;
}
newCtx.register(termState, i);
}
newCtx.accumulateStatistics(artificialDf, artificialTtf);
return newCtx;
}
/**
* A rewrite method that first translates each term into
* {@link BooleanClause.Occur#SHOULD} clause in a BooleanQuery, but the scores

View File

@ -18,10 +18,10 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Comparator;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
@ -158,14 +158,19 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
ArrayUtil.timSort(scoreTerms, scoreTermSortByTermComp);
adjustScoreTerms(reader, scoreTerms);
for (final ScoreTerm st : scoreTerms) {
final Term term = new Term(query.field, st.bytes.toBytesRef());
assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term) + " vs " + st.termState.docFreq() + " term=" + term;
addClause(q, term, st.termState.docFreq(), query.getBoost() * st.boost, st.termState); // add to query
}
return q;
}
void adjustScoreTerms(IndexReader reader, ScoreTerm[] scoreTerms) {
//no-op but allows subclasses the ability to tweak the score terms used in ranking e.g. balancing IDF.
}
@Override
public int hashCode() {
return 31 * size;

View File

@ -17,9 +17,9 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.util.List;
import java.util.Arrays;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
@ -28,7 +28,10 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
@ -241,6 +244,90 @@ public class TestFuzzyQuery extends LuceneTestCase {
directory.close();
}
public void testSingleQueryExactMatchScoresHighest() throws Exception {
//See issue LUCENE-329 - IDF shouldn't wreck similarity ranking
Directory directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
addDoc("smith", writer);
addDoc("smith", writer);
addDoc("smith", writer);
addDoc("smith", writer);
addDoc("smith", writer);
addDoc("smith", writer);
addDoc("smythe", writer);
addDoc("smdssasd", writer);
IndexReader reader = writer.getReader();
IndexSearcher searcher = newSearcher(reader);
searcher.setSimilarity(new DefaultSimilarity()); //avoid randomisation of similarity algo by test framework
writer.close();
String searchTerms[] = { "smith", "smythe", "smdssasd" };
for (String searchTerm : searchTerms) {
FuzzyQuery query = new FuzzyQuery(new Term("field", searchTerm), 2, 1);
ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
StoredDocument bestDoc = searcher.doc(hits[0].doc);
assertTrue(hits.length > 0);
String topMatch = bestDoc.get("field");
assertEquals(searchTerm, topMatch);
if (hits.length > 1) {
StoredDocument worstDoc = searcher.doc(hits[hits.length - 1].doc);
String worstMatch = worstDoc.get("field");
assertNotSame(searchTerm, worstMatch);
}
}
reader.close();
directory.close();
}
public void testMultipleQueriesIdfWorks() throws Exception {
// With issue LUCENE-329 - it could be argued a MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite
// is the solution as it disables IDF.
// However - IDF is still useful as in this case where there are multiple FuzzyQueries.
Directory directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
addDoc("michael smith", writer);
addDoc("michael lucero", writer);
addDoc("doug cutting", writer);
addDoc("doug cuttin", writer);
addDoc("michael wardle", writer);
addDoc("micheal vegas", writer);
addDoc("michael lydon", writer);
IndexReader reader = writer.getReader();
IndexSearcher searcher = newSearcher(reader);
searcher.setSimilarity(new DefaultSimilarity()); //avoid randomisation of similarity algo by test framework
writer.close();
BooleanQuery query = new BooleanQuery();
String commonSearchTerm = "michael";
FuzzyQuery commonQuery = new FuzzyQuery(new Term("field", commonSearchTerm), 2, 1);
query.add(commonQuery, Occur.SHOULD);
String rareSearchTerm = "cutting";
FuzzyQuery rareQuery = new FuzzyQuery(new Term("field", rareSearchTerm), 2, 1);
query.add(rareQuery, Occur.SHOULD);
ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
// Matches on the rare surname should be worth more than matches on the common forename
assertEquals(7, hits.length);
StoredDocument bestDoc = searcher.doc(hits[0].doc);
String topMatch = bestDoc.get("field");
assertTrue(topMatch.contains(rareSearchTerm));
StoredDocument runnerUpDoc = searcher.doc(hits[1].doc);
String runnerUpMatch = runnerUpDoc.get("field");
assertTrue(runnerUpMatch.contains("cuttin"));
StoredDocument worstDoc = searcher.doc(hits[hits.length - 1].doc);
String worstMatch = worstDoc.get("field");
assertTrue(worstMatch.contains("micheal")); //misspelling of common name
reader.close();
directory.close();
}
/**
* MultiTermQuery provides (via attribute) information about which values
* must be competitive to enter the priority queue.