mirror of https://github.com/apache/lucene.git
LUCENE-329: Fix FuzzyQuery defaults to rank exact matches highest
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1680522 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4fbbdf465d
commit
463d453abf
|
@ -135,6 +135,8 @@ Optimizations
|
|||
like a disjunction. (Adrien Grand)
|
||||
|
||||
Bug Fixes
|
||||
* LUCENE-329: Fix FuzzyQuery defaults to rank exact matches highest.
|
||||
(Mark Harwood, Adrien Grand)
|
||||
|
||||
* LUCENE-6378: Fix all RuntimeExceptions to throw the underlying root cause.
|
||||
(Varun Thacker, Adrien Grand, Mike McCandless)
|
||||
|
|
|
@ -117,16 +117,31 @@ public final class TermContext {
|
|||
* should be derived from a {@link IndexReaderContext}'s leaf ord.
|
||||
*/
|
||||
public void register(TermState state, final int ord, final int docFreq, final long totalTermFreq) {
|
||||
register(state, ord);
|
||||
accumulateStatistics(docFreq, totalTermFreq);
|
||||
}
|
||||
|
||||
/**
|
||||
* Expert: Registers and associates a {@link TermState} with an leaf ordinal. The
|
||||
* leaf ordinal should be derived from a {@link IndexReaderContext}'s leaf ord.
|
||||
* On the contrary to {@link #register(TermState, int, int, long)} this method
|
||||
* does NOT update term statistics.
|
||||
*/
|
||||
public void register(TermState state, final int ord) {
|
||||
assert state != null : "state must not be null";
|
||||
assert ord >= 0 && ord < states.length;
|
||||
assert states[ord] == null : "state for ord: " + ord
|
||||
+ " already registered";
|
||||
states[ord] = state;
|
||||
}
|
||||
|
||||
/** Expert: Accumulate term statistics. */
|
||||
public void accumulateStatistics(final int docFreq, final long totalTermFreq) {
|
||||
this.docFreq += docFreq;
|
||||
if (this.totalTermFreq >= 0 && totalTermFreq >= 0)
|
||||
this.totalTermFreq += totalTermFreq;
|
||||
else
|
||||
this.totalTermFreq = -1;
|
||||
states[ord] = state;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -98,7 +98,7 @@ public class FuzzyQuery extends MultiTermQuery {
|
|||
this.prefixLength = prefixLength;
|
||||
this.transpositions = transpositions;
|
||||
this.maxExpansions = maxExpansions;
|
||||
setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions));
|
||||
setRewriteMethod(new MultiTermQuery.TopTermsBlendedFreqScoringRewrite(maxExpansions));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -18,13 +18,16 @@ package org.apache.lucene.search;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
import org.apache.lucene.index.FilteredTermsEnum; // javadocs
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.SingleTermsEnum; // javadocs
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
@ -169,6 +172,106 @@ public abstract class MultiTermQuery extends Query {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A rewrite method that first translates each term into
|
||||
* {@link BooleanClause.Occur#SHOULD} clause in a BooleanQuery, but adjusts
|
||||
* the frequencies used for scoring to be blended across the terms, otherwise
|
||||
* the rarest term typically ranks highest (often not useful eg in the set of
|
||||
* expanded terms in a FuzzyQuery).
|
||||
*
|
||||
* <p>
|
||||
* This rewrite method only uses the top scoring terms so it will not overflow
|
||||
* the boolean max clause count.
|
||||
*
|
||||
* @see #setRewriteMethod
|
||||
*/
|
||||
public static final class TopTermsBlendedFreqScoringRewrite extends
|
||||
TopTermsRewrite<BooleanQuery> {
|
||||
|
||||
/**
|
||||
* Create a TopTermsBlendedScoringBooleanQueryRewrite for at most
|
||||
* <code>size</code> terms.
|
||||
* <p>
|
||||
* NOTE: if {@link BooleanQuery#getMaxClauseCount} is smaller than
|
||||
* <code>size</code>, then it will be used instead.
|
||||
*/
|
||||
public TopTermsBlendedFreqScoringRewrite(int size) {
|
||||
super(size);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int getMaxSize() {
|
||||
return BooleanQuery.getMaxClauseCount();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected BooleanQuery getTopLevelQuery() {
|
||||
return new BooleanQuery(true);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void addClause(BooleanQuery topLevel, Term term, int docCount,
|
||||
float boost, TermContext states) {
|
||||
final TermQuery tq = new TermQuery(term, states);
|
||||
tq.setBoost(boost);
|
||||
topLevel.add(tq, BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
|
||||
@Override
|
||||
void adjustScoreTerms(IndexReader reader, ScoreTerm[] scoreTerms) {
|
||||
if (scoreTerms.length <= 1) {
|
||||
return;
|
||||
}
|
||||
int maxDoc = reader.maxDoc();
|
||||
int maxDf = 0;
|
||||
long maxTtf = 0;
|
||||
for (ScoreTerm scoreTerm : scoreTerms) {
|
||||
TermContext ctx = scoreTerm.termState;
|
||||
int df = ctx.docFreq();
|
||||
maxDf = Math.max(df, maxDf);
|
||||
long ttf = ctx.totalTermFreq();
|
||||
maxTtf = ttf == -1 || maxTtf == -1 ? -1 : Math.max(ttf, maxTtf);
|
||||
}
|
||||
|
||||
assert maxDf >= 0 : "DF must be >= 0";
|
||||
if (maxDf == 0) {
|
||||
return; // we are done that term doesn't exist at all
|
||||
}
|
||||
assert (maxTtf == -1) || (maxTtf >= maxDf);
|
||||
|
||||
for (int i = 0; i < scoreTerms.length; i++) {
|
||||
TermContext ctx = scoreTerms[i].termState;
|
||||
ctx = adjustFrequencies(ctx, maxDf, maxTtf);
|
||||
|
||||
ScoreTerm adjustedScoreTerm = new ScoreTerm(ctx);
|
||||
adjustedScoreTerm.boost = scoreTerms[i].boost;
|
||||
adjustedScoreTerm.bytes.copyBytes(scoreTerms[i].bytes);
|
||||
scoreTerms[i] = adjustedScoreTerm;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static TermContext adjustFrequencies(TermContext ctx, int artificialDf,
|
||||
long artificialTtf) {
|
||||
List<LeafReaderContext> leaves = ctx.topReaderContext.leaves();
|
||||
final int len;
|
||||
if (leaves == null) {
|
||||
len = 1;
|
||||
} else {
|
||||
len = leaves.size();
|
||||
}
|
||||
TermContext newCtx = new TermContext(ctx.topReaderContext);
|
||||
for (int i = 0; i < len; ++i) {
|
||||
TermState termState = ctx.get(i);
|
||||
if (termState == null) {
|
||||
continue;
|
||||
}
|
||||
newCtx.register(termState, i);
|
||||
}
|
||||
newCtx.accumulateStatistics(artificialDf, artificialTtf);
|
||||
return newCtx;
|
||||
}
|
||||
|
||||
/**
|
||||
* A rewrite method that first translates each term into
|
||||
* {@link BooleanClause.Occur#SHOULD} clause in a BooleanQuery, but the scores
|
||||
|
|
|
@ -18,10 +18,10 @@ package org.apache.lucene.search;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.Comparator;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
|
@ -158,14 +158,19 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
|
|||
final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
|
||||
ArrayUtil.timSort(scoreTerms, scoreTermSortByTermComp);
|
||||
|
||||
adjustScoreTerms(reader, scoreTerms);
|
||||
|
||||
for (final ScoreTerm st : scoreTerms) {
|
||||
final Term term = new Term(query.field, st.bytes.toBytesRef());
|
||||
assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term) + " vs " + st.termState.docFreq() + " term=" + term;
|
||||
addClause(q, term, st.termState.docFreq(), query.getBoost() * st.boost, st.termState); // add to query
|
||||
}
|
||||
return q;
|
||||
}
|
||||
|
||||
void adjustScoreTerms(IndexReader reader, ScoreTerm[] scoreTerms) {
|
||||
//no-op but allows subclasses the ability to tweak the score terms used in ranking e.g. balancing IDF.
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return 31 * size;
|
||||
|
|
|
@ -17,9 +17,9 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
|
@ -28,7 +28,10 @@ import org.apache.lucene.document.Field;
|
|||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.MultiReader;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.StoredDocument;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.automaton.LevenshteinAutomata;
|
||||
|
@ -241,6 +244,90 @@ public class TestFuzzyQuery extends LuceneTestCase {
|
|||
directory.close();
|
||||
}
|
||||
|
||||
public void testSingleQueryExactMatchScoresHighest() throws Exception {
|
||||
//See issue LUCENE-329 - IDF shouldn't wreck similarity ranking
|
||||
Directory directory = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
|
||||
addDoc("smith", writer);
|
||||
addDoc("smith", writer);
|
||||
addDoc("smith", writer);
|
||||
addDoc("smith", writer);
|
||||
addDoc("smith", writer);
|
||||
addDoc("smith", writer);
|
||||
addDoc("smythe", writer);
|
||||
addDoc("smdssasd", writer);
|
||||
|
||||
IndexReader reader = writer.getReader();
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
searcher.setSimilarity(new DefaultSimilarity()); //avoid randomisation of similarity algo by test framework
|
||||
writer.close();
|
||||
String searchTerms[] = { "smith", "smythe", "smdssasd" };
|
||||
for (String searchTerm : searchTerms) {
|
||||
FuzzyQuery query = new FuzzyQuery(new Term("field", searchTerm), 2, 1);
|
||||
ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
|
||||
StoredDocument bestDoc = searcher.doc(hits[0].doc);
|
||||
assertTrue(hits.length > 0);
|
||||
String topMatch = bestDoc.get("field");
|
||||
assertEquals(searchTerm, topMatch);
|
||||
if (hits.length > 1) {
|
||||
StoredDocument worstDoc = searcher.doc(hits[hits.length - 1].doc);
|
||||
String worstMatch = worstDoc.get("field");
|
||||
assertNotSame(searchTerm, worstMatch);
|
||||
}
|
||||
}
|
||||
reader.close();
|
||||
directory.close();
|
||||
}
|
||||
|
||||
public void testMultipleQueriesIdfWorks() throws Exception {
|
||||
// With issue LUCENE-329 - it could be argued a MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite
|
||||
// is the solution as it disables IDF.
|
||||
// However - IDF is still useful as in this case where there are multiple FuzzyQueries.
|
||||
Directory directory = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
|
||||
|
||||
addDoc("michael smith", writer);
|
||||
addDoc("michael lucero", writer);
|
||||
addDoc("doug cutting", writer);
|
||||
addDoc("doug cuttin", writer);
|
||||
addDoc("michael wardle", writer);
|
||||
addDoc("micheal vegas", writer);
|
||||
addDoc("michael lydon", writer);
|
||||
|
||||
IndexReader reader = writer.getReader();
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
searcher.setSimilarity(new DefaultSimilarity()); //avoid randomisation of similarity algo by test framework
|
||||
|
||||
writer.close();
|
||||
|
||||
BooleanQuery query = new BooleanQuery();
|
||||
String commonSearchTerm = "michael";
|
||||
FuzzyQuery commonQuery = new FuzzyQuery(new Term("field", commonSearchTerm), 2, 1);
|
||||
query.add(commonQuery, Occur.SHOULD);
|
||||
|
||||
String rareSearchTerm = "cutting";
|
||||
FuzzyQuery rareQuery = new FuzzyQuery(new Term("field", rareSearchTerm), 2, 1);
|
||||
query.add(rareQuery, Occur.SHOULD);
|
||||
ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
|
||||
|
||||
// Matches on the rare surname should be worth more than matches on the common forename
|
||||
assertEquals(7, hits.length);
|
||||
StoredDocument bestDoc = searcher.doc(hits[0].doc);
|
||||
String topMatch = bestDoc.get("field");
|
||||
assertTrue(topMatch.contains(rareSearchTerm));
|
||||
|
||||
StoredDocument runnerUpDoc = searcher.doc(hits[1].doc);
|
||||
String runnerUpMatch = runnerUpDoc.get("field");
|
||||
assertTrue(runnerUpMatch.contains("cuttin"));
|
||||
|
||||
StoredDocument worstDoc = searcher.doc(hits[hits.length - 1].doc);
|
||||
String worstMatch = worstDoc.get("field");
|
||||
assertTrue(worstMatch.contains("micheal")); //misspelling of common name
|
||||
|
||||
reader.close();
|
||||
directory.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* MultiTermQuery provides (via attribute) information about which values
|
||||
* must be competitive to enter the priority queue.
|
||||
|
|
Loading…
Reference in New Issue