LUCENE-1993: add maxDocFreq to MoreLikeThis

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@827042 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2009-10-20 11:59:53 +00:00
parent faa1b9815e
commit 5ceb81834d
2 changed files with 60 additions and 0 deletions

View File

@ -53,6 +53,10 @@ New features
any number of output parts, at the cost of doing multiple passes over
the input index. (Andrzej Bialecki)
* LUCENE-1993: Add maxDocFreq setting to MoreLikeThis, to exclude
from consideration terms that match more than the specified number
of documents. (Christian Steinert via Mike McCandless)
Optimizations
* LUCENE-1965, LUCENE-1962: Arabic-, Persian- and SmartChineseAnalyzer

View File

@ -128,6 +128,8 @@ import org.apache.lucene.util.PriorityQueue;
* <ul> * <ul>
* <li> {@link #setMinTermFreq setMinTermFreq(...)} * <li> {@link #setMinTermFreq setMinTermFreq(...)}
* <li> {@link #setMinDocFreq setMinDocFreq(...)} * <li> {@link #setMinDocFreq setMinDocFreq(...)}
* <li> {@link #setMaxDocFreq setMaxDocFreq(...)}
* <li> {@link #setMaxDocFreqPct setMaxDocFreqPct(...)}
* <li> {@link #setMinWordLen setMinWordLen(...)} * <li> {@link #setMinWordLen setMinWordLen(...)}
* <li> {@link #setMaxWordLen setMaxWordLen(...)} * <li> {@link #setMaxWordLen setMaxWordLen(...)}
* <li> {@link #setMaxQueryTerms setMaxQueryTerms(...)} * <li> {@link #setMaxQueryTerms setMaxQueryTerms(...)}
@ -176,6 +178,14 @@ public final class MoreLikeThis {
*/ */
public static final int DEFAULT_MIN_DOC_FREQ = 5; public static final int DEFAULT_MIN_DOC_FREQ = 5;
/**
* Default upper bound on document frequency: ignore words which occur in
* more than this many docs. Because the default is the largest int value,
* the upper bound effectively excludes nothing until a lower limit is set.
* @see #getMaxDocFreq
* @see #setMaxDocFreq
* @see #setMaxDocFreqPct
*/
public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE;
/** /**
* Boost terms in query based on score. * Boost terms in query based on score.
* @see #isBoost * @see #isBoost
@ -241,6 +251,11 @@ public final class MoreLikeThis {
*/ */
private int minDocFreq = DEFAULT_MIN_DOC_FREQ; private int minDocFreq = DEFAULT_MIN_DOC_FREQ;
/**
* Ignore words which occur in more than this many docs.
* Starts out as {@link #DEFAULT_MAX_DOC_FREQ}; written by
* {@link #setMaxDocFreq} and {@link #setMaxDocFreqPct}.
*/
private int maxDocFreq = DEFAULT_MAX_DOC_FREQ;
/** /**
* Should we apply a boost to the Query based on the scores? * Should we apply a boost to the Query based on the scores?
*/ */
@ -387,6 +402,43 @@ public final class MoreLikeThis {
this.minDocFreq = minDocFreq; this.minDocFreq = minDocFreq;
} }
/**
* Returns the maximum document frequency a word may have and still be
* considered. Words that occur in more docs than this are ignored. Unless
* it has been changed, the value is {@link #DEFAULT_MAX_DOC_FREQ}.
*
* @return the largest number of docs a word may occur in before it is ignored
*/
public int getMaxDocFreq() {
  return this.maxDocFreq;
}
/**
* Sets the maximum document frequency a word may have and still be
* considered. Words that occur in more docs than this are ignored.
*
* @param maxFreq
*          the largest number of documents a term may appear in while
*          still being considered relevant
*/
public void setMaxDocFreq(int maxFreq) {
  maxDocFreq = maxFreq;
}
/**
* Set the maximum percentage in which words may still appear. Words that appear
* in more than this many percent of all docs will be ignored.
* <p>
* The percentage is converted to an absolute document count using the
* reader's document count at the time of this call; the stored limit is
* not recomputed afterwards.
*
* @param maxPercentage
*          the maximum percentage of documents (0-100) that a term may appear
*          in to be still considered relevant
*/
public void setMaxDocFreqPct(int maxPercentage) {
  // Multiply in long: an int product overflows once
  // maxPercentage * numDocs exceeds Integer.MAX_VALUE (e.g. 100% of an
  // index with more than ~21.4 million docs), which would produce a
  // bogus, possibly negative, limit.
  final long limit = (long) maxPercentage * ir.numDocs() / 100;
  // For percentages above 100 the quotient may still not fit in an int;
  // cap it at the largest representable limit instead of wrapping.
  this.maxDocFreq = (int) Math.min(limit, (long) Integer.MAX_VALUE);
}
/** /**
* Returns whether to boost terms in query based on "score" or not. The default is * Returns whether to boost terms in query based on "score" or not. The default is
* {@link #DEFAULT_BOOST}. * {@link #DEFAULT_BOOST}.
@ -660,6 +712,10 @@ public final class MoreLikeThis {
continue; // filter out words that don't occur in enough docs continue; // filter out words that don't occur in enough docs
} }
if (docFreq > maxDocFreq) {
continue; // filter out words that occur in too many docs
}
if (docFreq == 0) { if (docFreq == 0) {
continue; // index update problem? continue; // index update problem?
} }