mirror of https://github.com/apache/lucene.git
LUCENE-1993: add maxDocFreq to MoreLikeThis
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@827042 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
faa1b9815e
commit
5ceb81834d
|
@ -53,6 +53,10 @@ New features
|
||||||
any number of output parts, at the cost of doing multiple passes over
|
any number of output parts, at the cost of doing multiple passes over
|
||||||
the input index. (Andrzej Bialecki)
|
the input index. (Andrzej Bialecki)
|
||||||
|
|
||||||
|
* LUCENE-1993: Add maxDocFreq setting to MoreLikeThis, to exclude
|
||||||
|
from consideration terms that match more than the specified number
|
||||||
|
of documents. (Christian Steinert via Mike McCandless)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
* LUCENE-1965, LUCENE-1962: Arabic-, Persian- and SmartChineseAnalyzer
|
* LUCENE-1965, LUCENE-1962: Arabic-, Persian- and SmartChineseAnalyzer
|
||||||
|
|
|
@ -128,6 +128,8 @@ import org.apache.lucene.util.PriorityQueue;
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li> {@link #setMinTermFreq setMinTermFreq(...)}
|
* <li> {@link #setMinTermFreq setMinTermFreq(...)}
|
||||||
* <li> {@link #setMinDocFreq setMinDocFreq(...)}
|
* <li> {@link #setMinDocFreq setMinDocFreq(...)}
|
||||||
|
* <li> {@link #setMaxDocFreq setMaxDocFreq(...)}
|
||||||
|
* <li> {@link #setMaxDocFreqPct setMaxDocFreqPct(...)}
|
||||||
* <li> {@link #setMinWordLen setMinWordLen(...)}
|
* <li> {@link #setMinWordLen setMinWordLen(...)}
|
||||||
* <li> {@link #setMaxWordLen setMaxWordLen(...)}
|
* <li> {@link #setMaxWordLen setMaxWordLen(...)}
|
||||||
* <li> {@link #setMaxQueryTerms setMaxQueryTerms(...)}
|
* <li> {@link #setMaxQueryTerms setMaxQueryTerms(...)}
|
||||||
|
@ -176,6 +178,14 @@ public final class MoreLikeThis {
|
||||||
*/
|
*/
|
||||||
public static final int DEFAULT_MIN_DOC_FREQ = 5;
|
public static final int DEFAULT_MIN_DOC_FREQ = 5;
|
||||||
|
|
||||||
|
/**
 * Ignore words which occur in more than this many docs.
 * The default is {@link Integer#MAX_VALUE}, which effectively leaves the
 * upper limit switched off until a lower value is configured.
 * @see #getMaxDocFreq
 * @see #setMaxDocFreq
 * @see #setMaxDocFreqPct
 */
|
||||||
|
public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Boost terms in query based on score.
|
* Boost terms in query based on score.
|
||||||
* @see #isBoost
|
* @see #isBoost
|
||||||
|
@ -241,6 +251,11 @@ public final class MoreLikeThis {
|
||||||
*/
|
*/
|
||||||
private int minDocFreq = DEFAULT_MIN_DOC_FREQ;
|
private int minDocFreq = DEFAULT_MIN_DOC_FREQ;
|
||||||
|
|
||||||
|
/**
 * Ignore words which occur in more than this many docs.
 * Starts out as {@link #DEFAULT_MAX_DOC_FREQ}; overwritten by
 * {@link #setMaxDocFreq} and {@link #setMaxDocFreqPct}.
 */
|
||||||
|
private int maxDocFreq = DEFAULT_MAX_DOC_FREQ;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Should we apply a boost to the Query based on the scores?
|
* Should we apply a boost to the Query based on the scores?
|
||||||
*/
|
*/
|
||||||
|
@ -387,6 +402,43 @@ public final class MoreLikeThis {
|
||||||
this.minDocFreq = minDocFreq;
|
this.minDocFreq = minDocFreq;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the maximum frequency in which words may still appear.
|
||||||
|
* Words that appear in more than this many docs will be ignored. The default frequency is
|
||||||
|
* {@link #DEFAULT_MAX_DOC_FREQ}.
|
||||||
|
*
|
||||||
|
* @return get the maximum frequency at which words are still allowed,
|
||||||
|
* words which occur in more docs than this are ignored.
|
||||||
|
*/
|
||||||
|
public int getMaxDocFreq() {
|
||||||
|
return maxDocFreq;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the maximum frequency in which words may still appear. Words that appear
|
||||||
|
* in more than this many docs will be ignored.
|
||||||
|
*
|
||||||
|
* @param maxFreq
|
||||||
|
* the maximum count of documents that a term may appear
|
||||||
|
* in to be still considered relevant
|
||||||
|
*/
|
||||||
|
public void setMaxDocFreq(int maxFreq) {
|
||||||
|
this.maxDocFreq = maxFreq;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the maximum percentage in which words may still appear. Words that appear
|
||||||
|
* in more than this many percent of all docs will be ignored.
|
||||||
|
*
|
||||||
|
* @param maxPercentage
|
||||||
|
* the maximum percentage of documents (0-100) that a term may appear
|
||||||
|
* in to be still considered relevant
|
||||||
|
*/
|
||||||
|
public void setMaxDocFreqPct(int maxPercentage) {
|
||||||
|
this.maxDocFreq = maxPercentage * ir.numDocs() / 100;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns whether to boost terms in query based on "score" or not. The default is
|
* Returns whether to boost terms in query based on "score" or not. The default is
|
||||||
* {@link #DEFAULT_BOOST}.
|
* {@link #DEFAULT_BOOST}.
|
||||||
|
@ -660,6 +712,10 @@ public final class MoreLikeThis {
|
||||||
continue; // filter out words that don't occur in enough docs
|
continue; // filter out words that don't occur in enough docs
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (docFreq > maxDocFreq) {
|
||||||
|
continue; // filter out words that occur in too many docs
|
||||||
|
}
|
||||||
|
|
||||||
if (docFreq == 0) {
|
if (docFreq == 0) {
|
||||||
continue; // index update problem?
|
continue; // index update problem?
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue