Logic ignored stop words were in a early version of this code but it was taken out in the belief that there

was no point in explicitly looking for them as the scoring algorithm would effictively ignore them. I did a test and indexed 700 pages on a corporate web site and then ran the MoreLikeThis code on them and 1/2 of the docs had stop words identified as interesting. So - I added code in to ignore stop words, but make it backward compatible so that by default this code is not used. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@169512 13f79535-47bb-0310-9956-ffa450edef68
2005-05-10 19:29:56 +00:00 · 2005-05-10 19:29:56 +00:00 · 1d68f8c88d
parent 81087e8bb6
commit 1d68f8c88d
1 changed files with 43 additions and 0 deletions
--- a/contrib/similarity/src/java/org/apache/lucene/search/similar/MoreLikeThis.java
+++ b/contrib/similarity/src/java/org/apache/lucene/search/similar/MoreLikeThis.java
@ -32,6 +32,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;

+import java.util.Set;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Collection;
@ -128,6 +129,7 @@ import java.util.ArrayList;
 * <li> {@link #setMaxWordLen setMaxWordLen(...)}
 * <li> {@link #setMaxQueryTerms setMaxQueryTerms(...)}
 * <li> {@link #setMaxNumTokensParsed setMaxNumTokensParsed(...)}
+ * <li> {@link #setStopWords setStopWord(...)} 
 * </ul> 
 *
 * <hr>
@ -201,6 +203,20 @@ public final class MoreLikeThis {
     */
    public static final int DEFAULT_MAX_WORD_LENGTH = 0;

+	/**
+	 * Default set of stopwords.
+	 * If null means to allow stop words.
+	 *
+	 * @see #setStopWords
+	 * @see #getStopWords
+	 */
+	public static final Set DEFAULT_STOP_WORDS = null;
+
+	/**
+	 * Current set of stop words.
+	 */
+	private Set stopWords = DEFAULT_STOP_WORDS;
+
    /**
     * Return a Query with no more than this many terms.
     *
@ -417,6 +433,30 @@ public final class MoreLikeThis {
        this.maxWordLen = maxWordLen;
    }

+	/**
+	 * Set the set of stopwords.
+	 * Any word in this set is considered "uninteresting" and ignored.
+	 * Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as
+	 * for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
+	 * 
+	 * @param stopWords set of stopwords, if null it means to allow stop words
+	 *
+	 * @see org.apache.lucene.analysis.StopFilter#makeStopSet StopFilter.makeStopSet()
+	 * @see #getStopWords	 
+	 */
+	public void setStopWords(Set stopWords) {
+		this.stopWords = stopWords;
+	}
+
+	/**
+	 * Get the current stop words being used.
+	 * @see #setStopWords
+	 */
+	public Set getStopWords() {
+		return stopWords;
+	}
+		
+
    /**
     * Returns the maximum number of query terms that will be included in any generated query.
     * The default is {@link #DEFAULT_MAX_QUERY_TERMS}.
@ -793,6 +833,9 @@ public final class MoreLikeThis {
 		if (maxWordLen > 0 && len > maxWordLen) {
 			return true;
 		}
+		if (stopWords != null && stopWords.contains( term)) {
+			return true;
+		}
 		return false;
 	}