[1] Added comments to retrieveTerms() to document the return value.

[2] Added convenience routine retrieveInterestingTerms() which makes it easier to get at the "interesting words" in a document. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@169508 13f79535-47bb-0310-9956-ffa450edef68
2005-05-10 18:49:43 +00:00 · 2005-05-10 18:49:43 +00:00 · 175cf8a9fd
parent a79c508580
commit 175cf8a9fd
1 changed files with 38 additions and 0 deletions
--- a/contrib/similarity/src/java/org/apache/lucene/search/similar/MoreLikeThis.java
+++ b/contrib/similarity/src/java/org/apache/lucene/search/similar/MoreLikeThis.java
@ -44,6 +44,7 @@ import java.io.StringReader;
 import java.io.FileReader;
 import java.io.InputStreamReader;
 import java.net.URL;
+import java.util.ArrayList;


 /**
@ -798,8 +799,24 @@ public final class MoreLikeThis {

    /**
     * Find words for a more-like-this query former.
+	 * The result is a priority queue of arrays.
+	 * Each array has 6 elements.
+	 * The elements are:
+	 * <ol>
+	 * <li> The word (String)
+	 * <li> The top field that this word comes from (String)
+	 * <li> The score for this word (Float)
+	 * <li> The IDF value (Float)
+	 * <li> The frequency of this word in the index (Integer)
+	 * <li> The frequency of this word in the source document (Integer)	 	 
+	 * </ol>
+	 * This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest.
+	 * This method is exposed so that you can identify the "interesting words" in a document.
+	 * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
     *
     * @param r the reader that has the content of the document
+	 * @return the most interesting words in the document
+	 * @see #retrieveInterestingTerms
     */
    public PriorityQueue retrieveTerms(Reader r) throws IOException {
        Map words = new HashMap();
@ -810,6 +827,27 @@ public final class MoreLikeThis {
        return createQueue(words);
    }

+	/**
+	 * Convenience routine to make it easy to return the most interesting words in a document.
+	 * More advanced users will call {@link #retrieveTerms(java.io.Reader) retrieveTerms()} directly.
+	 * @param r the source document
+	 * @return the most interesting words in the document
+	 *
+	 * @see #retrieveTerms(java.io.Reader)
+	 * @see #setMaxQueryTerms
+	 */
+	public String[] retrieveInterestingTerms( Reader r) throws IOException {
+		ArrayList al = new ArrayList( maxQueryTerms);
+		PriorityQueue pq = retrieveTerms( r);
+		Object cur;
+		while (((cur = pq.pop()) != null)) {
+            Object[] ar = (Object[]) cur;
+			al.add( ar[ 0]); // the 1st entry is the interesting word
+		}
+		String[] res = new String[ al.size()];
+		return (String[]) al.toArray( res);
+	}
+
    /**
     * PriorityQueue that orders words by score.
     */