New low-level API exposed (getBestTextFragments) to provide access to fragment scores.

ToString() implemented on TextFragment objects to return text git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150997 13f79535-47bb-0310-9956-ffa450edef68
2004-08-12 22:09:37 +00:00 · 2004-08-12 22:09:37 +00:00 · 726ddaeb5a
parent f920cff71a
commit 726ddaeb5a
3 changed files with 70 additions and 22 deletions
--- a/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
+++ b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
@ -96,11 +96,8 @@ public class Highlighter
 		throws IOException
 	{
 		maxNumFragments = Math.max(1, maxNumFragments); //sanity check
-		StringBuffer newText = new StringBuffer();
-		
-		TextFragment[] frag =getBestDocFragments(tokenStream,text, newText, maxNumFragments);
-
-		mergeContiguousFragments(frag);
+	
+		TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments);

 		//Get text
 		ArrayList fragTexts = new ArrayList();
@ -109,33 +106,35 @@ public class Highlighter
 		{
 			if ((frag[i] != null) && (frag[i].getScore() > 0))
 			{
-				fragTexts.add(
-					newText.substring(
-						frag[i].textStartPos,
-						frag[i].textEndPos));
+				fragTexts.add(frag[i].toString());
 			}
 		}
 		return (String[]) fragTexts.toArray(new String[0]);
 	}
+	

 	/**
-	 * Low level api to get the most relevant sections of the document
+	 * Low level api to get the most relevant (formatted) sections of the document.
+	 * This method has been made public to allow visibility of score information held in TextFragment objects.
+	 * Thanks to Jason Calabrese for help in redefining the interface.  
 	 * @param tokenStream
 	 * @param text
 	 * @param maxNumFragments
+	 * @param mergeContiguousFragments
 	 * @return 
 	 * @throws IOException
 	 */
-	private final TextFragment[] getBestDocFragments(
+	public final TextFragment[] getBestTextFragments(
 		TokenStream tokenStream,	
 		String text,
-		StringBuffer newText,
+		boolean mergeContiguousFragments,
 		int maxNumFragments)
 		throws IOException
 	{
 		ArrayList docFrags = new ArrayList();
+		StringBuffer newText=new StringBuffer();

-		TextFragment currentFrag =	new TextFragment(newText.length(), docFrags.size());
+		TextFragment currentFrag =	new TextFragment(newText,newText.length(), docFrags.size());
 		fragmentScorer.startFragment(currentFrag);
 		docFrags.add(currentFrag);
 	
@ -175,7 +174,7 @@ public class Highlighter
 						currentFrag.setScore(fragmentScorer.getFragmentScore());
 						//record stats for a new fragment
 						currentFrag.textEndPos = newText.length();
-						currentFrag =new TextFragment(newText.length(), docFrags.size());
+						currentFrag =new TextFragment(newText, newText.length(), docFrags.size());
 						fragmentScorer.startFragment(currentFrag);
 						docFrags.add(currentFrag);
 					}
@ -243,6 +242,22 @@ public class Highlighter
 			{
 				frag[i] = (TextFragment) fragQueue.pop();
 			}
+			
+			//merge any contiguous fragments to improve readability
+			if(mergeContiguousFragments)
+			{
+				mergeContiguousFragments(frag);
+				ArrayList fragTexts = new ArrayList();
+				for (int i = 0; i < frag.length; i++)
+				{
+					if ((frag[i] != null) && (frag[i].getScore() > 0))
+					{
+						fragTexts.add(frag[i]);
+					}
+				}
+				frag= (TextFragment[]) fragTexts.toArray(new TextFragment[0]);				
+			}
+			
 			return frag;

 		}
--- a/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/TextFragment.java
+++ b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/TextFragment.java
@ -25,13 +25,15 @@ package org.apache.lucene.search.highlight;
 */
 public class TextFragment
 {
+	StringBuffer markedUpText;
 	int fragNum;
 	int textStartPos;
 	int textEndPos;
 	float score;

-	public TextFragment(int textStartPos, int fragNum)
+	public TextFragment(StringBuffer markedUpText,int textStartPos, int fragNum)
 	{
+		this.markedUpText=markedUpText;
 		this.textStartPos = textStartPos;
 		this.fragNum = fragNum;
 	}
@ -68,4 +70,10 @@ public class TextFragment
 		return fragNum;
 	}

+	/* Returns the marked-up text for this text fragment 
+	 */
+	public String toString() {
+		return markedUpText.substring(textStartPos, textEndPos);
+	}
+
 }
--- a/sandbox/contributions/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
+++ b/sandbox/contributions/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
@ -20,6 +20,7 @@ import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.Map;
 import java.util.StringTokenizer;

@ -41,12 +42,6 @@ import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MultiSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.Searcher;
-import org.apache.lucene.search.highlight.Formatter;
-import org.apache.lucene.search.highlight.Highlighter;
-import org.apache.lucene.search.highlight.QueryScorer;
-import org.apache.lucene.search.highlight.SimpleFragmenter;
-import org.apache.lucene.search.highlight.TokenGroup;
-import org.apache.lucene.search.highlight.WeightedTerm;
 import org.apache.lucene.store.RAMDirectory;

 /**
@ -219,7 +214,6 @@ public class HighlighterTest extends TestCase implements Formatter
 	}


-
 	public void testGetSimpleHighlight() throws Exception
 	{
 		doSearching("Kennedy");
@ -237,6 +231,37 @@ public class HighlighterTest extends TestCase implements Formatter
 		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);
 	}

+
+	public void testGetTextFragments() throws Exception
+	{
+		doSearching("Kennedy");
+		Highlighter highlighter =
+			new Highlighter(this,new QueryScorer(query));
+		highlighter.setTextFragmenter(new SimpleFragmenter(20));
+
+		for (int i = 0; i < hits.length(); i++)
+		{
+			String text = hits.doc(i).get(FIELD_NAME);
+			TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));
+
+			String stringResults[] = highlighter.getBestFragments(tokenStream,text,10);
+
+			tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));
+			TextFragment fragmentResults[] = highlighter.getBestTextFragments(tokenStream,text,true,10);
+
+			assertTrue("Failed to find correct number of text Fragments: " + 
+				fragmentResults.length + " vs "+ stringResults.length, fragmentResults.length==stringResults.length);
+			for (int j = 0; j < stringResults.length; j++) 
+			{
+				System.out.println(fragmentResults[j]);
+				assertTrue("Failed to find same text Fragments: " + 
+					fragmentResults[j] + " found", fragmentResults[j].toString().equals(stringResults[j]));
+				
+			}
+			
+		}
+	}
+
 	public void testMaxSizeHighlight() throws Exception
 	{
 		doSearching("meat");