New low-level API exposed (getBestTextFragments) to provide access to fragment scores.

ToString() implemented on TextFragment objects to return text


git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150997 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Harwood 2004-08-12 22:09:37 +00:00
parent f920cff71a
commit 726ddaeb5a
3 changed files with 70 additions and 22 deletions

View File

@ -96,11 +96,8 @@ public class Highlighter
throws IOException
{
maxNumFragments = Math.max(1, maxNumFragments); //sanity check
StringBuffer newText = new StringBuffer();
TextFragment[] frag =getBestDocFragments(tokenStream,text, newText, maxNumFragments);
mergeContiguousFragments(frag);
TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments);
//Get text
ArrayList fragTexts = new ArrayList();
@ -109,33 +106,35 @@ public class Highlighter
{
if ((frag[i] != null) && (frag[i].getScore() > 0))
{
fragTexts.add(
newText.substring(
frag[i].textStartPos,
frag[i].textEndPos));
fragTexts.add(frag[i].toString());
}
}
return (String[]) fragTexts.toArray(new String[0]);
}
/**
* Low level api to get the most relevant sections of the document
* Low level api to get the most relevant (formatted) sections of the document.
* This method has been made public to allow visibility of score information held in TextFragment objects.
* Thanks to Jason Calabrese for help in redefining the interface.
* @param tokenStream
* @param text
* @param maxNumFragments
* @param mergeContiguousFragments
* @return
* @throws IOException
*/
private final TextFragment[] getBestDocFragments(
public final TextFragment[] getBestTextFragments(
TokenStream tokenStream,
String text,
StringBuffer newText,
boolean mergeContiguousFragments,
int maxNumFragments)
throws IOException
{
ArrayList docFrags = new ArrayList();
StringBuffer newText=new StringBuffer();
TextFragment currentFrag = new TextFragment(newText.length(), docFrags.size());
TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
fragmentScorer.startFragment(currentFrag);
docFrags.add(currentFrag);
@ -175,7 +174,7 @@ public class Highlighter
currentFrag.setScore(fragmentScorer.getFragmentScore());
//record stats for a new fragment
currentFrag.textEndPos = newText.length();
currentFrag =new TextFragment(newText.length(), docFrags.size());
currentFrag =new TextFragment(newText, newText.length(), docFrags.size());
fragmentScorer.startFragment(currentFrag);
docFrags.add(currentFrag);
}
@ -243,6 +242,22 @@ public class Highlighter
{
frag[i] = (TextFragment) fragQueue.pop();
}
//merge any contiguous fragments to improve readability
if(mergeContiguousFragments)
{
mergeContiguousFragments(frag);
ArrayList fragTexts = new ArrayList();
for (int i = 0; i < frag.length; i++)
{
if ((frag[i] != null) && (frag[i].getScore() > 0))
{
fragTexts.add(frag[i]);
}
}
frag= (TextFragment[]) fragTexts.toArray(new TextFragment[0]);
}
return frag;
}

View File

@ -25,13 +25,15 @@ package org.apache.lucene.search.highlight;
*/
public class TextFragment
{
StringBuffer markedUpText;
int fragNum;
int textStartPos;
int textEndPos;
float score;
public TextFragment(int textStartPos, int fragNum)
public TextFragment(StringBuffer markedUpText,int textStartPos, int fragNum)
{
this.markedUpText=markedUpText;
this.textStartPos = textStartPos;
this.fragNum = fragNum;
}
@ -68,4 +70,10 @@ public class TextFragment
return fragNum;
}
/* Returns the marked-up text for this text fragment
*/
public String toString() {
return markedUpText.substring(textStartPos, textEndPos);
}
}

View File

@ -20,6 +20,7 @@ import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.StringTokenizer;
@ -41,12 +42,6 @@ import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.TokenGroup;
import org.apache.lucene.search.highlight.WeightedTerm;
import org.apache.lucene.store.RAMDirectory;
/**
@ -219,7 +214,6 @@ public class HighlighterTest extends TestCase implements Formatter
}
public void testGetSimpleHighlight() throws Exception
{
doSearching("Kennedy");
@ -237,6 +231,37 @@ public class HighlighterTest extends TestCase implements Formatter
assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);
}
public void testGetTextFragments() throws Exception
{
doSearching("Kennedy");
Highlighter highlighter =
new Highlighter(this,new QueryScorer(query));
highlighter.setTextFragmenter(new SimpleFragmenter(20));
for (int i = 0; i < hits.length(); i++)
{
String text = hits.doc(i).get(FIELD_NAME);
TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));
String stringResults[] = highlighter.getBestFragments(tokenStream,text,10);
tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));
TextFragment fragmentResults[] = highlighter.getBestTextFragments(tokenStream,text,true,10);
assertTrue("Failed to find correct number of text Fragments: " +
fragmentResults.length + " vs "+ stringResults.length, fragmentResults.length==stringResults.length);
for (int j = 0; j < stringResults.length; j++)
{
System.out.println(fragmentResults[j]);
assertTrue("Failed to find same text Fragments: " +
fragmentResults[j] + " found", fragmentResults[j].toString().equals(stringResults[j]));
}
}
}
public void testMaxSizeHighlight() throws Exception
{
doSearching("meat");