Fix for http://issues.apache.org/jira/browse/LUCENE-645, with added JUnit tests for this bug and for a related problem where the last fragment can be huge when highlighting huge documents.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@432042 13f79535-47bb-0310-9956-ffa450edef68
Mark Harwood 2006-08-16 21:42:18 +00:00
parent d3629f25eb
commit d516bf50d8
2 changed files with 67 additions and 10 deletions
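
For context, here is a minimal standalone sketch (not part of this commit) of the behaviour the fix enforces, using only classes and methods exercised by the new tests below; the class name, field name, and filler text are made up for illustration.

import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.NullFragmenter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class TruncationSketch
{
    public static void main(String[] args) throws IOException
    {
        // A huge field value: one matching token followed by a long run of filler words.
        StringBuffer huge = new StringBuffer("goodtoken");
        for (int i = 0; i < 10000; i++)
        {
            huge.append(" filler");
        }

        TermQuery query = new TermQuery(new Term("data", "goodtoken"));
        Highlighter hg = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query));
        hg.setTextFragmenter(new NullFragmenter());
        // Analyze (and therefore highlight) no more than the first 100 bytes of the text.
        hg.setMaxDocBytesToAnalyze(100);

        String fragment = hg.getBestFragment(new StandardAnalyzer(), "data", huge.toString());
        // With this fix the returned fragment stays within the limit instead of spanning the whole document.
        System.out.println(fragment.length() + " chars: " + fragment);
    }
}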

Highlighter.java

@@ -21,6 +21,7 @@ import java.util.ArrayList;
import java.util.Iterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.PriorityQueue;
@@ -221,8 +222,8 @@ public class Highlighter
textFragmenter.start(text);
TokenGroup tokenGroup=new TokenGroup();
while ((token = tokenStream.next()) != null)
token = tokenStream.next();
while ((token!= null)&&(token.startOffset()<maxDocBytesToAnalyze))
{
if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token)))
{
@@ -251,12 +252,13 @@ public class Highlighter
}
}
tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));
tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));
if(lastEndOffset>maxDocBytesToAnalyze)
{
break;
}
// if(lastEndOffset>maxDocBytesToAnalyze)
// {
// break;
// }
token = tokenStream.next();
}
currentFrag.setScore(fragmentScorer.getFragmentScore());
@@ -274,9 +276,18 @@ public class Highlighter
lastEndOffset=Math.max(lastEndOffset,endOffset);
}
// append text after end of last token
// if (lastEndOffset < text.length())
// newText.append(encoder.encodeText(text.substring(lastEndOffset)));
//Test what remains of the original text beyond the point where we stopped analyzing
if (
// if there is text beyond the last token considered..
(lastEndOffset < text.length())
&&
// and that text is not too large...
(text.length()<maxDocBytesToAnalyze)
)
{
//append it to the last fragment
newText.append(encoder.encodeText(text.substring(lastEndOffset)));
}
currentFrag.textEndPos = newText.length();
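
Condensed as a hedged sketch (not the actual Highlighter code), the restructured loop above fetches a token first and then keeps consuming only while the token starts inside the analyzed window, so a token that begins beyond maxDocBytesToAnalyze is never added to any fragment. The counting below is a hypothetical stand-in for the real tokenGroup/fragment bookkeeping.

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public class TokenWindowSketch
{
    // Stand-in for the fragment logic: count tokens that begin before the byte limit.
    public static int countTokensInWindow(TokenStream tokenStream, int maxDocBytesToAnalyze) throws IOException
    {
        int count = 0;
        Token token = tokenStream.next();
        while ((token != null) && (token.startOffset() < maxDocBytesToAnalyze))
        {
            count++;
            token = tokenStream.next();
        }
        return count;
    }
}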

HighlighterTest.java

@@ -44,6 +44,7 @@ import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeFilter;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
@@ -155,6 +156,17 @@ public class HighlighterTest extends TestCase implements Formatter
//Currently highlights "John" and "Kennedy" separately
assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2);
}
public void testOffByOne() throws IOException
{
TermQuery query= new TermQuery( new Term( "data", "help" ));
Highlighter hg = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer( query ));
hg.setTextFragmenter( new NullFragmenter() );
String match = null;
match = hg.getBestFragment( new StandardAnalyzer(), "data", "help me [54-65]");
assertEquals("<B>help</B> me [54-65]", match);
}
public void testGetBestFragmentsFilteredQuery() throws Exception
{
RangeFilter rf=new RangeFilter("contents","john","john",true,true);
@@ -338,6 +350,40 @@ public class HighlighterTest extends TestCase implements Formatter
"us from finding matches for this record: " + numHighlights +
" found", numHighlights == 0);
}
public void testMaxSizeHighlightTruncates() throws IOException
{
String goodWord="goodtoken";
String stopWords[]={"stoppedtoken"};
TermQuery query= new TermQuery( new Term( "data", goodWord ));
SimpleHTMLFormatter fm=new SimpleHTMLFormatter();
Highlighter hg = new Highlighter(fm, new QueryScorer( query ));
hg.setTextFragmenter( new NullFragmenter() );
String match = null;
StringBuffer sb=new StringBuffer();
sb.append(goodWord);
for(int i=0;i<10000;i++)
{
sb.append(" ");
sb.append(stopWords[0]);
}
hg.setMaxDocBytesToAnalyze(100);
match = hg.getBestFragment( new StandardAnalyzer(stopWords), "data", sb.toString());
assertTrue("Matched text should be no more than 100 chars in length ",
match.length()<hg.getMaxDocBytesToAnalyze());
//add another tokenized word to the overall length - but set way beyond
//the length of text under consideration (after a large slug of stop words + whitespace)
sb.append(" ");
sb.append(goodWord);
match = hg.getBestFragment( new StandardAnalyzer(stopWords), "data", sb.toString());
assertTrue("Matched text should be no more than 100 chars in length ",
match.length()<hg.getMaxDocBytesToAnalyze());
}
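
As a usage note rather than part of this commit: for genuinely huge documents a caller would typically pair setMaxDocBytesToAnalyze with a sizing fragmenter such as SimpleFragmenter instead of NullFragmenter, so both the analyzed window and each fragment stay bounded. The sketch below assumes SimpleFragmenter's int-size constructor, uses a hypothetical helper name, and picks arbitrary limits.

import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class BoundedHighlightSketch
{
    // Hypothetical helper: best fragment for a potentially huge field value, with bounded work.
    public static String bestBoundedFragment(Query query, String fieldName, String hugeText) throws IOException
    {
        Highlighter hg = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query));
        hg.setTextFragmenter(new SimpleFragmenter(50));   // fragments of roughly 50 chars
        hg.setMaxDocBytesToAnalyze(10 * 1024);            // stop analyzing after ~10KB of text
        return hg.getBestFragment(new StandardAnalyzer(), fieldName, hugeText);
    }
}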