mirror of https://github.com/apache/lucene.git
Fix for http://issues.apache.org/jira/browse/LUCENE-645 with added Junit tests for this bug and related problem where last fragment can be huge if highlighting huge documents.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@432042 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent d3629f25eb
commit d516bf50d8
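For context, here is a minimal, hypothetical driver (the class name, field name, and filler word are illustrative, not part of the commit) exercising the behaviour that the new testMaxSizeHighlightTruncates test below pins down: once setMaxDocBytesToAnalyze() is set, the best fragment of an oversized document should stay within the analysis limit instead of swallowing the un-analyzed tail.

// Hypothetical driver, not part of the commit: mirrors the new test's usage of the
// contrib highlighter API (TermQuery/QueryScorer/SimpleHTMLFormatter/NullFragmenter).
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.NullFragmenter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class MaxBytesDemo
{
    public static void main(String[] args) throws Exception
    {
        TermQuery query = new TermQuery(new Term("data", "goodtoken"));
        Highlighter hg = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query));
        hg.setTextFragmenter(new NullFragmenter());   // return the highlighted text as one fragment
        hg.setMaxDocBytesToAnalyze(100);              // only the first 100 bytes are analyzed

        StringBuffer sb = new StringBuffer("goodtoken");
        for (int i = 0; i < 10000; i++)
        {
            sb.append(" filler");                     // huge tail far beyond the analysis limit
        }

        String match = hg.getBestFragment(new StandardAnalyzer(), "data", sb.toString());
        // Before this fix the returned fragment could include the whole un-analyzed tail;
        // the new test asserts it stays below the configured limit.
        System.out.println(match.length());
    }
}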
@@ -21,6 +21,7 @@ import java.util.ArrayList;
import java.util.Iterator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.PriorityQueue;
@@ -221,8 +222,8 @@ public class Highlighter
 textFragmenter.start(text);

 TokenGroup tokenGroup=new TokenGroup();

-while ((token = tokenStream.next()) != null)
+token = tokenStream.next();
+while ((token!= null)&&(token.startOffset()<maxDocBytesToAnalyze))
 {
 if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token)))
 {
@@ -251,12 +252,13 @@ public class Highlighter
 }
 }

-tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));
+tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));

-if(lastEndOffset>maxDocBytesToAnalyze)
-{
-break;
-}
+// if(lastEndOffset>maxDocBytesToAnalyze)
+// {
+// break;
+// }
+token = tokenStream.next();
 }
 currentFrag.setScore(fragmentScorer.getFragmentScore());
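The two hunks above restructure the token loop: instead of pulling tokens until the stream is exhausted and breaking mid-loop once lastEndOffset passes the limit, the stream is primed once and the bound is checked in the loop condition itself, so no token starting beyond maxDocBytesToAnalyze is ever scored. A standalone sketch of that pattern, with hypothetical class and method names and the fragment-building work elided:

// Schematic of the prime-then-test loop introduced above (illustrative only).
import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

class BoundedTokenWalk
{
    static int countAnalyzedTokens(TokenStream tokenStream, int maxDocBytesToAnalyze) throws IOException
    {
        int count = 0;
        Token token = tokenStream.next();                      // prime the loop
        while (token != null && token.startOffset() < maxDocBytesToAnalyze)
        {
            count++;                                           // the real code groups and scores the token here
            token = tokenStream.next();                        // advance at the bottom of the loop
        }
        return count;                                          // tokens past the byte limit are never touched
    }
}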
@@ -274,9 +276,18 @@ public class Highlighter
 lastEndOffset=Math.max(lastEndOffset,endOffset);
 }

-// append text after end of last token
-// if (lastEndOffset < text.length())
-// newText.append(encoder.encodeText(text.substring(lastEndOffset)));
+//Test what remains of the original text beyond the point where we stopped analyzing
+if (
+  // if there is text beyond the last token considered..
+  (lastEndOffset < text.length())
+  &&
+  // and that text is not too large...
+  (text.length()<maxDocBytesToAnalyze)
+  )
+{
+  //append it to the last fragment
+  newText.append(encoder.encodeText(text.substring(lastEndOffset)));
+}

 currentFrag.textEndPos = newText.length();
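With this hunk, the text remaining after the last analyzed token is appended to the final fragment only when the whole input fits within maxDocBytesToAnalyze; for oversized documents the un-analyzed tail is dropped, which is what previously let the last fragment grow to roughly the size of the document.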
@@ -44,6 +44,7 @@ import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeFilter;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
@@ -155,6 +156,17 @@ public class HighlighterTest extends TestCase implements Formatter
 //Currently highlights "John" and "Kennedy" separately
 assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2);
 }
+
+public void testOffByOne() throws IOException
+{
+  TermQuery query= new TermQuery( new Term( "data", "help" ));
+  Highlighter hg = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer( query ));
+  hg.setTextFragmenter( new NullFragmenter() );
+
+  String match = null;
+  match = hg.getBestFragment( new StandardAnalyzer(), "data", "help me [54-65]");
+  assertEquals("<B>help</B> me [54-65]", match);
+}
 public void testGetBestFragmentsFilteredQuery() throws Exception
 {
 RangeFilter rf=new RangeFilter("contents","john","john",true,true);
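The added testOffByOne appears to be the regression test for LUCENE-645: with a NullFragmenter, the highlighted output must reproduce the original text exactly, including the untouched tail after the last matched token.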
@@ -338,6 +350,40 @@ public class HighlighterTest extends TestCase implements Formatter
 "us from finding matches for this record: " + numHighlights +
 " found", numHighlights == 0);
 }
+public void testMaxSizeHighlightTruncates() throws IOException
+{
+  String goodWord="goodtoken";
+  String stopWords[]={"stoppedtoken"};
+
+  TermQuery query= new TermQuery( new Term( "data", goodWord ));
+  SimpleHTMLFormatter fm=new SimpleHTMLFormatter();
+  Highlighter hg = new Highlighter(fm, new QueryScorer( query ));
+  hg.setTextFragmenter( new NullFragmenter() );
+
+  String match = null;
+  StringBuffer sb=new StringBuffer();
+  sb.append(goodWord);
+  for(int i=0;i<10000;i++)
+  {
+    sb.append(" ");
+    sb.append(stopWords[0]);
+  }
+
+  hg.setMaxDocBytesToAnalyze(100);
+  match = hg.getBestFragment( new StandardAnalyzer(stopWords), "data", sb.toString());
+  assertTrue("Matched text should be no more than 100 chars in length ",
+    match.length()<hg.getMaxDocBytesToAnalyze());
+
+  //add another tokenized word to the overrall length - but set way beyond
+  //the length of text under consideration (after a large slug of stop words + whitespace)
+  sb.append(" ");
+  sb.append(goodWord);
+  match = hg.getBestFragment( new StandardAnalyzer(stopWords), "data", sb.toString());
+  assertTrue("Matched text should be no more than 100 chars in length ",
+    match.length()<hg.getMaxDocBytesToAnalyze());
+
+}
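The added testMaxSizeHighlightTruncates covers the related problem named in the commit message: it caps analysis at 100 bytes, buries a matching token under ten thousand stop tokens, and asserts that the returned fragment never exceeds the cap, even after another matching token is appended far beyond the analyzed region.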