mirror of https://github.com/apache/lucene.git
Added Yonik's patch to deal with overlapping tokens - see http://issues.apache.org/jira/browse/LUCENE-627?page=comments#action_12421332
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@422301 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
69299c5663
commit
5f7214b023
|
@ -228,15 +228,15 @@ public class Highlighter
|
|||
{
|
||||
//the current token is distinct from previous tokens -
|
||||
// markup the cached token group info
|
||||
startOffset = tokenGroup.startOffset;
|
||||
endOffset = tokenGroup.endOffset;
|
||||
startOffset = tokenGroup.matchStartOffset;
|
||||
endOffset = tokenGroup.matchEndOffset;
|
||||
tokenText = text.substring(startOffset, endOffset);
|
||||
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
|
||||
//store any whitespace etc from between this and last group
|
||||
if (startOffset > lastEndOffset)
|
||||
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
|
||||
newText.append(markedUpText);
|
||||
lastEndOffset=endOffset;
|
||||
lastEndOffset=Math.max(endOffset, lastEndOffset);
|
||||
tokenGroup.clear();
|
||||
|
||||
//check if current token marks the start of a new fragment
|
||||
|
@ -251,7 +251,7 @@ public class Highlighter
|
|||
}
|
||||
}
|
||||
|
||||
tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));
|
||||
tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));
|
||||
|
||||
if(lastEndOffset>maxDocBytesToAnalyze)
|
||||
{
|
||||
|
@ -263,15 +263,15 @@ public class Highlighter
|
|||
if(tokenGroup.numTokens>0)
|
||||
{
|
||||
//flush the accumulated text (same code as in above loop)
|
||||
startOffset = tokenGroup.startOffset;
|
||||
endOffset = tokenGroup.endOffset;
|
||||
startOffset = tokenGroup.matchStartOffset;
|
||||
endOffset = tokenGroup.matchEndOffset;
|
||||
tokenText = text.substring(startOffset, endOffset);
|
||||
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
|
||||
//store any whitespace etc from between this and last group
|
||||
if (startOffset > lastEndOffset)
|
||||
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
|
||||
newText.append(markedUpText);
|
||||
lastEndOffset=endOffset;
|
||||
lastEndOffset=Math.max(lastEndOffset,endOffset);
|
||||
}
|
||||
|
||||
// append text after end of last token
|
||||
|
|
|
@ -30,22 +30,36 @@ public class TokenGroup
|
|||
int numTokens=0;
|
||||
int startOffset=0;
|
||||
int endOffset=0;
|
||||
float tot;
|
||||
|
||||
int matchStartOffset, matchEndOffset;
|
||||
|
||||
|
||||
void addToken(Token token, float score)
|
||||
void addToken(Token token, float score)
|
||||
{
|
||||
if(numTokens < MAX_NUM_TOKENS_PER_GROUP)
|
||||
{
|
||||
if(numTokens==0)
|
||||
{
|
||||
startOffset=token.startOffset();
|
||||
endOffset=token.endOffset();
|
||||
startOffset=matchStartOffset=token.startOffset();
|
||||
endOffset=matchEndOffset=token.endOffset();
|
||||
tot += score;
|
||||
}
|
||||
else
|
||||
{
|
||||
startOffset=Math.min(startOffset,token.startOffset());
|
||||
endOffset=Math.max(endOffset,token.endOffset());
|
||||
}
|
||||
if (score>0) {
|
||||
if (tot==0) {
|
||||
matchStartOffset=token.startOffset();
|
||||
matchEndOffset=token.endOffset();
|
||||
} else {
|
||||
matchStartOffset=Math.min(matchStartOffset,token.startOffset());
|
||||
matchEndOffset=Math.max(matchEndOffset,token.endOffset());
|
||||
}
|
||||
tot+=score;
|
||||
}
|
||||
}
|
||||
tokens[numTokens]=token;
|
||||
scores[numTokens]=score;
|
||||
numTokens++;
|
||||
|
@ -61,6 +75,7 @@ public class TokenGroup
|
|||
// Resets the group so it can be refilled: zeroes the token count and the
// running score total. The offsets and the tokens/scores arrays are left as-is.
void clear()
|
||||
{
|
||||
numTokens=0;
|
||||
tot=0;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -112,11 +127,6 @@ public class TokenGroup
|
|||
*/
|
||||
public float getTotalScore()
|
||||
{
|
||||
// NOTE(review): two alternative bodies appear back to back below - a loop that
// sums scores[0..numTokens) into a local and returns it, followed by a direct
// return of the running total 'tot'. Read literally, the second return is
// unreachable; presumably only one of the two is meant to be live - confirm.
float total=0;
|
||||
for (int i = 0; i < numTokens; i++)
|
||||
{
|
||||
total+=scores[i];
|
||||
}
|
||||
return total;
|
||||
return tot;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue