LUCENE-6139: TokenGroup start/end offset getters should have been returning offsets of matching tokens when there are some.

Also made the Highlighter use the getters instead of direct field access.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1649263 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
David Wayne Smiley 2015-01-03 23:23:12 +00:00
parent 8fd247cc0e
commit d251c71a97
3 changed files with 44 additions and 36 deletions

View File

@ -409,6 +409,10 @@ Bug Fixes
* LUCENE-6152: Fix double close problems in OutputStreamIndexOutput.
(Uwe Schindler)
* LUCENE-6139: Highlighter: TokenGroup start & end offset getters should have
been returning the offsets of just the matching tokens in the group when
there's a distinction. (David Smiley)
Documentation
* LUCENE-5392: Add/improve analysis package documentation to reflect

View File

@ -225,12 +225,12 @@ public class Highlighter
throw new InvalidTokenOffsetsException("Token "+ termAtt.toString() throw new InvalidTokenOffsetsException("Token "+ termAtt.toString()
+" exceeds length of provided text sized "+text.length()); +" exceeds length of provided text sized "+text.length());
} }
if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct())) if((tokenGroup.getNumTokens() >0)&&(tokenGroup.isDistinct()))
{ {
//the current token is distinct from previous tokens - //the current token is distinct from previous tokens -
// markup the cached token group info // markup the cached token group info
startOffset = tokenGroup.matchStartOffset; startOffset = tokenGroup.getStartOffset();
endOffset = tokenGroup.matchEndOffset; endOffset = tokenGroup.getEndOffset();
tokenText = text.substring(startOffset, endOffset); tokenText = text.substring(startOffset, endOffset);
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup); String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group //store any whitespace etc from between this and last group
@ -261,11 +261,11 @@ public class Highlighter
} }
currentFrag.setScore(fragmentScorer.getFragmentScore()); currentFrag.setScore(fragmentScorer.getFragmentScore());
if(tokenGroup.numTokens>0) if(tokenGroup.getNumTokens() >0)
{ {
//flush the accumulated text (same code as in above loop) //flush the accumulated text (same code as in above loop)
startOffset = tokenGroup.matchStartOffset; startOffset = tokenGroup.getStartOffset();
endOffset = tokenGroup.matchEndOffset; endOffset = tokenGroup.getEndOffset();
tokenText = text.substring(startOffset, endOffset); tokenText = text.substring(startOffset, endOffset);
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup); String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group //store any whitespace etc from between this and last group

View File

@ -24,18 +24,20 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
/** /**
* One, or several overlapping tokens, along with the score(s) and the scope of * One, or several overlapping tokens, along with the score(s) and the scope of
* the original text * the original text.
*/ */
public class TokenGroup { public class TokenGroup {
private static final int MAX_NUM_TOKENS_PER_GROUP = 50; private static final int MAX_NUM_TOKENS_PER_GROUP = 50;
Token [] tokens=new Token[MAX_NUM_TOKENS_PER_GROUP];
float[] scores = new float[MAX_NUM_TOKENS_PER_GROUP]; private Token[] tokens = new Token[MAX_NUM_TOKENS_PER_GROUP];
int numTokens = 0; private float[] scores = new float[MAX_NUM_TOKENS_PER_GROUP];
int startOffset = 0; private int numTokens = 0;
int endOffset = 0; private int startOffset = 0;
float tot; private int endOffset = 0;
int matchStartOffset, matchEndOffset; private float tot;
private int matchStartOffset;
private int matchEndOffset;
private OffsetAttribute offsetAtt; private OffsetAttribute offsetAtt;
private CharTermAttribute termAtt; private CharTermAttribute termAtt;
@ -47,8 +49,8 @@ public class TokenGroup {
void addToken(float score) { void addToken(float score) {
if (numTokens < MAX_NUM_TOKENS_PER_GROUP) { if (numTokens < MAX_NUM_TOKENS_PER_GROUP) {
int termStartOffset = offsetAtt.startOffset(); final int termStartOffset = offsetAtt.startOffset();
int termEndOffset = offsetAtt.endOffset(); final int termEndOffset = offsetAtt.endOffset();
if (numTokens == 0) { if (numTokens == 0) {
startOffset = matchStartOffset = termStartOffset; startOffset = matchStartOffset = termStartOffset;
endOffset = matchEndOffset = termEndOffset; endOffset = matchEndOffset = termEndOffset;
@ -58,8 +60,8 @@ public class TokenGroup {
endOffset = Math.max(endOffset, termEndOffset); endOffset = Math.max(endOffset, termEndOffset);
if (score > 0) { if (score > 0) {
if (tot == 0) { if (tot == 0) {
matchStartOffset = offsetAtt.startOffset(); matchStartOffset = termStartOffset;
matchEndOffset = offsetAtt.endOffset(); matchEndOffset = termEndOffset;
} else { } else {
matchStartOffset = Math.min(matchStartOffset, termStartOffset); matchStartOffset = Math.min(matchStartOffset, termStartOffset);
matchEndOffset = Math.max(matchEndOffset, termEndOffset); matchEndOffset = Math.max(matchEndOffset, termEndOffset);
@ -85,12 +87,11 @@ public class TokenGroup {
tot = 0; tot = 0;
} }
/* /**
* @param index a value between 0 and numTokens -1 * @param index a value between 0 and numTokens -1
* @return the "n"th token * @return the "n"th token
*/ */
public Token getToken(int index) public Token getToken(int index) {
{
return tokens[index]; return tokens[index];
} }
@ -104,10 +105,19 @@ public class TokenGroup {
} }
/** /**
* @return the end position in the original text * @return the earliest start offset in the original text of a matching token in this group (score &gt; 0), or
* if there are none then the earliest offset of any token in the group.
*/
public int getStartOffset() {
return matchStartOffset;
}
/**
* @return the latest end offset in the original text of a matching token in this group (score &gt; 0), or
* if there are none then the end offset of the first token added to the group.
*/ */
public int getEndOffset() { public int getEndOffset() {
return endOffset; return matchEndOffset;
} }
/** /**
@ -117,17 +127,11 @@ public class TokenGroup {
return numTokens; return numTokens;
} }
/**
* @return the start position in the original text
*/
public int getStartOffset() {
return startOffset;
}
/** /**
* @return all tokens' scores summed up * @return all tokens' scores summed up
*/ */
public float getTotalScore() { public float getTotalScore() {
return tot; return tot;
} }
} }