mirror of https://github.com/apache/lucene.git
LUCENE-6139: TokenGroup start/end offset getters should have been returning offsets of matching tokens when there are some.
Also made the Highlighter use the getters instead of direct field access.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1649263 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8fd247cc0e
commit
d251c71a97
|
@ -409,6 +409,10 @@ Bug Fixes
|
||||||
* LUCENE-6152: Fix double close problems in OutputStreamIndexOutput.
|
* LUCENE-6152: Fix double close problems in OutputStreamIndexOutput.
|
||||||
(Uwe Schindler)
|
(Uwe Schindler)
|
||||||
|
|
||||||
|
* LUCENE-6139: Highlighter: TokenGroup start & end offset getters should have
|
||||||
|
been returning the offsets of just the matching tokens in the group when
|
||||||
|
there's a distinction. (David Smiley)
|
||||||
|
|
||||||
Documentation
|
Documentation
|
||||||
|
|
||||||
* LUCENE-5392: Add/improve analysis package documentation to reflect
|
* LUCENE-5392: Add/improve analysis package documentation to reflect
|
||||||
|
|
|
@ -225,12 +225,12 @@ public class Highlighter
|
||||||
throw new InvalidTokenOffsetsException("Token "+ termAtt.toString()
|
throw new InvalidTokenOffsetsException("Token "+ termAtt.toString()
|
||||||
+" exceeds length of provided text sized "+text.length());
|
+" exceeds length of provided text sized "+text.length());
|
||||||
}
|
}
|
||||||
if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct()))
|
if((tokenGroup.getNumTokens() >0)&&(tokenGroup.isDistinct()))
|
||||||
{
|
{
|
||||||
//the current token is distinct from previous tokens -
|
//the current token is distinct from previous tokens -
|
||||||
// markup the cached token group info
|
// markup the cached token group info
|
||||||
startOffset = tokenGroup.matchStartOffset;
|
startOffset = tokenGroup.getStartOffset();
|
||||||
endOffset = tokenGroup.matchEndOffset;
|
endOffset = tokenGroup.getEndOffset();
|
||||||
tokenText = text.substring(startOffset, endOffset);
|
tokenText = text.substring(startOffset, endOffset);
|
||||||
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
|
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
|
||||||
//store any whitespace etc from between this and last group
|
//store any whitespace etc from between this and last group
|
||||||
|
@ -261,11 +261,11 @@ public class Highlighter
|
||||||
}
|
}
|
||||||
currentFrag.setScore(fragmentScorer.getFragmentScore());
|
currentFrag.setScore(fragmentScorer.getFragmentScore());
|
||||||
|
|
||||||
if(tokenGroup.numTokens>0)
|
if(tokenGroup.getNumTokens() >0)
|
||||||
{
|
{
|
||||||
//flush the accumulated text (same code as in above loop)
|
//flush the accumulated text (same code as in above loop)
|
||||||
startOffset = tokenGroup.matchStartOffset;
|
startOffset = tokenGroup.getStartOffset();
|
||||||
endOffset = tokenGroup.matchEndOffset;
|
endOffset = tokenGroup.getEndOffset();
|
||||||
tokenText = text.substring(startOffset, endOffset);
|
tokenText = text.substring(startOffset, endOffset);
|
||||||
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
|
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
|
||||||
//store any whitespace etc from between this and last group
|
//store any whitespace etc from between this and last group
|
||||||
|
|
|
@ -24,18 +24,20 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* One, or several overlapping tokens, along with the score(s) and the scope of
|
* One, or several overlapping tokens, along with the score(s) and the scope of
|
||||||
* the original text
|
* the original text.
|
||||||
*/
|
*/
|
||||||
public class TokenGroup {
|
public class TokenGroup {
|
||||||
|
|
||||||
private static final int MAX_NUM_TOKENS_PER_GROUP = 50;
|
private static final int MAX_NUM_TOKENS_PER_GROUP = 50;
|
||||||
Token [] tokens=new Token[MAX_NUM_TOKENS_PER_GROUP];
|
|
||||||
float[] scores = new float[MAX_NUM_TOKENS_PER_GROUP];
|
private Token[] tokens = new Token[MAX_NUM_TOKENS_PER_GROUP];
|
||||||
int numTokens = 0;
|
private float[] scores = new float[MAX_NUM_TOKENS_PER_GROUP];
|
||||||
int startOffset = 0;
|
private int numTokens = 0;
|
||||||
int endOffset = 0;
|
private int startOffset = 0;
|
||||||
float tot;
|
private int endOffset = 0;
|
||||||
int matchStartOffset, matchEndOffset;
|
private float tot;
|
||||||
|
private int matchStartOffset;
|
||||||
|
private int matchEndOffset;
|
||||||
|
|
||||||
private OffsetAttribute offsetAtt;
|
private OffsetAttribute offsetAtt;
|
||||||
private CharTermAttribute termAtt;
|
private CharTermAttribute termAtt;
|
||||||
|
@ -47,8 +49,8 @@ public class TokenGroup {
|
||||||
|
|
||||||
void addToken(float score) {
|
void addToken(float score) {
|
||||||
if (numTokens < MAX_NUM_TOKENS_PER_GROUP) {
|
if (numTokens < MAX_NUM_TOKENS_PER_GROUP) {
|
||||||
int termStartOffset = offsetAtt.startOffset();
|
final int termStartOffset = offsetAtt.startOffset();
|
||||||
int termEndOffset = offsetAtt.endOffset();
|
final int termEndOffset = offsetAtt.endOffset();
|
||||||
if (numTokens == 0) {
|
if (numTokens == 0) {
|
||||||
startOffset = matchStartOffset = termStartOffset;
|
startOffset = matchStartOffset = termStartOffset;
|
||||||
endOffset = matchEndOffset = termEndOffset;
|
endOffset = matchEndOffset = termEndOffset;
|
||||||
|
@ -58,8 +60,8 @@ public class TokenGroup {
|
||||||
endOffset = Math.max(endOffset, termEndOffset);
|
endOffset = Math.max(endOffset, termEndOffset);
|
||||||
if (score > 0) {
|
if (score > 0) {
|
||||||
if (tot == 0) {
|
if (tot == 0) {
|
||||||
matchStartOffset = offsetAtt.startOffset();
|
matchStartOffset = termStartOffset;
|
||||||
matchEndOffset = offsetAtt.endOffset();
|
matchEndOffset = termEndOffset;
|
||||||
} else {
|
} else {
|
||||||
matchStartOffset = Math.min(matchStartOffset, termStartOffset);
|
matchStartOffset = Math.min(matchStartOffset, termStartOffset);
|
||||||
matchEndOffset = Math.max(matchEndOffset, termEndOffset);
|
matchEndOffset = Math.max(matchEndOffset, termEndOffset);
|
||||||
|
@ -85,12 +87,11 @@ public class TokenGroup {
|
||||||
tot = 0;
|
tot = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/**
|
||||||
* @param index a value between 0 and numTokens -1
|
* @param index a value between 0 and numTokens -1
|
||||||
* @return the "n"th token
|
* @return the "n"th token
|
||||||
*/
|
*/
|
||||||
public Token getToken(int index)
|
public Token getToken(int index) {
|
||||||
{
|
|
||||||
return tokens[index];
|
return tokens[index];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -104,10 +105,19 @@ public class TokenGroup {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return the end position in the original text
|
* @return the earliest start offset in the original text of a matching token in this group (score > 0), or
|
||||||
|
* if there are none then the earliest offset of any token in the group.
|
||||||
|
*/
|
||||||
|
public int getStartOffset() {
|
||||||
|
return matchStartOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the latest end offset in the original text of a matching token in this group (score > 0), or
|
||||||
|
* if there are none then the latest end offset of any token in the group.
|
||||||
*/
|
*/
|
||||||
public int getEndOffset() {
|
public int getEndOffset() {
|
||||||
return endOffset;
|
return matchEndOffset;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -117,17 +127,11 @@ public class TokenGroup {
|
||||||
return numTokens;
|
return numTokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* @return the start position in the original text
|
|
||||||
*/
|
|
||||||
public int getStartOffset() {
|
|
||||||
return startOffset;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return all tokens' scores summed up
|
* @return all tokens' scores summed up
|
||||||
*/
|
*/
|
||||||
public float getTotalScore() {
|
public float getTotalScore() {
|
||||||
return tot;
|
return tot;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue