mirror of https://github.com/apache/lucene.git
Added Yonik's patch to deal with overlapping tokens - see http://issues.apache.org/jira/browse/LUCENE-627?page=comments#action_12421332
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@422301 13f79535-47bb-0310-9956-ffa450edef68
parent 69299c5663
commit 5f7214b023
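Most of the Highlighter hunks below appear to touch only whitespace; the substantive changes are that marked-up substrings are now taken from tokenGroup.matchStartOffset/matchEndOffset instead of the whole group's startOffset/endOffset, and that lastEndOffset is guarded with Math.max so an overlapping token group can no longer move it backwards. For orientation, here is a minimal usage sketch of the affected code path. It is not part of the commit: the field name, query, analyzer, and sample text are illustrative assumptions.

// Hypothetical usage sketch (not from this commit); Lucene 2.x-era contrib highlighter APIs.
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class HighlightDemo
{
    public static void main(String[] args) throws Exception
    {
        String text = "the quick brown fox jumps over the lazy dog";
        Query query = new TermQuery(new Term("contents", "fox"));

        // getBestFragment() ends up in getBestTextFragments(), which is where the
        // patched TokenGroup match offsets are consumed.
        Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query));
        TokenStream tokens = new StandardAnalyzer().tokenStream("contents", new StringReader(text));
        System.out.println(highlighter.getBestFragment(tokens, text));
    }
}

With an analyzer that injects overlapping tokens (a synonym filter, for example), the pre-patch code could wrap the entire overlapping region in highlight markup even when only one of the stacked tokens matched the query.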
@@ -25,8 +25,8 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.util.PriorityQueue;
 
 /**
  * Class used to markup highlighted terms found in the best sections of a
  * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
  * {@link Encoder} and tokenizers.
  * @author mark@searcharea.co.uk
  */
@@ -36,7 +36,7 @@ public class Highlighter
     public static final int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE=50*1024;
     private int maxDocBytesToAnalyze=DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
     private Formatter formatter;
     private Encoder encoder;
     private Fragmenter textFragmenter=new SimpleFragmenter();
     private Scorer fragmentScorer=null;
 
@@ -44,14 +44,14 @@ public class Highlighter
     {
         this(new SimpleHTMLFormatter(),fragmentScorer);
     }
 
 
     public Highlighter(Formatter formatter, Scorer fragmentScorer)
     {
         this(formatter,new DefaultEncoder(),fragmentScorer);
     }
 
 
     public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
     {
         this.formatter = formatter;
@@ -65,9 +65,9 @@ public class Highlighter
      * {@link #getBestFragment(TokenStream, String)}
      *
      * @param analyzer the analyzer that will be used to split <code>text</code>
      * into chunks
      * @param text text to highlight terms in
      * @param fieldName Name of field used to influence analyzer's tokenization policy
      *
      * @return highlighted text fragment or null if no terms found
      */
@@ -77,18 +77,18 @@ public class Highlighter
         TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
         return getBestFragment(tokenStream, text);
     }
 
     /**
      * Highlights chosen terms in a text, extracting the most relevant section.
      * The document text is analysed in chunks to record hit statistics
      * across the document. After accumulating stats, the fragment with the highest score
      * is returned
      *
      * @param tokenStream a stream of tokens identified in the text parameter, including offset information.
      * This is typically produced by an analyzer re-parsing a document's
      * text. Some work may be done on retrieving TokenStreams more efficently
      * by adding support for storing original text position data in the Lucene
      * index but this support is not currently available (as of Lucene 1.4 rc2).
      * @param text text to highlight terms in
      *
      * @return highlighted text fragment or null if no terms found
@@ -110,7 +110,7 @@ public class Highlighter
      * {@link #getBestFragments(TokenStream, String, int)}
      *
      * @param analyzer the analyzer that will be used to split <code>text</code>
      * into chunks
      * @param text text to highlight terms in
      * @param maxNumFragments the maximum number of fragments.
      * @deprecated This method incorrectly hardcodes the choice of fieldname. Use the
@@ -118,7 +118,7 @@ public class Highlighter
      * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
      */
     public final String[] getBestFragments(
         Analyzer analyzer,
         String text,
         int maxNumFragments)
         throws IOException
@@ -132,7 +132,7 @@ public class Highlighter
      * {@link #getBestFragments(TokenStream, String, int)}
      *
      * @param analyzer the analyzer that will be used to split <code>text</code>
      * into chunks
      * @param fieldName the name of the field being highlighted (used by analyzer)
      * @param text text to highlight terms in
      * @param maxNumFragments the maximum number of fragments.
@@ -140,7 +140,7 @@ public class Highlighter
      * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
      */
     public final String[] getBestFragments(
         Analyzer analyzer,
         String fieldName,
         String text,
         int maxNumFragments)
@@ -149,12 +149,12 @@ public class Highlighter
         TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
         return getBestFragments(tokenStream, text, maxNumFragments);
     }
 
     /**
      * Highlights chosen terms in a text, extracting the most relevant sections.
      * The document text is analysed in chunks to record hit statistics
      * across the document. After accumulating stats, the fragments with the highest scores
      * are returned as an array of strings in order of score (contiguous fragments are merged into
      * one in their original order to improve readability)
      *
      * @param text text to highlight terms in
@@ -163,13 +163,13 @@ public class Highlighter
      * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
      */
     public final String[] getBestFragments(
         TokenStream tokenStream,
         String text,
         int maxNumFragments)
         throws IOException
     {
         maxNumFragments = Math.max(1, maxNumFragments); //sanity check
 
         TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments);
 
         //Get text
@@ -183,12 +183,12 @@ public class Highlighter
         }
         return (String[]) fragTexts.toArray(new String[0]);
     }
 
 
     /**
      * Low level api to get the most relevant (formatted) sections of the document.
      * This method has been made public to allow visibility of score information held in TextFragment objects.
      * Thanks to Jason Calabrese for help in redefining the interface.
      * @param tokenStream
      * @param text
      * @param maxNumFragments
@@ -196,7 +196,7 @@ public class Highlighter
      * @throws IOException
      */
     public final TextFragment[] getBestTextFragments(
         TokenStream tokenStream,
         String text,
         boolean mergeContiguousFragments,
         int maxNumFragments)
@@ -208,7 +208,7 @@ public class Highlighter
         TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
         fragmentScorer.startFragment(currentFrag);
         docFrags.add(currentFrag);
 
         FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
 
         try
@@ -219,27 +219,27 @@ public class Highlighter
         int endOffset;
         int lastEndOffset = 0;
         textFragmenter.start(text);
 
         TokenGroup tokenGroup=new TokenGroup();
 
         while ((token = tokenStream.next()) != null)
         {
             if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token)))
             {
                 //the current token is distinct from previous tokens -
                 // markup the cached token group info
-                startOffset = tokenGroup.startOffset;
-                endOffset = tokenGroup.endOffset;
+                startOffset = tokenGroup.matchStartOffset;
+                endOffset = tokenGroup.matchEndOffset;
                 tokenText = text.substring(startOffset, endOffset);
                 String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
                 //store any whitespace etc from between this and last group
                 if (startOffset > lastEndOffset)
                     newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
                 newText.append(markedUpText);
-                lastEndOffset=endOffset;
+                lastEndOffset=Math.max(endOffset, lastEndOffset);
                 tokenGroup.clear();
 
                 //check if current token marks the start of a new fragment
                 if(textFragmenter.isNewFragment(token))
                 {
                     currentFrag.setScore(fragmentScorer.getFragmentScore());
@@ -250,28 +250,28 @@ public class Highlighter
                     docFrags.add(currentFrag);
                 }
             }
 
             tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));
 
             if(lastEndOffset>maxDocBytesToAnalyze)
             {
                 break;
             }
         }
         currentFrag.setScore(fragmentScorer.getFragmentScore());
 
         if(tokenGroup.numTokens>0)
         {
             //flush the accumulated text (same code as in above loop)
-            startOffset = tokenGroup.startOffset;
-            endOffset = tokenGroup.endOffset;
+            startOffset = tokenGroup.matchStartOffset;
+            endOffset = tokenGroup.matchEndOffset;
             tokenText = text.substring(startOffset, endOffset);
             String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
             //store any whitespace etc from between this and last group
             if (startOffset > lastEndOffset)
                 newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
             newText.append(markedUpText);
-            lastEndOffset=endOffset;
+            lastEndOffset=Math.max(lastEndOffset,endOffset);
         }
 
         // append text after end of last token
@@ -286,7 +286,7 @@ public class Highlighter
             currentFrag = (TextFragment) i.next();
 
             //If you are running with a version of Lucene before 11th Sept 03
             // you do not have PriorityQueue.insert() - so uncomment the code below
             /*
             if (currentFrag.getScore() >= minScore)
             {
@@ -296,8 +296,8 @@ public class Highlighter
                 fragQueue.pop(); // remove lowest in hit queue
                 minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
             }
 
 
             }
             */
             //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
@@ -312,7 +312,7 @@ public class Highlighter
         {
             frag[i] = (TextFragment) fragQueue.pop();
         }
 
         //merge any contiguous fragments to improve readability
         if(mergeContiguousFragments)
         {
@@ -325,9 +325,9 @@ public class Highlighter
                     fragTexts.add(frag[i]);
                 }
             }
             frag= (TextFragment[]) fragTexts.toArray(new TextFragment[0]);
         }
 
         return frag;
 
     }
@@ -347,7 +347,7 @@ public class Highlighter
     }
 
 
     /** Improves readability of a score-sorted list of TextFragments by merging any fragments
      * that were contiguous in the original text into one larger fragment with the correct order.
      * This will leave a "null" in the array entry for the lesser scored fragment.
      *

@@ -30,37 +30,52 @@ public class TokenGroup
     int numTokens=0;
     int startOffset=0;
     int endOffset=0;
+    float tot;
+
+    int matchStartOffset, matchEndOffset;
 
 
     void addToken(Token token, float score)
     {
         if(numTokens < MAX_NUM_TOKENS_PER_GROUP)
         {
             if(numTokens==0)
             {
-                startOffset=token.startOffset();
-                endOffset=token.endOffset();
+                startOffset=matchStartOffset=token.startOffset();
+                endOffset=matchEndOffset=token.endOffset();
+                tot += score;
             }
             else
             {
                 startOffset=Math.min(startOffset,token.startOffset());
                 endOffset=Math.max(endOffset,token.endOffset());
+                if (score>0) {
+                    if (tot==0) {
+                        matchStartOffset=token.startOffset();
+                        matchEndOffset=token.endOffset();
+                    } else {
+                        matchStartOffset=Math.min(matchStartOffset,token.startOffset());
+                        matchEndOffset=Math.max(matchEndOffset,token.endOffset());
+                    }
+                    tot+=score;
+                }
             }
             tokens[numTokens]=token;
             scores[numTokens]=score;
             numTokens++;
         }
     }
 
     boolean isDistinct(Token token)
     {
         return token.startOffset()>=endOffset;
     }
 
 
     void clear()
     {
         numTokens=0;
+        tot=0;
     }
 
     /**
@@ -112,11 +127,6 @@ public class TokenGroup
      */
     public float getTotalScore()
     {
-        float total=0;
-        for (int i = 0; i < numTokens; i++)
-        {
-            total+=scores[i];
-        }
-        return total;
+        return tot;
     }
 }
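To make the TokenGroup hunks easier to follow, here is a self-contained sketch (illustrative only, not the patched class) that mirrors the new bookkeeping: startOffset/endOffset still span every token in an overlapping group, while the added matchStartOffset/matchEndOffset and the running score tot cover only the tokens that scored against the query, so the highlighted substring shrinks to the matching term. The offsets in main() model a hypothetical synonym token stacked over two words.

// Illustrative sketch only; mirrors the bookkeeping added to TokenGroup,
// using plain ints so the overlapping-token case can be traced by hand.
public class MatchOffsetSketch
{
    int startOffset, endOffset;           // extent of the whole overlapping group
    int matchStartOffset, matchEndOffset; // extent of scoring (query-matching) tokens only
    float tot;                            // running total score (what getTotalScore() now returns)
    int numTokens;

    void addToken(int tokenStart, int tokenEnd, float score)
    {
        if (numTokens == 0)
        {
            startOffset = matchStartOffset = tokenStart;
            endOffset = matchEndOffset = tokenEnd;
            tot += score;
        }
        else
        {
            startOffset = Math.min(startOffset, tokenStart);
            endOffset = Math.max(endOffset, tokenEnd);
            if (score > 0)
            {
                if (tot == 0)
                {
                    // first scoring token: the match extent starts fresh
                    matchStartOffset = tokenStart;
                    matchEndOffset = tokenEnd;
                }
                else
                {
                    matchStartOffset = Math.min(matchStartOffset, tokenStart);
                    matchEndOffset = Math.max(matchEndOffset, tokenEnd);
                }
                tot += score;
            }
        }
        numTokens++;
    }

    public static void main(String[] args)
    {
        // "big apple" with an injected synonym token spanning offsets 0-9;
        // only "apple" (offsets 4-9) matches the query.
        MatchOffsetSketch group = new MatchOffsetSketch();
        group.addToken(0, 9, 0.0f);  // synonym token, no query match
        group.addToken(0, 3, 0.0f);  // "big", no query match
        group.addToken(4, 9, 1.0f);  // "apple", matches the query
        // prints group=[0,9] match=[4,9]: markup now covers only the match
        System.out.println("group=[" + group.startOffset + "," + group.endOffset
                + "] match=[" + group.matchStartOffset + "," + group.matchEndOffset + "]");
    }
}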