git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@422301 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Harwood 2006-07-15 22:19:51 +00:00
parent 69299c5663
commit 5f7214b023
2 changed files with 72 additions and 62 deletions

View File

@ -25,8 +25,8 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.PriorityQueue;
/** /**
* Class used to markup highlighted terms found in the best sections of a * Class used to markup highlighted terms found in the best sections of a
* text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter}, * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
* {@link Encoder} and tokenizers. * {@link Encoder} and tokenizers.
* @author mark@searcharea.co.uk * @author mark@searcharea.co.uk
*/ */
@ -36,7 +36,7 @@ public class Highlighter
public static final int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE=50*1024; public static final int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE=50*1024;
private int maxDocBytesToAnalyze=DEFAULT_MAX_DOC_BYTES_TO_ANALYZE; private int maxDocBytesToAnalyze=DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
private Formatter formatter; private Formatter formatter;
private Encoder encoder; private Encoder encoder;
private Fragmenter textFragmenter=new SimpleFragmenter(); private Fragmenter textFragmenter=new SimpleFragmenter();
private Scorer fragmentScorer=null; private Scorer fragmentScorer=null;
@ -44,14 +44,14 @@ public class Highlighter
{ {
this(new SimpleHTMLFormatter(),fragmentScorer); this(new SimpleHTMLFormatter(),fragmentScorer);
} }
public Highlighter(Formatter formatter, Scorer fragmentScorer) public Highlighter(Formatter formatter, Scorer fragmentScorer)
{ {
this(formatter,new DefaultEncoder(),fragmentScorer); this(formatter,new DefaultEncoder(),fragmentScorer);
} }
public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer) public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
{ {
this.formatter = formatter; this.formatter = formatter;
@ -65,9 +65,9 @@ public class Highlighter
* {@link #getBestFragment(TokenStream, String)} * {@link #getBestFragment(TokenStream, String)}
* *
* @param analyzer the analyzer that will be used to split <code>text</code> * @param analyzer the analyzer that will be used to split <code>text</code>
* into chunks * into chunks
* @param text text to highlight terms in * @param text text to highlight terms in
* @param fieldName Name of field used to influence analyzer's tokenization policy * @param fieldName Name of field used to influence analyzer's tokenization policy
* *
* @return highlighted text fragment or null if no terms found * @return highlighted text fragment or null if no terms found
*/ */
@ -77,18 +77,18 @@ public class Highlighter
TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text)); TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
return getBestFragment(tokenStream, text); return getBestFragment(tokenStream, text);
} }
/** /**
* Highlights chosen terms in a text, extracting the most relevant section. * Highlights chosen terms in a text, extracting the most relevant section.
* The document text is analysed in chunks to record hit statistics * The document text is analysed in chunks to record hit statistics
* across the document. After accumulating stats, the fragment with the highest score * across the document. After accumulating stats, the fragment with the highest score
* is returned * is returned
* *
* @param tokenStream a stream of tokens identified in the text parameter, including offset information. * @param tokenStream a stream of tokens identified in the text parameter, including offset information.
* This is typically produced by an analyzer re-parsing a document's * This is typically produced by an analyzer re-parsing a document's
* text. Some work may be done on retrieving TokenStreams more efficiently * text. Some work may be done on retrieving TokenStreams more efficiently
* by adding support for storing original text position data in the Lucene * by adding support for storing original text position data in the Lucene
* index but this support is not currently available (as of Lucene 1.4 rc2). * index but this support is not currently available (as of Lucene 1.4 rc2).
* @param text text to highlight terms in * @param text text to highlight terms in
* *
* @return highlighted text fragment or null if no terms found * @return highlighted text fragment or null if no terms found
@ -110,7 +110,7 @@ public class Highlighter
* {@link #getBestFragments(TokenStream, String, int)} * {@link #getBestFragments(TokenStream, String, int)}
* *
* @param analyzer the analyzer that will be used to split <code>text</code> * @param analyzer the analyzer that will be used to split <code>text</code>
* into chunks * into chunks
* @param text text to highlight terms in * @param text text to highlight terms in
* @param maxNumFragments the maximum number of fragments. * @param maxNumFragments the maximum number of fragments.
* @deprecated This method incorrectly hardcodes the choice of fieldname. Use the * @deprecated This method incorrectly hardcodes the choice of fieldname. Use the
@ -118,7 +118,7 @@ public class Highlighter
* @return highlighted text fragments (between 0 and maxNumFragments number of fragments) * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
*/ */
public final String[] getBestFragments( public final String[] getBestFragments(
Analyzer analyzer, Analyzer analyzer,
String text, String text,
int maxNumFragments) int maxNumFragments)
throws IOException throws IOException
@ -132,7 +132,7 @@ public class Highlighter
* {@link #getBestFragments(TokenStream, String, int)} * {@link #getBestFragments(TokenStream, String, int)}
* *
* @param analyzer the analyzer that will be used to split <code>text</code> * @param analyzer the analyzer that will be used to split <code>text</code>
* into chunks * into chunks
* @param fieldName the name of the field being highlighted (used by analyzer) * @param fieldName the name of the field being highlighted (used by analyzer)
* @param text text to highlight terms in * @param text text to highlight terms in
* @param maxNumFragments the maximum number of fragments. * @param maxNumFragments the maximum number of fragments.
@ -140,7 +140,7 @@ public class Highlighter
* @return highlighted text fragments (between 0 and maxNumFragments number of fragments) * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
*/ */
public final String[] getBestFragments( public final String[] getBestFragments(
Analyzer analyzer, Analyzer analyzer,
String fieldName, String fieldName,
String text, String text,
int maxNumFragments) int maxNumFragments)
@ -149,12 +149,12 @@ public class Highlighter
TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text)); TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
return getBestFragments(tokenStream, text, maxNumFragments); return getBestFragments(tokenStream, text, maxNumFragments);
} }
/** /**
* Highlights chosen terms in a text, extracting the most relevant sections. * Highlights chosen terms in a text, extracting the most relevant sections.
* The document text is analysed in chunks to record hit statistics * The document text is analysed in chunks to record hit statistics
* across the document. After accumulating stats, the fragments with the highest scores * across the document. After accumulating stats, the fragments with the highest scores
* are returned as an array of strings in order of score (contiguous fragments are merged into * are returned as an array of strings in order of score (contiguous fragments are merged into
* one in their original order to improve readability) * one in their original order to improve readability)
* *
* @param text text to highlight terms in * @param text text to highlight terms in
@ -163,13 +163,13 @@ public class Highlighter
* @return highlighted text fragments (between 0 and maxNumFragments number of fragments) * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
*/ */
public final String[] getBestFragments( public final String[] getBestFragments(
TokenStream tokenStream, TokenStream tokenStream,
String text, String text,
int maxNumFragments) int maxNumFragments)
throws IOException throws IOException
{ {
maxNumFragments = Math.max(1, maxNumFragments); //sanity check maxNumFragments = Math.max(1, maxNumFragments); //sanity check
TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments); TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments);
//Get text //Get text
@ -183,12 +183,12 @@ public class Highlighter
} }
return (String[]) fragTexts.toArray(new String[0]); return (String[]) fragTexts.toArray(new String[0]);
} }
/** /**
* Low level api to get the most relevant (formatted) sections of the document. * Low level api to get the most relevant (formatted) sections of the document.
* This method has been made public to allow visibility of score information held in TextFragment objects. * This method has been made public to allow visibility of score information held in TextFragment objects.
* Thanks to Jason Calabrese for help in redefining the interface. * Thanks to Jason Calabrese for help in redefining the interface.
* @param tokenStream * @param tokenStream
* @param text * @param text
* @param maxNumFragments * @param maxNumFragments
@ -196,7 +196,7 @@ public class Highlighter
* @throws IOException * @throws IOException
*/ */
public final TextFragment[] getBestTextFragments( public final TextFragment[] getBestTextFragments(
TokenStream tokenStream, TokenStream tokenStream,
String text, String text,
boolean mergeContiguousFragments, boolean mergeContiguousFragments,
int maxNumFragments) int maxNumFragments)
@ -208,7 +208,7 @@ public class Highlighter
TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size()); TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
fragmentScorer.startFragment(currentFrag); fragmentScorer.startFragment(currentFrag);
docFrags.add(currentFrag); docFrags.add(currentFrag);
FragmentQueue fragQueue = new FragmentQueue(maxNumFragments); FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
try try
@ -219,27 +219,27 @@ public class Highlighter
int endOffset; int endOffset;
int lastEndOffset = 0; int lastEndOffset = 0;
textFragmenter.start(text); textFragmenter.start(text);
TokenGroup tokenGroup=new TokenGroup(); TokenGroup tokenGroup=new TokenGroup();
while ((token = tokenStream.next()) != null) while ((token = tokenStream.next()) != null)
{ {
if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token))) if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token)))
{ {
//the current token is distinct from previous tokens - //the current token is distinct from previous tokens -
// markup the cached token group info // markup the cached token group info
startOffset = tokenGroup.startOffset; startOffset = tokenGroup.matchStartOffset;
endOffset = tokenGroup.endOffset; endOffset = tokenGroup.matchEndOffset;
tokenText = text.substring(startOffset, endOffset); tokenText = text.substring(startOffset, endOffset);
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup); String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group //store any whitespace etc from between this and last group
if (startOffset > lastEndOffset) if (startOffset > lastEndOffset)
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset))); newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
newText.append(markedUpText); newText.append(markedUpText);
lastEndOffset=endOffset; lastEndOffset=Math.max(endOffset, lastEndOffset);
tokenGroup.clear(); tokenGroup.clear();
//check if current token marks the start of a new fragment //check if current token marks the start of a new fragment
if(textFragmenter.isNewFragment(token)) if(textFragmenter.isNewFragment(token))
{ {
currentFrag.setScore(fragmentScorer.getFragmentScore()); currentFrag.setScore(fragmentScorer.getFragmentScore());
@ -250,28 +250,28 @@ public class Highlighter
docFrags.add(currentFrag); docFrags.add(currentFrag);
} }
} }
tokenGroup.addToken(token,fragmentScorer.getTokenScore(token)); tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));
if(lastEndOffset>maxDocBytesToAnalyze) if(lastEndOffset>maxDocBytesToAnalyze)
{ {
break; break;
} }
} }
currentFrag.setScore(fragmentScorer.getFragmentScore()); currentFrag.setScore(fragmentScorer.getFragmentScore());
if(tokenGroup.numTokens>0) if(tokenGroup.numTokens>0)
{ {
//flush the accumulated text (same code as in above loop) //flush the accumulated text (same code as in above loop)
startOffset = tokenGroup.startOffset; startOffset = tokenGroup.matchStartOffset;
endOffset = tokenGroup.endOffset; endOffset = tokenGroup.matchEndOffset;
tokenText = text.substring(startOffset, endOffset); tokenText = text.substring(startOffset, endOffset);
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup); String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group //store any whitespace etc from between this and last group
if (startOffset > lastEndOffset) if (startOffset > lastEndOffset)
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset))); newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
newText.append(markedUpText); newText.append(markedUpText);
lastEndOffset=endOffset; lastEndOffset=Math.max(lastEndOffset,endOffset);
} }
// append text after end of last token // append text after end of last token
@ -286,7 +286,7 @@ public class Highlighter
currentFrag = (TextFragment) i.next(); currentFrag = (TextFragment) i.next();
//If you are running with a version of Lucene before 11th Sept 03 //If you are running with a version of Lucene before 11th Sept 03
// you do not have PriorityQueue.insert() - so uncomment the code below // you do not have PriorityQueue.insert() - so uncomment the code below
/* /*
if (currentFrag.getScore() >= minScore) if (currentFrag.getScore() >= minScore)
{ {
@ -296,8 +296,8 @@ public class Highlighter
fragQueue.pop(); // remove lowest in hit queue fragQueue.pop(); // remove lowest in hit queue
minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
} }
} }
*/ */
//The above code caused a problem as a result of Christoph Goller's 11th Sept 03 //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
@ -312,7 +312,7 @@ public class Highlighter
{ {
frag[i] = (TextFragment) fragQueue.pop(); frag[i] = (TextFragment) fragQueue.pop();
} }
//merge any contiguous fragments to improve readability //merge any contiguous fragments to improve readability
if(mergeContiguousFragments) if(mergeContiguousFragments)
{ {
@ -325,9 +325,9 @@ public class Highlighter
fragTexts.add(frag[i]); fragTexts.add(frag[i]);
} }
} }
frag= (TextFragment[]) fragTexts.toArray(new TextFragment[0]); frag= (TextFragment[]) fragTexts.toArray(new TextFragment[0]);
} }
return frag; return frag;
} }
@ -347,7 +347,7 @@ public class Highlighter
} }
/** Improves readability of a score-sorted list of TextFragments by merging any fragments /** Improves readability of a score-sorted list of TextFragments by merging any fragments
* that were contiguous in the original text into one larger fragment with the correct order. * that were contiguous in the original text into one larger fragment with the correct order.
* This will leave a "null" in the array entry for the lesser scored fragment. * This will leave a "null" in the array entry for the lesser scored fragment.
* *

View File

@ -30,37 +30,52 @@ public class TokenGroup
int numTokens=0; int numTokens=0;
int startOffset=0; int startOffset=0;
int endOffset=0; int endOffset=0;
float tot;
void addToken(Token token, float score) int matchStartOffset, matchEndOffset;
void addToken(Token token, float score)
{ {
if(numTokens < MAX_NUM_TOKENS_PER_GROUP) if(numTokens < MAX_NUM_TOKENS_PER_GROUP)
{ {
if(numTokens==0) if(numTokens==0)
{ {
startOffset=token.startOffset(); startOffset=matchStartOffset=token.startOffset();
endOffset=token.endOffset(); endOffset=matchEndOffset=token.endOffset();
tot += score;
} }
else else
{ {
startOffset=Math.min(startOffset,token.startOffset()); startOffset=Math.min(startOffset,token.startOffset());
endOffset=Math.max(endOffset,token.endOffset()); endOffset=Math.max(endOffset,token.endOffset());
} if (score>0) {
if (tot==0) {
matchStartOffset=token.startOffset();
matchEndOffset=token.endOffset();
} else {
matchStartOffset=Math.min(matchStartOffset,token.startOffset());
matchEndOffset=Math.max(matchEndOffset,token.endOffset());
}
tot+=score;
}
}
tokens[numTokens]=token; tokens[numTokens]=token;
scores[numTokens]=score; scores[numTokens]=score;
numTokens++; numTokens++;
} }
} }
boolean isDistinct(Token token) boolean isDistinct(Token token)
{ {
return token.startOffset()>=endOffset; return token.startOffset()>=endOffset;
} }
void clear() void clear()
{ {
numTokens=0; numTokens=0;
tot=0;
} }
/** /**
@ -112,11 +127,6 @@ public class TokenGroup
*/ */
public float getTotalScore() public float getTotalScore()
{ {
float total=0; return tot;
for (int i = 0; i < numTokens; i++)
{
total+=scores[i];
}
return total;
} }
} }