LUCENE-1695: Update the Highlighter to use the new TokenStream API. This issue breaks backwards compatibility with some public classes. If you have implemented custom Fragmenters or Scorers, you will need to adjust them to work with the new TokenStream API. Rather than being passed one Token at a time, you will be given a TokenStream to initialize your implementation with - store the Attributes you are interested in locally and access them on each call to the method that used to be passed a new Token. Look at the included updated implementations for examples.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@799455 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Robert Miller 2009-07-30 22:00:47 +00:00
parent 7ecaa8c990
commit f73a4f4324
13 changed files with 569 additions and 401 deletions

View File

@ -11,7 +11,12 @@ Changes in runtime behavior
API Changes
(None)
1. LUCENE-1695: Update the Highlighter to use the new TokenStream API. This issue breaks backwards
compatibility with some public classes. If you have implemented custom Fragmenters or Scorers,
you will need to adjust them to work with the new TokenStream API. Rather than getting passed a
Token at a time, you will be given a TokenStream to init your impl with - store the Attributes
you are interested in locally and access them on each call to the method that used to pass a new
Token. Look at the included updated impls for examples. (Mark Miller)
Bug fixes
@ -41,9 +46,6 @@ Bug fixes
8. LUCENE-1491: EdgeNGramTokenFilter no longer stops on tokens shorter than minimum n-gram size.
(Todd Teak via Otis Gospodnetic)
9. LUCENE-1752: Missing highlights when terms were repeated in separate, nested, boolean or
disjunction queries. (Koji Sekiguchi, Mark Miller)
New features

View File

@ -16,24 +16,31 @@ package org.apache.lucene.search.highlight;
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
* Implements the policy for breaking text into multiple fragments for consideration
* by the {@link Highlighter} class. A sophisticated implementation may do this on the basis
* of detecting end of sentences in the text.
* Implements the policy for breaking text into multiple fragments for
* consideration by the {@link Highlighter} class. A sophisticated
* implementation may do this on the basis of detecting end of sentences in the
* text.
*/
public interface Fragmenter
{
/**
* Initializes the Fragmenter
* @param originalText
*/
public void start(String originalText);
public interface Fragmenter {
/**
* Test to see if this token from the stream should be held in a new TextFragment
* @param nextToken
*/
public boolean isNewFragment(Token nextToken);
/**
* Initializes the Fragmenter. You can grab references to the Attributes you are
* interested in from tokenStream and then access the values in isNewFragment.
*
* @param originalText
* @param tokenStream
*/
public void start(String originalText, TokenStream tokenStream);
/**
* Test to see if this token from the stream should be held in a new
* TextFragment. Every time this is called, the TokenStream
* passed to start(String, TokenStream) will have been incremented.
*
*/
public boolean isNewFragment();
}

View File

@ -22,8 +22,10 @@ import java.util.ArrayList;
import java.util.Iterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.PriorityQueue;
/**
@ -214,8 +216,14 @@ public class Highlighter
{
ArrayList docFrags = new ArrayList();
StringBuffer newText=new StringBuffer();
TermAttribute termAtt = (TermAttribute) tokenStream.addAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) tokenStream.addAttribute(OffsetAttribute.class);
tokenStream.addAttribute(PositionIncrementAttribute.class);
tokenStream.reset();
TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
fragmentScorer.init(tokenStream);
fragmentScorer.startFragment(currentFrag);
docFrags.add(currentFrag);
@ -223,28 +231,27 @@ public class Highlighter
try
{
final Token reusableToken = new Token();
String tokenText;
int startOffset;
int endOffset;
int lastEndOffset = 0;
textFragmenter.start(text);
textFragmenter.start(text, tokenStream);
TokenGroup tokenGroup=new TokenGroup();
for (Token nextToken = tokenStream.next(reusableToken);
(nextToken!= null)&&(nextToken.startOffset()< maxDocCharsToAnalyze);
nextToken = tokenStream.next(reusableToken))
TokenGroup tokenGroup=new TokenGroup(tokenStream);
for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze);
next = tokenStream.incrementToken())
{
if( (nextToken.endOffset()>text.length())
if( (offsetAtt.endOffset()>text.length())
||
(nextToken.startOffset()>text.length())
(offsetAtt.startOffset()>text.length())
)
{
throw new InvalidTokenOffsetsException("Token "+nextToken.toString()
throw new InvalidTokenOffsetsException("Token "+ termAtt.term()
+" exceeds length of provided text sized "+text.length());
}
if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(nextToken)))
if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct()))
{
//the current token is distinct from previous tokens -
// markup the cached token group info
@ -260,7 +267,7 @@ public class Highlighter
tokenGroup.clear();
//check if current token marks the start of a new fragment
if(textFragmenter.isNewFragment(nextToken))
if(textFragmenter.isNewFragment())
{
currentFrag.setScore(fragmentScorer.getFragmentScore());
//record stats for a new fragment
@ -271,7 +278,7 @@ public class Highlighter
}
}
tokenGroup.addToken(nextToken,fragmentScorer.getTokenScore(nextToken));
tokenGroup.addToken(fragmentScorer.getTokenScore());
// if(lastEndOffset>maxDocBytesToAnalyze)
// {
@ -332,7 +339,7 @@ public class Highlighter
//The above code caused a problem as a result of Christoph Goller's 11th Sept 03
//fix to PriorityQueue. The correct method to use here is the new "insert" method
// USE ABOVE CODE IF THIS DOES NOT COMPILE!
fragQueue.insert(currentFrag);
fragQueue.insertWithOverflow(currentFrag);
}
//return the most relevant fragments

View File

@ -16,17 +16,18 @@ package org.apache.lucene.search.highlight;
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
* {@link Fragmenter} implementation which does not fragment the text.
* This is useful for highlighting the entire content of a document or field.
*/
public class NullFragmenter implements Fragmenter {
public void start(String s) {
public void start(String s, TokenStream tokenStream) {
}
public boolean isNewFragment(Token token) {
public boolean isNewFragment() {
return false;
}
}

View File

@ -1,4 +1,5 @@
package org.apache.lucene.search.highlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -19,134 +20,142 @@ package org.apache.lucene.search.highlight;
import java.util.HashMap;
import java.util.HashSet;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
/**
* {@link Scorer} implementation which scores text fragments by the number of unique query terms found.
* This class uses the {@link QueryTermExtractor} class to process determine the query terms and
* their boosts to be used.
* {@link Scorer} implementation which scores text fragments by the number of
* unique query terms found. This class uses the {@link QueryTermExtractor}
* class to process determine the query terms and their boosts to be used.
*/
//TODO: provide option to boost score of fragments near beginning of document
// TODO: provide option to boost score of fragments near beginning of document
// based on fragment.getFragNum()
public class QueryScorer implements Scorer
{
TextFragment currentTextFragment=null;
HashSet uniqueTermsInFragment;
float totalScore=0;
float maxTermWeight=0;
private HashMap termsToFind;
public class QueryScorer implements Scorer {
TextFragment currentTextFragment = null;
HashSet uniqueTermsInFragment;
/**
*
* @param query a Lucene query (ideally rewritten using query.rewrite
* before being passed to this class and the searcher)
*/
public QueryScorer(Query query)
{
this(QueryTermExtractor.getTerms(query));
}
/**
*
* @param query a Lucene query (ideally rewritten using query.rewrite
* before being passed to this class and the searcher)
* @param fieldName the Field name which is used to match Query terms
*/
public QueryScorer(Query query, String fieldName)
{
this(QueryTermExtractor.getTerms(query, false,fieldName));
}
float totalScore = 0;
float maxTermWeight = 0;
private HashMap termsToFind;
/**
*
* @param query a Lucene query (ideally rewritten using query.rewrite
* before being passed to this class and the searcher)
* @param reader used to compute IDF which can be used to a) score selected fragments better
* b) use graded highlights eg set font color intensity
* @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
*/
public QueryScorer(Query query, IndexReader reader, String fieldName)
{
this(QueryTermExtractor.getIdfWeightedTerms(query, reader, fieldName));
}
private TermAttribute termAtt;
public QueryScorer(WeightedTerm []weightedTerms )
{
termsToFind = new HashMap();
for (int i = 0; i < weightedTerms.length; i++)
{
WeightedTerm existingTerm=(WeightedTerm) termsToFind.get(weightedTerms[i].term);
if( (existingTerm==null) ||(existingTerm.weight<weightedTerms[i].weight) )
{
//if a term is defined more than once, always use the highest scoring weight
termsToFind.put(weightedTerms[i].term,weightedTerms[i]);
maxTermWeight=Math.max(maxTermWeight,weightedTerms[i].getWeight());
}
}
}
/**
*
* @param query a Lucene query (ideally rewritten using query.rewrite before
* being passed to this class and the searcher)
*/
public QueryScorer(Query query) {
this(QueryTermExtractor.getTerms(query));
}
/* (non-Javadoc)
* @see org.apache.lucene.search.highlight.FragmentScorer#startFragment(org.apache.lucene.search.highlight.TextFragment)
*/
public void startFragment(TextFragment newFragment)
{
uniqueTermsInFragment = new HashSet();
currentTextFragment=newFragment;
totalScore=0;
}
/* (non-Javadoc)
* @see org.apache.lucene.search.highlight.FragmentScorer#scoreToken(org.apache.lucene.analysis.Token)
*/
public float getTokenScore(Token token)
{
String termText=token.term();
WeightedTerm queryTerm=(WeightedTerm) termsToFind.get(termText);
if(queryTerm==null)
{
//not a query term - return
return 0;
}
//found a query term - is it unique in this doc?
if(!uniqueTermsInFragment.contains(termText))
{
totalScore+=queryTerm.getWeight();
uniqueTermsInFragment.add(termText);
}
return queryTerm.getWeight();
}
/* (non-Javadoc)
* @see org.apache.lucene.search.highlight.FragmentScorer#endFragment(org.apache.lucene.search.highlight.TextFragment)
*/
public float getFragmentScore()
{
return totalScore;
}
/**
*
* @param query a Lucene query (ideally rewritten using query.rewrite before
* being passed to this class and the searcher)
* @param fieldName the Field name which is used to match Query terms
*/
public QueryScorer(Query query, String fieldName) {
this(QueryTermExtractor.getTerms(query, false, fieldName));
}
/**
*
* @param query a Lucene query (ideally rewritten using query.rewrite before
* being passed to this class and the searcher)
* @param reader used to compute IDF which can be used to a) score selected
* fragments better b) use graded highlights eg set font color
* intensity
* @param fieldName the field on which Inverse Document Frequency (IDF)
* calculations are based
*/
public QueryScorer(Query query, IndexReader reader, String fieldName) {
this(QueryTermExtractor.getIdfWeightedTerms(query, reader, fieldName));
}
/* (non-Javadoc)
* @see org.apache.lucene.search.highlight.FragmentScorer#allFragmentsProcessed()
*/
public void allFragmentsProcessed()
{
//this class has no special operations to perform at end of processing
}
/**
*
* @return The highest weighted term (useful for passing to GradientFormatter to set
* top end of coloring scale.
*/
public float getMaxTermWeight()
{
return maxTermWeight;
public QueryScorer(WeightedTerm[] weightedTerms) {
termsToFind = new HashMap();
for (int i = 0; i < weightedTerms.length; i++) {
WeightedTerm existingTerm = (WeightedTerm) termsToFind
.get(weightedTerms[i].term);
if ((existingTerm == null)
|| (existingTerm.weight < weightedTerms[i].weight)) {
// if a term is defined more than once, always use the highest scoring
// weight
termsToFind.put(weightedTerms[i].term, weightedTerms[i]);
maxTermWeight = Math.max(maxTermWeight, weightedTerms[i].getWeight());
}
}
}
/**
 * Captures the TermAttribute of the stream about to be scored so that
 * getTokenScore() can read the current term text from it.
 *
 * @param tokenStream the stream whose tokens will be scored
 * @see org.apache.lucene.search.highlight.Scorer#init(org.apache.lucene.analysis.TokenStream)
 */
public void init(TokenStream tokenStream) {
  final Object attribute = tokenStream.getAttribute(TermAttribute.class);
  termAtt = (TermAttribute) attribute;
}
/*
* (non-Javadoc)
*
* @see
* org.apache.lucene.search.highlight.FragmentScorer#startFragment(org.apache
* .lucene.search.highlight.TextFragment)
*/
public void startFragment(TextFragment newFragment) {
uniqueTermsInFragment = new HashSet();
currentTextFragment = newFragment;
totalScore = 0;
}
/**
 * Scores the token the stream is currently positioned on. A term that
 * matches a query term adds its weight to the running fragment total the
 * first time it is seen in the current fragment; repeats are not added to
 * the total again.
 *
 * @return the weight of the matching query term, or 0 if the current term
 *         is not a query term
 * @see org.apache.lucene.search.highlight.Scorer#getTokenScore()
 */
public float getTokenScore() {
  final String term = termAtt.term();
  final WeightedTerm match = (WeightedTerm) termsToFind.get(term);
  if (match != null) {
    // add() returns true only when the term was not yet in the set, i.e. on
    // its first occurrence in this fragment
    if (uniqueTermsInFragment.add(term)) {
      totalScore += match.getWeight();
    }
    return match.getWeight();
  }
  // not a query term
  return 0;
}
/* (non-Javadoc)
* @see org.apache.lucene.search.highlight.Scorer#getFragmentScore()
*/
public float getFragmentScore() {
return totalScore;
}
/*
* (non-Javadoc)
*
* @see
* org.apache.lucene.search.highlight.FragmentScorer#allFragmentsProcessed()
*/
public void allFragmentsProcessed() {
// this class has no special operations to perform at end of processing
}
/**
*
* @return The highest weighted term (useful for passing to GradientFormatter
* to set top end of coloring scale.
*/
public float getMaxTermWeight() {
return maxTermWeight;
}
}

View File

@ -1,4 +1,5 @@
package org.apache.lucene.search.highlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -16,34 +17,45 @@ package org.apache.lucene.search.highlight;
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
* Adds to the score for a fragment based on its tokens
*/
public interface Scorer
{
/**
* called when a new fragment is started for consideration
* @param newFragment
*/
public void startFragment(TextFragment newFragment);
public interface Scorer {
/**
* Called for each token in the current fragment
* @param token The token to be scored
* @return a score which is passed to the Highlighter class to influence the mark-up of the text
* (this return value is NOT used to score the fragment)
*/
public float getTokenScore(Token token);
/**
* Called to init the Scorer with a TokenStream. You can grab references to
* the attributes you are interested in here and access them from
* getTokenScore().
*
* @param tokenStream
*/
public void init(TokenStream tokenStream);
/**
* Called when the highlighter has no more tokens for the current fragment - the scorer returns
* the weighting it has derived for the most recent fragment, typically based on the tokens
* passed to getTokenScore().
*
*/
public float getFragmentScore();
/**
* called when a new fragment is started for consideration
*
* @param newFragment
*/
public void startFragment(TextFragment newFragment);
/**
* Called for each token in the current fragment. The Highlighter will
* increment the TokenStream passed to init on every call.
*
* @return a score which is passed to the Highlighter class to influence the
* mark-up of the text (this return value is NOT used to score the
* fragment)
*/
public float getTokenScore();
/**
* Called when the highlighter has no more tokens for the current fragment -
* the scorer returns the weighting it has derived for the most recent
* fragment, typically based on the tokens passed to getTokenScore().
*
*/
public float getFragmentScore();
}

View File

@ -1,4 +1,5 @@
package org.apache.lucene.search.highlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -16,69 +17,64 @@ package org.apache.lucene.search.highlight;
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
/**
* {@link Fragmenter} implementation which breaks text up into same-size
* {@link Fragmenter} implementation which breaks text up into same-size
* fragments with no concerns over spotting sentence boundaries.
*/
public class SimpleFragmenter implements Fragmenter
{
private static final int DEFAULT_FRAGMENT_SIZE =100;
private int currentNumFrags;
private int fragmentSize;
public class SimpleFragmenter implements Fragmenter {
private static final int DEFAULT_FRAGMENT_SIZE = 100;
private int currentNumFrags;
private int fragmentSize;
private OffsetAttribute offsetAtt;
public SimpleFragmenter() {
this(DEFAULT_FRAGMENT_SIZE);
}
/**
*
* @param fragmentSize size in number of characters of each fragment
*/
public SimpleFragmenter(int fragmentSize) {
this.fragmentSize = fragmentSize;
}
public SimpleFragmenter()
{
this(DEFAULT_FRAGMENT_SIZE);
}
/**
 * Prepares for a new document: keeps a reference to the stream's
 * OffsetAttribute for use in isNewFragment() and resets the fragment
 * counter to 1.
 *
 * @param originalText the text being fragmented (not used by this implementation)
 * @param stream the token stream that will be iterated by the caller
 * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String, org.apache.lucene.analysis.TokenStream)
 */
public void start(String originalText, TokenStream stream) {
  final Object attribute = stream.getAttribute(OffsetAttribute.class);
  offsetAtt = (OffsetAttribute) attribute;
  currentNumFrags = 1;
}
/**
*
* @param fragmentSize size in number of characters of each fragment
*/
public SimpleFragmenter(int fragmentSize)
{
this.fragmentSize=fragmentSize;
}
/**
 * Decides whether the current token starts a new fragment. A new fragment
 * begins once the token's end offset reaches fragmentSize multiplied by the
 * number of fragments counted so far; the counter is then advanced.
 *
 * @return true if the current token should open a new fragment
 * @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment()
 */
public boolean isNewFragment() {
  final int fragmentBoundary = fragmentSize * currentNumFrags;
  if (offsetAtt.endOffset() < fragmentBoundary) {
    return false;
  }
  currentNumFrags++;
  return true;
}
/* (non-Javadoc)
* @see org.apache.lucene.search.highlight.TextFragmenter#start(java.lang.String)
*/
public void start(String originalText)
{
currentNumFrags=1;
}
/**
* @return size in number of characters of each fragment
*/
public int getFragmentSize() {
return fragmentSize;
}
/* (non-Javadoc)
* @see org.apache.lucene.search.highlight.TextFragmenter#isNewFragment(org.apache.lucene.analysis.Token)
*/
public boolean isNewFragment(Token token)
{
boolean isNewFrag= token.endOffset()>=(fragmentSize*currentNumFrags);
if(isNewFrag)
{
currentNumFrags++;
}
return isNewFrag;
}
/**
* @return size in number of characters of each fragment
*/
public int getFragmentSize()
{
return fragmentSize;
}
/**
* @param size size in characters of each fragment
*/
public void setFragmentSize(int size)
{
fragmentSize = size;
}
/**
* @param size size in characters of each fragment
*/
public void setFragmentSize(int size) {
fragmentSize = size;
}
}

View File

@ -17,10 +17,13 @@ package org.apache.lucene.search.highlight;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* {@link Fragmenter} implementation which breaks text up into same-size
@ -34,6 +37,9 @@ public class SimpleSpanFragmenter implements Fragmenter {
private SpanScorer spanScorer;
private int waitForPos = -1;
private int textSize;
private TermAttribute termAtt;
private PositionIncrementAttribute posIncAtt;
private OffsetAttribute offsetAtt;
/**
* @param spanscorer SpanScorer that was used to score hits
@ -50,12 +56,12 @@ public class SimpleSpanFragmenter implements Fragmenter {
this.fragmentSize = fragmentSize;
this.spanScorer = spanscorer;
}
/* (non-Javadoc)
* @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment(org.apache.lucene.analysis.Token)
* @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment()
*/
public boolean isNewFragment(Token token) {
position += token.getPositionIncrement();
public boolean isNewFragment() {
position += posIncAtt.getPositionIncrement();
if (waitForPos == position) {
waitForPos = -1;
@ -63,7 +69,7 @@ public class SimpleSpanFragmenter implements Fragmenter {
return false;
}
WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(token.term());
WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(termAtt.term());
if (wSpanTerm != null) {
List positionSpans = wSpanTerm.getPositionSpans();
@ -76,8 +82,8 @@ public class SimpleSpanFragmenter implements Fragmenter {
}
}
boolean isNewFrag = token.endOffset() >= (fragmentSize * currentNumFrags)
&& (textSize - token.endOffset()) >= (fragmentSize >>> 1);
boolean isNewFrag = offsetAtt.endOffset() >= (fragmentSize * currentNumFrags)
&& (textSize - offsetAtt.endOffset()) >= (fragmentSize >>> 1);
if (isNewFrag) {
currentNumFrags++;
@ -86,12 +92,16 @@ public class SimpleSpanFragmenter implements Fragmenter {
return isNewFrag;
}
/* (non-Javadoc)
* @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String)
* @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String, org.apache.lucene.analysis.TokenStream)
*/
public void start(String originalText) {
public void start(String originalText, TokenStream tokenStream) {
position = -1;
currentNumFrags = 1;
textSize = originalText.length();
termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
posIncAtt = (PositionIncrementAttribute) tokenStream.getAttribute(PositionIncrementAttribute.class);
offsetAtt = (OffsetAttribute) tokenStream.getAttribute(OffsetAttribute.class);
}
}

View File

@ -7,9 +7,10 @@ import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.ConstantScoreRangeQuery;
import org.apache.lucene.search.Query;
@ -26,6 +27,8 @@ public class SpanScorer implements Scorer {
private float maxTermWeight;
private int position = -1;
private String defaultField;
private TermAttribute termAtt;
private PositionIncrementAttribute posIncAtt;
private static boolean highlightCnstScrRngQuery;
/**
@ -176,9 +179,9 @@ public class SpanScorer implements Scorer {
* @see org.apache.lucene.search.highlight.Scorer#getTokenScore(org.apache.lucene.analysis.Token,
* int)
*/
public float getTokenScore(Token token) {
position += token.getPositionIncrement();
String termText = token.term();
public float getTokenScore() {
position += posIncAtt.getPositionIncrement();
String termText = termAtt.term();
WeightedSpanTerm weightedSpanTerm;
@ -203,6 +206,11 @@ public class SpanScorer implements Scorer {
return score;
}
/**
 * Captures the term and position-increment attributes of the stream about
 * to be scored; getTokenScore() reads both on every call.
 *
 * @param tokenStream the stream whose tokens will be scored
 */
public void init(TokenStream tokenStream) {
  final Object term = tokenStream.getAttribute(TermAttribute.class);
  termAtt = (TermAttribute) term;
  final Object posInc = tokenStream.getAttribute(PositionIncrementAttribute.class);
  posIncAtt = (PositionIncrementAttribute) posInc;
}
/**
* Retrieve the WeightedSpanTerm for the specified token. Useful for passing
* Span information to a Fragmenter.

View File

@ -1,4 +1,5 @@
package org.apache.lucene.search.highlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -15,118 +16,117 @@ package org.apache.lucene.search.highlight;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* One, or several overlapping tokens, along with the score(s) and the
* scope of the original text
* One, or several overlapping tokens, along with the score(s) and the scope of
* the original text
*/
public class TokenGroup
{
private static final int MAX_NUM_TOKENS_PER_GROUP=50;
Token [] tokens=new Token[MAX_NUM_TOKENS_PER_GROUP];
float [] scores=new float[MAX_NUM_TOKENS_PER_GROUP];
int numTokens=0;
int startOffset=0;
int endOffset=0;
float tot;
public class TokenGroup {
private static final int MAX_NUM_TOKENS_PER_GROUP = 50;
Token [] tokens=new Token[MAX_NUM_TOKENS_PER_GROUP];
float[] scores = new float[MAX_NUM_TOKENS_PER_GROUP];
int numTokens = 0;
int startOffset = 0;
int endOffset = 0;
float tot;
int matchStartOffset, matchEndOffset;
private OffsetAttribute offsetAtt;
private TermAttribute termAtt;
void addToken(Token token, float score)
{
if(numTokens < MAX_NUM_TOKENS_PER_GROUP)
{
if(numTokens==0)
{
startOffset=matchStartOffset=token.startOffset();
endOffset=matchEndOffset=token.endOffset();
tot += score;
}
else
{
startOffset=Math.min(startOffset,token.startOffset());
endOffset=Math.max(endOffset,token.endOffset());
if (score>0) {
if (tot==0) {
matchStartOffset=token.startOffset();
matchEndOffset=token.endOffset();
public TokenGroup(TokenStream tokenStream) {
offsetAtt = (OffsetAttribute) tokenStream.getAttribute(OffsetAttribute.class);
termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
}
void addToken(float score) {
if (numTokens < MAX_NUM_TOKENS_PER_GROUP) {
int termStartOffset = offsetAtt.startOffset();
int termEndOffset = offsetAtt.endOffset();
if (numTokens == 0) {
startOffset = matchStartOffset = termStartOffset;
endOffset = matchEndOffset = termEndOffset;
tot += score;
} else {
startOffset = Math.min(startOffset, termStartOffset);
endOffset = Math.max(endOffset, termEndOffset);
if (score > 0) {
if (tot == 0) {
matchStartOffset = offsetAtt.startOffset();
matchEndOffset = offsetAtt.endOffset();
} else {
matchStartOffset=Math.min(matchStartOffset,token.startOffset());
matchEndOffset=Math.max(matchEndOffset,token.endOffset());
matchStartOffset = Math.min(matchStartOffset, termStartOffset);
matchEndOffset = Math.max(matchEndOffset, termEndOffset);
}
tot+=score;
tot += score;
}
}
tokens[numTokens]= (Token) token.clone();
scores[numTokens]=score;
numTokens++;
}
}
Token token = new Token(termStartOffset, termEndOffset);
token.setTermBuffer(termAtt.term());
tokens[numTokens] = token;
scores[numTokens] = score;
numTokens++;
}
}
boolean isDistinct(Token token)
{
return token.startOffset()>=endOffset;
}
/**
 * Tests whether the token the stream is currently positioned on lies
 * outside this group, i.e. starts at or after the group's end offset.
 *
 * @return true if the current token does not overlap the group
 */
boolean isDistinct() {
  final int candidateStart = offsetAtt.startOffset();
  return candidateStart >= endOffset;
}
void clear() {
numTokens = 0;
tot = 0;
}
/**
 * Returns one of the tokens stored in this group.
 *
 * @param index a value between 0 and numTokens -1
 * @return the "n"th token
 */
public Token getToken(int index)
{
return tokens[index];
}
void clear()
{
numTokens=0;
tot=0;
}
/**
*
* @param index a value between 0 and numTokens -1
* @return the "n"th token
*/
public Token getToken(int index)
{
return tokens[index];
}
/**
*
* @param index a value between 0 and numTokens -1
* @return the "n"th score
*/
public float getScore(int index) {
return scores[index];
}
/**
*
* @param index a value between 0 and numTokens -1
* @return the "n"th score
*/
public float getScore(int index)
{
return scores[index];
}
/**
* @return the end position in the original text
*/
public int getEndOffset() {
return endOffset;
}
/**
* @return the end position in the original text
*/
public int getEndOffset()
{
return endOffset;
}
/**
* @return the number of tokens in this group
*/
public int getNumTokens() {
return numTokens;
}
/**
* @return the number of tokens in this group
*/
public int getNumTokens()
{
return numTokens;
}
/**
* @return the start position in the original text
*/
public int getStartOffset() {
return startOffset;
}
/**
* @return the start position in the original text
*/
public int getStartOffset()
{
return startOffset;
}
/**
* @return all tokens' scores summed up
*/
public float getTotalScore()
{
return tot;
}
/**
* @return all tokens' scores summed up
*/
public float getTotalScore() {
return tot;
}
}

View File

@ -29,6 +29,8 @@ import java.util.Comparator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermFreqVector;
@ -135,32 +137,45 @@ public class TokenSources
* @param tokenPositionsGuaranteedContiguous true if the token position numbers have no overlaps or gaps. If looking
* to eek out the last drops of performance, set to true. If in doubt, set to false.
*/
public static TokenStream getTokenStream(TermPositionVector tpv, boolean tokenPositionsGuaranteedContiguous)
{
public static TokenStream getTokenStream(TermPositionVector tpv, boolean tokenPositionsGuaranteedContiguous) {
//an object used to iterate across an array of tokens
class StoredTokenStream extends TokenStream
{
Token tokens[];
int currentToken=0;
StoredTokenStream(Token tokens[])
{
this.tokens=tokens;
class StoredTokenStream extends TokenStream {
Token tokens[];
int currentToken = 0;
TermAttribute termAtt;
OffsetAttribute offsetAtt;
StoredTokenStream(Token tokens[]) {
this.tokens = tokens;
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
public Token next(final Token reusableToken) {
System.out.println("next token");
assert reusableToken != null;
if (currentToken >= tokens.length) {
return null;
}
public Token next(final Token reusableToken)
{
assert reusableToken != null;
if(currentToken>=tokens.length)
{
return null;
}
return tokens[currentToken++];
}
}
return tokens[currentToken++];
}
/**
 * Advances to the next stored token and copies its term text and offsets
 * into this stream's term and offset attributes.
 *
 * @return true if a token was copied into the attributes, false once every
 *         stored token has been consumed
 */
public boolean incrementToken() throws IOException {
  // Stray debug output to stdout was removed here: it printed a line for
  // every token consumed.
  if (currentToken >= tokens.length) {
    return false;
  }
  Token token = tokens[currentToken++];
  termAtt.setTermBuffer(token.term());
  offsetAtt.setOffset(token.startOffset(), token.endOffset());
  return true;
}
}
//code to reconstruct the original sequence of Tokens
String[] terms=tpv.getTerms();
int[] freq=tpv.getTermFrequencies();
int totalTokens=0;
Token newToken = new Token();
for (int t = 0; t < freq.length; t++)
{
totalTokens+=freq[t];
@ -190,8 +205,9 @@ public class TokenSources
}
for (int tp = 0; tp < offsets.length; tp++)
{
newToken.reinit(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
unsortedTokens.add(newToken.clone());
Token token = new Token(offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
token.setTermBuffer(terms[t]);
unsortedTokens.add(token);
}
}
else
@ -204,8 +220,8 @@ public class TokenSources
//tokens stored with positions - can use this to index straight into sorted array
for (int tp = 0; tp < pos.length; tp++)
{
newToken.reinit(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
tokensInOriginalOrder[pos[tp]] = (Token) newToken.clone();
Token token = new Token(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
tokensInOriginalOrder[pos[tp]] = token;
}
}
}
@ -218,7 +234,7 @@ public class TokenSources
{
Token t1=(Token) o1;
Token t2=(Token) o2;
if(t1.startOffset()>t2.startOffset())
if(t1.startOffset()>t2.endOffset())
return 1;
if(t1.startOffset()<t2.startOffset())
return -1;

View File

@ -42,8 +42,8 @@ import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
@ -98,7 +98,7 @@ public class WeightedSpanTermExtractor {
private void extract(Query query, Map terms) throws IOException {
if (query instanceof BooleanQuery) {
BooleanClause[] queryClauses = ((BooleanQuery) query).getClauses();
for (int i = 0; i < queryClauses.length; i++) {
if (!queryClauses[i].isProhibited()) {
extract(queryClauses[i].getQuery(), terms);
@ -441,7 +441,7 @@ public class WeightedSpanTermExtractor {
* This class makes sure that if both position sensitive and insensitive
* versions of the same term are added, the position insensitive one wins.
*/
private class PositionCheckingMap extends HashMap {
static private class PositionCheckingMap extends HashMap {
public void putAll(Map m) {
Iterator it = m.keySet().iterator();

View File

@ -38,10 +38,14 @@ import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
@ -62,9 +66,8 @@ import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeFilter;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeFilter;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.BooleanClause.Occur;
@ -75,6 +78,7 @@ import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
@ -87,7 +91,7 @@ public class HighlighterTest extends TestCase implements Formatter {
static final String FIELD_NAME = "contents";
private Query query;
RAMDirectory ramDir;
public Searcher searcher = null;
public IndexSearcher searcher = null;
public Hits hits = null;
int numHighlights = 0;
Analyzer analyzer = new StandardAnalyzer();
@ -108,11 +112,40 @@ public class HighlighterTest extends TestCase implements Formatter {
super(arg0);
}
/**
 * Smoke test: parses the phrase query "very long", asks the searcher for up
 * to 10 hits and prints the best highlighted fragment of each one, using a
 * SpanScorer together with a SimpleSpanFragmenter.
 */
public void testHits() throws Exception {
  Analyzer simpleAnalyzer = new SimpleAnalyzer();
  query = new QueryParser(FIELD_NAME, simpleAnalyzer).parse("\"very long\"");
  searcher = new IndexSearcher(ramDir, false);

  TopDocs topDocs = searcher.search(query, 10);
  Highlighter highlighter = new Highlighter(null);

  for (int hit = 0; hit < topDocs.scoreDocs.length; hit++) {
    int docId = topDocs.scoreDocs[hit].doc;
    Document doc = searcher.doc(docId);
    String text = doc.get(FIELD_NAME);

    // The stream is cached because it is handed to both the scorer and the highlighter.
    CachingTokenFilter cached = new CachingTokenFilter(
        TokenSources.getAnyTokenStream(searcher.getIndexReader(), docId, FIELD_NAME, doc,
            simpleAnalyzer));

    SpanScorer spanScorer = new SpanScorer(query, FIELD_NAME, cached);
    Fragmenter spanFragmenter = new SimpleSpanFragmenter(spanScorer);
    highlighter.setFragmentScorer(spanScorer);
    highlighter.setTextFragmenter(spanFragmenter);

    System.out.println(highlighter.getBestFragment(cached, text));
  }
}
public void testHighlightingWithDefaultField() throws Exception {
String s1 = "I call our world Flatland, not because we call it so,";
QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer());
QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer(Version.LUCENE_CURRENT));
// Verify that a query against the default field results in text being
// highlighted
@ -144,7 +177,7 @@ public class HighlighterTest extends TestCase implements Formatter {
*/
private static String highlightField(Query query, String fieldName, String text)
throws IOException, InvalidTokenOffsetsException {
CachingTokenFilter tokenStream = new CachingTokenFilter(new StandardAnalyzer().tokenStream(
CachingTokenFilter tokenStream = new CachingTokenFilter(new StandardAnalyzer(Version.LUCENE_CURRENT).tokenStream(
fieldName, new StringReader(text)));
// Assuming "<B>", "</B>" used to highlight
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
@ -908,10 +941,12 @@ public class HighlighterTest extends TestCase implements Formatter {
Query query = parser.parse(srchkey);
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(s));
Highlighter highlighter = getHighlighter(query, null, tokenStream, HighlighterTest.this);
// Get 3 best fragments and separate with a "..."
tokenStream = analyzer.tokenStream(null, new StringReader(s));
String result = highlighter.getBestFragments(tokenStream, s, 3, "...");
String expectedResult = "<B>football</B>-<B>soccer</B> in the euro 2004 <B>footie</B> competition";
assertTrue("overlapping analyzer should handle highlights OK, expected:" + expectedResult
@ -1075,10 +1110,11 @@ public class HighlighterTest extends TestCase implements Formatter {
}
public void testUnRewrittenQuery() throws Exception {
TestHighlightRunner helper = new TestHighlightRunner() {
final TestHighlightRunner helper = new TestHighlightRunner() {
public void run() throws Exception {
numHighlights = 0;
SpanScorer.setHighlightCnstScrRngQuery(false);
// test to show how rewritten query can still be used
searcher = new IndexSearcher(ramDir);
Analyzer analyzer = new StandardAnalyzer();
@ -1154,13 +1190,17 @@ public class HighlighterTest extends TestCase implements Formatter {
// Intentionally empty: this stub does nothing when a new fragment starts.
public void startFragment(TextFragment newFragment) {
}
public float getTokenScore(Token token) {
public float getTokenScore() {
return 0;
}
// Stub: every fragment is reported with the same constant score of 1.
public float getFragmentScore() {
return 1;
}
// Intentionally empty: the supplied token stream is ignored by this stub.
public void init(TokenStream tokenStream) {
}
});
highlighter.setTextFragmenter(new SimpleFragmenter(2000));
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(rawDocContent));
@ -1292,27 +1332,44 @@ public class HighlighterTest extends TestCase implements Formatter {
return new TokenStream() {
Iterator iter;
List lst;
private TermAttribute termAtt;
private PositionIncrementAttribute posIncrAtt;
private OffsetAttribute offsetAtt;
{
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
lst = new ArrayList();
Token t;
t = createToken("hi", 0, 2);
t.setPositionIncrement(1);
lst.add(t);
t = createToken("hispeed", 0, 8);
t.setPositionIncrement(1);
lst.add(t);
t = createToken("speed", 3, 8);
t.setPositionIncrement(0);
lst.add(t);
t = createToken("10", 8, 10);
t.setPositionIncrement(1);
lst.add(t);
t = createToken("foo", 11, 14);
t.setPositionIncrement(1);
lst.add(t);
iter = lst.iterator();
}
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
return iter.hasNext() ? (Token) iter.next() : null;
/**
 * Publishes the next prepared token by copying its term, position increment
 * and offsets into this stream's attributes.
 *
 * @return false when the prepared token list is exhausted, true otherwise
 */
public boolean incrementToken() throws IOException {
  if (!iter.hasNext()) {
    return false;
  }
  Token current = (Token) iter.next();
  termAtt.setTermBuffer(current.term());
  posIncrAtt.setPositionIncrement(current.getPositionIncrement());
  offsetAtt.setOffset(current.startOffset(), current.endOffset());
  return true;
}
};
}
@ -1322,26 +1379,42 @@ public class HighlighterTest extends TestCase implements Formatter {
return new TokenStream() {
Iterator iter;
List lst;
private TermAttribute termAtt;
private PositionIncrementAttribute posIncrAtt;
private OffsetAttribute offsetAtt;
{
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
lst = new ArrayList();
Token t;
t = createToken("hispeed", 0, 8);
t.setPositionIncrement(1);
lst.add(t);
t = createToken("hi", 0, 2);
t.setPositionIncrement(0);
lst.add(t);
t = createToken("speed", 3, 8);
t.setPositionIncrement(1);
lst.add(t);
t = createToken("10", 8, 10);
t.setPositionIncrement(1);
lst.add(t);
t = createToken("foo", 11, 14);
t.setPositionIncrement(1);
lst.add(t);
iter = lst.iterator();
}
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
return iter.hasNext() ? (Token) iter.next() : null;
/**
 * Publishes the next prepared token by copying its term, position increment
 * and offsets into this stream's attributes.
 *
 * @return false when the prepared token list is exhausted, true otherwise
 */
public boolean incrementToken() throws IOException {
  if (!iter.hasNext()) {
    return false;
  }
  Token current = (Token) iter.next();
  termAtt.setTermBuffer(current.term());
  posIncrAtt.setPositionIncrement(current.getPositionIncrement());
  offsetAtt.setOffset(current.startOffset(), current.endOffset());
  return true;
}
};
}
@ -1611,7 +1684,11 @@ class SynonymAnalyzer extends Analyzer {
* java.io.Reader)
*/
public TokenStream tokenStream(String arg0, Reader arg1) {
return new SynonymTokenizer(new LowerCaseTokenizer(arg1), synonyms);
LowerCaseTokenizer stream = new LowerCaseTokenizer(arg1);
stream.addAttribute(TermAttribute.class);
stream.addAttribute(PositionIncrementAttribute.class);
stream.addAttribute(OffsetAttribute.class);
return new SynonymTokenizer(stream, synonyms);
}
}
@ -1622,47 +1699,70 @@ class SynonymAnalyzer extends Analyzer {
class SynonymTokenizer extends TokenStream {
private TokenStream realStream;
private Token currentRealToken = null;
private org.apache.lucene.analysis.Token cRealToken = null;
private Map synonyms;
StringTokenizer st = null;
private TermAttribute realTermAtt;
private PositionIncrementAttribute realPosIncrAtt;
private OffsetAttribute realOffsetAtt;
private TermAttribute termAtt;
private PositionIncrementAttribute posIncrAtt;
private OffsetAttribute offsetAtt;
/**
 * Wraps a token stream and sets up the attributes used to emit its tokens
 * followed by their synonym expansions.
 *
 * @param realStream the stream supplying the original tokens
 * @param synonyms   map from a term to a comma-separated String of its expansions
 */
public SynonymTokenizer(TokenStream realStream, Map synonyms) {
  this.realStream = realStream;
  this.synonyms = synonyms;
  // addAttribute returns the wrapped stream's existing attribute instance and
  // registers one if it is missing; getAttribute would instead throw
  // IllegalArgumentException for any attribute the caller had not pre-registered.
  realTermAtt = (TermAttribute) realStream.addAttribute(TermAttribute.class);
  realPosIncrAtt = (PositionIncrementAttribute) realStream.addAttribute(PositionIncrementAttribute.class);
  realOffsetAtt = (OffsetAttribute) realStream.addAttribute(OffsetAttribute.class);

  // Attributes of this stream, filled in by incrementToken().
  termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
  offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
public boolean incrementToken() throws IOException {
if (currentRealToken == null) {
Token nextRealToken = realStream.next(reusableToken);
if (nextRealToken == null) {
return null;
boolean next = realStream.incrementToken();
if (!next) {
return false;
}
String expansions = (String) synonyms.get(nextRealToken.term());
//Token nextRealToken = new Token(, offsetAtt.startOffset(), offsetAtt.endOffset());
termAtt.setTermBuffer(realTermAtt.term());
offsetAtt.setOffset(realOffsetAtt.startOffset(), realOffsetAtt.endOffset());
posIncrAtt.setPositionIncrement(realPosIncrAtt.getPositionIncrement());
String expansions = (String) synonyms.get(realTermAtt.term());
if (expansions == null) {
return nextRealToken;
return true;
}
st = new StringTokenizer(expansions, ",");
if (st.hasMoreTokens()) {
currentRealToken = (Token) nextRealToken.clone();
currentRealToken = new Token(realOffsetAtt.startOffset(), realOffsetAtt.endOffset());
currentRealToken.setTermBuffer(realTermAtt.term());
}
return currentRealToken;
return true;
} else {
reusableToken.reinit(st.nextToken(),
currentRealToken.startOffset(),
currentRealToken.endOffset());
reusableToken.setPositionIncrement(0);
String tok = st.nextToken();
termAtt.setTermBuffer(tok);
offsetAtt.setOffset(currentRealToken.startOffset(), currentRealToken.endOffset());
posIncrAtt.setPositionIncrement(0);
if (!st.hasMoreTokens()) {
currentRealToken = null;
st = null;
}
return reusableToken;
return true;
}
}
static abstract class TestHighlightRunner {
static final int STANDARD = 0;
static final int SPAN = 1;
int mode = STANDARD;
Fragmenter frag = new SimpleFragmenter(20);
public Highlighter getHighlighter(Query query, String fieldName, TokenStream stream,
Formatter formatter) {
@ -1725,7 +1825,7 @@ class SynonymTokenizer extends TokenStream {
if (mode == SPAN) {
((CachingTokenFilter) tokenStream).reset();
}
highlighter.setTextFragmenter(new SimpleFragmenter(20));
highlighter.setTextFragmenter(frag);
String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
fragmentSeparator);