From f73a4f432424878fefc11cdc3598f26a7006d270 Mon Sep 17 00:00:00 2001 From: Mark Robert Miller Date: Thu, 30 Jul 2009 22:00:47 +0000 Subject: [PATCH] LUCENE-1695: Update the Highlighter to use the new TokenStream API. This issue breaks backwards compatibility with some public classes. If you have implemented custom Fragmenters or Scorers, you will need to adjust them to work with the new TokenStream API. Rather than getting passed a Token at a time, you will be given a TokenStream to init your impl with - store the Attributes you are interested in locally and access them on each call to the method that used to pass a new Token. Look at the included updated impls for examples. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@799455 13f79535-47bb-0310-9956-ffa450edef68 --- contrib/CHANGES.txt | 10 +- .../lucene/search/highlight/Fragmenter.java | 39 +-- .../lucene/search/highlight/Highlighter.java | 39 +-- .../search/highlight/NullFragmenter.java | 7 +- .../lucene/search/highlight/QueryScorer.java | 243 +++++++++--------- .../lucene/search/highlight/Scorer.java | 58 +++-- .../search/highlight/SimpleFragmenter.java | 104 ++++---- .../highlight/SimpleSpanFragmenter.java | 32 ++- .../lucene/search/highlight/SpanScorer.java | 18 +- .../lucene/search/highlight/TokenGroup.java | 190 +++++++------- .../lucene/search/highlight/TokenSources.java | 66 +++-- .../highlight/WeightedSpanTermExtractor.java | 6 +- .../search/highlight/HighlighterTest.java | 158 +++++++++--- 13 files changed, 569 insertions(+), 401 deletions(-) diff --git a/contrib/CHANGES.txt b/contrib/CHANGES.txt index 4546df08a3f..09d756e365b 100644 --- a/contrib/CHANGES.txt +++ b/contrib/CHANGES.txt @@ -11,7 +11,12 @@ Changes in runtime behavior API Changes - (None) + 1. LUCENE-1695: Update the Highlighter to use the new TokenStream API. This issue breaks backwards + compatibility with some public classes. 
If you have implemented custom Fragmenters or Scorers, + you will need to adjust them to work with the new TokenStream API. Rather than getting passed a + Token at a time, you will be given a TokenStream to init your impl with - store the Attributes + you are interested in locally and access them on each call to the method that used to pass a new + Token. Look at the included updated impls for examples. (Mark Miller) Bug fixes @@ -41,9 +46,6 @@ Bug fixes 8. LUCENE-1491: EdgeNGramTokenFilter no longer stops on tokens shorter than minimum n-gram size. (Todd Teak via Otis Gospodnetic) - - 9. LUCENE-1752: Missing highlights when terms were repeated in separate, nested, boolean or - disjunction queries. (Koji Sekiguchi, Mark Miller) New features diff --git a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java index bab5554e617..1dc6a8f3193 100644 --- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java +++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java @@ -16,24 +16,31 @@ package org.apache.lucene.search.highlight; * limitations under the License. */ -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; /** - * Implements the policy for breaking text into multiple fragments for consideration - * by the {@link Highlighter} class. A sophisticated implementation may do this on the basis - * of detecting end of sentences in the text. + * Implements the policy for breaking text into multiple fragments for + * consideration by the {@link Highlighter} class. A sophisticated + * implementation may do this on the basis of detecting end of sentences in the + * text. 
*/ -public interface Fragmenter -{ - /** - * Initializes the Fragmenter - * @param originalText - */ - public void start(String originalText); +public interface Fragmenter { - /** - * Test to see if this token from the stream should be held in a new TextFragment - * @param nextToken - */ - public boolean isNewFragment(Token nextToken); + /** + * Initializes the Fragmenter. You can grab references to the Attributes you are + * interested in from tokenStream and then access the values in isNewFragment. + * + * @param originalText + * @param tokenStream + */ + public void start(String originalText, TokenStream tokenStream); + + + /** + * Test to see if this token from the stream should be held in a new + * TextFragment. Every time this is called, the TokenStream + * passed to start(String, TokenStream) will have been incremented. + * + */ + public boolean isNewFragment(); } diff --git a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java index c0473b5e617..80a53ae6da2 100644 --- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java +++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java @@ -22,8 +22,10 @@ import java.util.ArrayList; import java.util.Iterator; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.PriorityQueue; /** @@ -214,8 +216,14 @@ public class Highlighter { ArrayList docFrags = new ArrayList(); StringBuffer newText=new StringBuffer(); - + + TermAttribute termAtt = (TermAttribute) tokenStream.addAttribute(TermAttribute.class); + OffsetAttribute offsetAtt = 
(OffsetAttribute) tokenStream.addAttribute(OffsetAttribute.class); + tokenStream.addAttribute(PositionIncrementAttribute.class); + tokenStream.reset(); + TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size()); + fragmentScorer.init(tokenStream); fragmentScorer.startFragment(currentFrag); docFrags.add(currentFrag); @@ -223,28 +231,27 @@ public class Highlighter try { - final Token reusableToken = new Token(); + String tokenText; int startOffset; int endOffset; int lastEndOffset = 0; - textFragmenter.start(text); + textFragmenter.start(text, tokenStream); - TokenGroup tokenGroup=new TokenGroup(); - - for (Token nextToken = tokenStream.next(reusableToken); - (nextToken!= null)&&(nextToken.startOffset()< maxDocCharsToAnalyze); - nextToken = tokenStream.next(reusableToken)) + TokenGroup tokenGroup=new TokenGroup(tokenStream); + + for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze); + next = tokenStream.incrementToken()) { - if( (nextToken.endOffset()>text.length()) + if( (offsetAtt.endOffset()>text.length()) || - (nextToken.startOffset()>text.length()) + (offsetAtt.startOffset()>text.length()) ) { - throw new InvalidTokenOffsetsException("Token "+nextToken.toString() + throw new InvalidTokenOffsetsException("Token "+ termAtt.term() +" exceeds length of provided text sized "+text.length()); } - if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(nextToken))) + if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct())) { //the current token is distinct from previous tokens - // markup the cached token group info @@ -260,7 +267,7 @@ public class Highlighter tokenGroup.clear(); //check if current token marks the start of a new fragment - if(textFragmenter.isNewFragment(nextToken)) + if(textFragmenter.isNewFragment()) { currentFrag.setScore(fragmentScorer.getFragmentScore()); //record stats for a new fragment @@ -271,7 +278,7 @@ public class Highlighter } } - 
tokenGroup.addToken(nextToken,fragmentScorer.getTokenScore(nextToken)); + tokenGroup.addToken(fragmentScorer.getTokenScore()); // if(lastEndOffset>maxDocBytesToAnalyze) // { @@ -332,7 +339,7 @@ public class Highlighter //The above code caused a problem as a result of Christoph Goller's 11th Sept 03 //fix to PriorityQueue. The correct method to use here is the new "insert" method // USE ABOVE CODE IF THIS DOES NOT COMPILE! - fragQueue.insert(currentFrag); + fragQueue.insertWithOverflow(currentFrag); } //return the most relevant fragments diff --git a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java index 67a027655bb..2dd042c5afc 100644 --- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java +++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java @@ -16,17 +16,18 @@ package org.apache.lucene.search.highlight; * limitations under the License. */ -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; /** * {@link Fragmenter} implementation which does not fragment the text. * This is useful for highlighting the entire content of a document or field. 
*/ public class NullFragmenter implements Fragmenter { - public void start(String s) { + public void start(String s, TokenStream tokenStream) { } - public boolean isNewFragment(Token token) { + public boolean isNewFragment() { return false; } + } diff --git a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java index c6a9545bca2..578f35189c6 100644 --- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java +++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java @@ -1,4 +1,5 @@ package org.apache.lucene.search.highlight; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -19,134 +20,142 @@ package org.apache.lucene.search.highlight; import java.util.HashMap; import java.util.HashSet; -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Query; /** - * {@link Scorer} implementation which scores text fragments by the number of unique query terms found. - * This class uses the {@link QueryTermExtractor} class to process determine the query terms and - * their boosts to be used. + * {@link Scorer} implementation which scores text fragments by the number of + * unique query terms found. This class uses the {@link QueryTermExtractor} + * class to process determine the query terms and their boosts to be used. 
*/ -//TODO: provide option to boost score of fragments near beginning of document +// TODO: provide option to boost score of fragments near beginning of document // based on fragment.getFragNum() -public class QueryScorer implements Scorer -{ - TextFragment currentTextFragment=null; - HashSet uniqueTermsInFragment; - float totalScore=0; - float maxTermWeight=0; - private HashMap termsToFind; - +public class QueryScorer implements Scorer { + + TextFragment currentTextFragment = null; + HashSet uniqueTermsInFragment; - /** - * - * @param query a Lucene query (ideally rewritten using query.rewrite - * before being passed to this class and the searcher) - */ - public QueryScorer(Query query) - { - this(QueryTermExtractor.getTerms(query)); - } - - /** - * - * @param query a Lucene query (ideally rewritten using query.rewrite - * before being passed to this class and the searcher) - * @param fieldName the Field name which is used to match Query terms - */ - public QueryScorer(Query query, String fieldName) - { - this(QueryTermExtractor.getTerms(query, false,fieldName)); - } + float totalScore = 0; + float maxTermWeight = 0; + private HashMap termsToFind; - /** - * - * @param query a Lucene query (ideally rewritten using query.rewrite - * before being passed to this class and the searcher) - * @param reader used to compute IDF which can be used to a) score selected fragments better - * b) use graded highlights eg set font color intensity - * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based - */ - public QueryScorer(Query query, IndexReader reader, String fieldName) - { - this(QueryTermExtractor.getIdfWeightedTerms(query, reader, fieldName)); - } + private TermAttribute termAtt; - public QueryScorer(WeightedTerm []weightedTerms ) - { - termsToFind = new HashMap(); - for (int i = 0; i < weightedTerms.length; i++) - { - WeightedTerm existingTerm=(WeightedTerm) termsToFind.get(weightedTerms[i].term); - if( (existingTerm==null) 
||(existingTerm.weight= (fragmentSize * currentNumFrags); + if (isNewFrag) { + currentNumFrags++; + } + return isNewFrag; + } - /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.TextFragmenter#start(java.lang.String) - */ - public void start(String originalText) - { - currentNumFrags=1; - } + /** + * @return size in number of characters of each fragment + */ + public int getFragmentSize() { + return fragmentSize; + } - /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.TextFragmenter#isNewFragment(org.apache.lucene.analysis.Token) - */ - public boolean isNewFragment(Token token) - { - boolean isNewFrag= token.endOffset()>=(fragmentSize*currentNumFrags); - if(isNewFrag) - { - currentNumFrags++; - } - return isNewFrag; - } - - /** - * @return size in number of characters of each fragment - */ - public int getFragmentSize() - { - return fragmentSize; - } - - /** - * @param size size in characters of each fragment - */ - public void setFragmentSize(int size) - { - fragmentSize = size; - } + /** + * @param size size in characters of each fragment + */ + public void setFragmentSize(int size) { + fragmentSize = size; + } } diff --git a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java index ea9adae1eae..b522260af92 100644 --- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java +++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java @@ -17,10 +17,13 @@ package org.apache.lucene.search.highlight; * See the License for the specific language governing permissions and * limitations under the License. 
*/ -import org.apache.lucene.analysis.Token; - import java.util.List; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + /** * {@link Fragmenter} implementation which breaks text up into same-size @@ -34,6 +37,9 @@ public class SimpleSpanFragmenter implements Fragmenter { private SpanScorer spanScorer; private int waitForPos = -1; private int textSize; + private TermAttribute termAtt; + private PositionIncrementAttribute posIncAtt; + private OffsetAttribute offsetAtt; /** * @param spanscorer SpanScorer that was used to score hits @@ -50,12 +56,12 @@ public class SimpleSpanFragmenter implements Fragmenter { this.fragmentSize = fragmentSize; this.spanScorer = spanscorer; } - + /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment(org.apache.lucene.analysis.Token) + * @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment() */ - public boolean isNewFragment(Token token) { - position += token.getPositionIncrement(); + public boolean isNewFragment() { + position += posIncAtt.getPositionIncrement(); if (waitForPos == position) { waitForPos = -1; @@ -63,7 +69,7 @@ public class SimpleSpanFragmenter implements Fragmenter { return false; } - WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(token.term()); + WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(termAtt.term()); if (wSpanTerm != null) { List positionSpans = wSpanTerm.getPositionSpans(); @@ -76,8 +82,8 @@ public class SimpleSpanFragmenter implements Fragmenter { } } - boolean isNewFrag = token.endOffset() >= (fragmentSize * currentNumFrags) - && (textSize - token.endOffset()) >= (fragmentSize >>> 1); + boolean isNewFrag = offsetAtt.endOffset() >= (fragmentSize * currentNumFrags) + && (textSize - offsetAtt.endOffset()) >= (fragmentSize >>> 1); if 
(isNewFrag) { currentNumFrags++; @@ -86,12 +92,16 @@ public class SimpleSpanFragmenter implements Fragmenter { return isNewFrag; } + /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String) + * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String, org.apache.lucene.analysis.TokenStream) */ - public void start(String originalText) { + public void start(String originalText, TokenStream tokenStream) { position = -1; currentNumFrags = 1; textSize = originalText.length(); + termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class); + posIncAtt = (PositionIncrementAttribute) tokenStream.getAttribute(PositionIncrementAttribute.class); + offsetAtt = (OffsetAttribute) tokenStream.getAttribute(OffsetAttribute.class); } } diff --git a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java index 6aaa4fea4e9..2280178dd51 100644 --- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java +++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java @@ -7,9 +7,10 @@ import java.util.Map; import java.util.Set; import org.apache.lucene.analysis.CachingTokenFilter; -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.search.ConstantScoreRangeQuery; import org.apache.lucene.search.Query; @@ -26,6 +27,8 @@ public class SpanScorer implements Scorer { private float maxTermWeight; private int position = -1; private String defaultField; + private TermAttribute termAtt; + private PositionIncrementAttribute posIncAtt; private static boolean highlightCnstScrRngQuery; /** @@ -176,9 +179,9 @@ public class SpanScorer 
implements Scorer { * @see org.apache.lucene.search.highlight.Scorer#getTokenScore(org.apache.lucene.analysis.Token, * int) */ - public float getTokenScore(Token token) { - position += token.getPositionIncrement(); - String termText = token.term(); + public float getTokenScore() { + position += posIncAtt.getPositionIncrement(); + String termText = termAtt.term(); WeightedSpanTerm weightedSpanTerm; @@ -203,6 +206,11 @@ public class SpanScorer implements Scorer { return score; } + public void init(TokenStream tokenStream) { + termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class); + posIncAtt = (PositionIncrementAttribute) tokenStream.getAttribute(PositionIncrementAttribute.class); + } + /** * Retrieve the WeightedSpanTerm for the specified token. Useful for passing * Span information to a Fragmenter. diff --git a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java index 280192a984e..03dc523f002 100644 --- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java +++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java @@ -1,4 +1,5 @@ package org.apache.lucene.search.highlight; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -15,118 +16,117 @@ package org.apache.lucene.search.highlight; * See the License for the specific language governing permissions and * limitations under the License. 
*/ + import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** - * One, or several overlapping tokens, along with the score(s) and the - * scope of the original text + * One, or several overlapping tokens, along with the score(s) and the scope of + * the original text */ -public class TokenGroup -{ - - private static final int MAX_NUM_TOKENS_PER_GROUP=50; - Token [] tokens=new Token[MAX_NUM_TOKENS_PER_GROUP]; - float [] scores=new float[MAX_NUM_TOKENS_PER_GROUP]; - int numTokens=0; - int startOffset=0; - int endOffset=0; - float tot; +public class TokenGroup { + private static final int MAX_NUM_TOKENS_PER_GROUP = 50; + Token [] tokens=new Token[MAX_NUM_TOKENS_PER_GROUP]; + float[] scores = new float[MAX_NUM_TOKENS_PER_GROUP]; + int numTokens = 0; + int startOffset = 0; + int endOffset = 0; + float tot; int matchStartOffset, matchEndOffset; + private OffsetAttribute offsetAtt; + private TermAttribute termAtt; - void addToken(Token token, float score) - { - if(numTokens < MAX_NUM_TOKENS_PER_GROUP) - { - if(numTokens==0) - { - startOffset=matchStartOffset=token.startOffset(); - endOffset=matchEndOffset=token.endOffset(); - tot += score; - } - else - { - startOffset=Math.min(startOffset,token.startOffset()); - endOffset=Math.max(endOffset,token.endOffset()); - if (score>0) { - if (tot==0) { - matchStartOffset=token.startOffset(); - matchEndOffset=token.endOffset(); + public TokenGroup(TokenStream tokenStream) { + offsetAtt = (OffsetAttribute) tokenStream.getAttribute(OffsetAttribute.class); + termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class); + } + + void addToken(float score) { + if (numTokens < MAX_NUM_TOKENS_PER_GROUP) { + int termStartOffset = offsetAtt.startOffset(); + int termEndOffset = offsetAtt.endOffset(); + if (numTokens == 0) { + startOffset = matchStartOffset = 
termStartOffset; + endOffset = matchEndOffset = termEndOffset; + tot += score; + } else { + startOffset = Math.min(startOffset, termStartOffset); + endOffset = Math.max(endOffset, termEndOffset); + if (score > 0) { + if (tot == 0) { + matchStartOffset = offsetAtt.startOffset(); + matchEndOffset = offsetAtt.endOffset(); } else { - matchStartOffset=Math.min(matchStartOffset,token.startOffset()); - matchEndOffset=Math.max(matchEndOffset,token.endOffset()); + matchStartOffset = Math.min(matchStartOffset, termStartOffset); + matchEndOffset = Math.max(matchEndOffset, termEndOffset); } - tot+=score; + tot += score; } } - tokens[numTokens]= (Token) token.clone(); - scores[numTokens]=score; - numTokens++; - } - } + Token token = new Token(termStartOffset, termEndOffset); + token.setTermBuffer(termAtt.term()); + tokens[numTokens] = token; + scores[numTokens] = score; + numTokens++; + } + } - boolean isDistinct(Token token) - { - return token.startOffset()>=endOffset; - } + boolean isDistinct() { + return offsetAtt.startOffset() >= endOffset; + } + void clear() { + numTokens = 0; + tot = 0; + } + + /* + * @param index a value between 0 and numTokens -1 + * @return the "n"th token + */ + public Token getToken(int index) + { + return tokens[index]; + } - void clear() - { - numTokens=0; - tot=0; - } - - /** - * - * @param index a value between 0 and numTokens -1 - * @return the "n"th token - */ - public Token getToken(int index) - { - return tokens[index]; - } + /** + * + * @param index a value between 0 and numTokens -1 + * @return the "n"th score + */ + public float getScore(int index) { + return scores[index]; + } - /** - * - * @param index a value between 0 and numTokens -1 - * @return the "n"th score - */ - public float getScore(int index) - { - return scores[index]; - } + /** + * @return the end position in the original text + */ + public int getEndOffset() { + return endOffset; + } - /** - * @return the end position in the original text - */ - public int getEndOffset() - 
{ - return endOffset; - } + /** + * @return the number of tokens in this group + */ + public int getNumTokens() { + return numTokens; + } - /** - * @return the number of tokens in this group - */ - public int getNumTokens() - { - return numTokens; - } + /** + * @return the start position in the original text + */ + public int getStartOffset() { + return startOffset; + } - /** - * @return the start position in the original text - */ - public int getStartOffset() - { - return startOffset; - } - - /** - * @return all tokens' scores summed up - */ - public float getTotalScore() - { - return tot; - } + /** + * @return all tokens' scores summed up + */ + public float getTotalScore() { + return tot; + } } diff --git a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java index 1dba0675008..f3660e8fe6c 100644 --- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java +++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java @@ -29,6 +29,8 @@ import java.util.Comparator; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.TermFreqVector; @@ -135,32 +137,45 @@ public class TokenSources * @param tokenPositionsGuaranteedContiguous true if the token position numbers have no overlaps or gaps. If looking * to eek out the last drops of performance, set to true. If in doubt, set to false. 
*/ - public static TokenStream getTokenStream(TermPositionVector tpv, boolean tokenPositionsGuaranteedContiguous) - { + public static TokenStream getTokenStream(TermPositionVector tpv, boolean tokenPositionsGuaranteedContiguous) { //an object used to iterate across an array of tokens - class StoredTokenStream extends TokenStream - { - Token tokens[]; - int currentToken=0; - StoredTokenStream(Token tokens[]) - { - this.tokens=tokens; + class StoredTokenStream extends TokenStream { + Token tokens[]; + int currentToken = 0; + TermAttribute termAtt; + OffsetAttribute offsetAtt; + + StoredTokenStream(Token tokens[]) { + this.tokens = tokens; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + } + + public Token next(final Token reusableToken) { + System.out.println("next token"); + assert reusableToken != null; + if (currentToken >= tokens.length) { + return null; } - public Token next(final Token reusableToken) - { - assert reusableToken != null; - if(currentToken>=tokens.length) - { - return null; - } - return tokens[currentToken++]; - } - } + return tokens[currentToken++]; + } + + public boolean incrementToken() throws IOException { + System.out.println("inc token"); + if (currentToken >= tokens.length) { + return false; + } + Token token = tokens[currentToken++]; + termAtt.setTermBuffer(token.term()); + offsetAtt.setOffset(token.startOffset(), token.endOffset()); + return true; + } + } //code to reconstruct the original sequence of Tokens String[] terms=tpv.getTerms(); int[] freq=tpv.getTermFrequencies(); int totalTokens=0; - Token newToken = new Token(); + for (int t = 0; t < freq.length; t++) { totalTokens+=freq[t]; @@ -190,8 +205,9 @@ public class TokenSources } for (int tp = 0; tp < offsets.length; tp++) { - newToken.reinit(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset()); - unsortedTokens.add(newToken.clone()); + Token token = new 
Token(offsets[tp].getStartOffset(), offsets[tp].getEndOffset()); + token.setTermBuffer(terms[t]); + unsortedTokens.add(token); } } else @@ -204,8 +220,8 @@ public class TokenSources //tokens stored with positions - can use this to index straight into sorted array for (int tp = 0; tp < pos.length; tp++) { - newToken.reinit(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset()); - tokensInOriginalOrder[pos[tp]] = (Token) newToken.clone(); + Token token = new Token(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset()); + tokensInOriginalOrder[pos[tp]] = token; } } } @@ -218,7 +234,7 @@ public class TokenSources { Token t1=(Token) o1; Token t2=(Token) o2; - if(t1.startOffset()>t2.startOffset()) + if(t1.startOffset()>t2.endOffset()) return 1; if(t1.startOffset()", "" used to highlight SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(); @@ -908,10 +941,12 @@ public class HighlighterTest extends TestCase implements Formatter { Query query = parser.parse(srchkey); TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(s)); + Highlighter highlighter = getHighlighter(query, null, tokenStream, HighlighterTest.this); // Get 3 best fragments and seperate with a "..." 
tokenStream = analyzer.tokenStream(null, new StringReader(s)); + String result = highlighter.getBestFragments(tokenStream, s, 3, "..."); String expectedResult = "football-soccer in the euro 2004 footie competition"; assertTrue("overlapping analyzer should handle highlights OK, expected:" + expectedResult @@ -1075,10 +1110,11 @@ public class HighlighterTest extends TestCase implements Formatter { } public void testUnRewrittenQuery() throws Exception { - TestHighlightRunner helper = new TestHighlightRunner() { + final TestHighlightRunner helper = new TestHighlightRunner() { public void run() throws Exception { numHighlights = 0; + SpanScorer.setHighlightCnstScrRngQuery(false); // test to show how rewritten query can still be used searcher = new IndexSearcher(ramDir); Analyzer analyzer = new StandardAnalyzer(); @@ -1154,13 +1190,17 @@ public class HighlighterTest extends TestCase implements Formatter { public void startFragment(TextFragment newFragment) { } - public float getTokenScore(Token token) { + public float getTokenScore() { return 0; } public float getFragmentScore() { return 1; } + + public void init(TokenStream tokenStream) { + + } }); highlighter.setTextFragmenter(new SimpleFragmenter(2000)); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(rawDocContent)); @@ -1292,27 +1332,44 @@ public class HighlighterTest extends TestCase implements Formatter { return new TokenStream() { Iterator iter; List lst; + private TermAttribute termAtt; + private PositionIncrementAttribute posIncrAtt; + private OffsetAttribute offsetAtt; { + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); lst = new ArrayList(); Token t; t = createToken("hi", 0, 2); + t.setPositionIncrement(1); lst.add(t); t = createToken("hispeed", 0, 8); + t.setPositionIncrement(1); lst.add(t); t = 
createToken("speed", 3, 8); t.setPositionIncrement(0); lst.add(t); t = createToken("10", 8, 10); + t.setPositionIncrement(1); lst.add(t); t = createToken("foo", 11, 14); + t.setPositionIncrement(1); lst.add(t); iter = lst.iterator(); } - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - return iter.hasNext() ? (Token) iter.next() : null; + public boolean incrementToken() throws IOException { + if(iter.hasNext()) { + Token token = (Token) iter.next(); + termAtt.setTermBuffer(token.term()); + posIncrAtt.setPositionIncrement(token.getPositionIncrement()); + offsetAtt.setOffset(token.startOffset(), token.endOffset()); + return true; + } + return false; } + }; } @@ -1322,26 +1379,42 @@ public class HighlighterTest extends TestCase implements Formatter { return new TokenStream() { Iterator iter; List lst; + private TermAttribute termAtt; + private PositionIncrementAttribute posIncrAtt; + private OffsetAttribute offsetAtt; { + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); lst = new ArrayList(); Token t; t = createToken("hispeed", 0, 8); + t.setPositionIncrement(1); lst.add(t); t = createToken("hi", 0, 2); t.setPositionIncrement(0); lst.add(t); t = createToken("speed", 3, 8); + t.setPositionIncrement(1); lst.add(t); t = createToken("10", 8, 10); + t.setPositionIncrement(1); lst.add(t); t = createToken("foo", 11, 14); + t.setPositionIncrement(1); lst.add(t); iter = lst.iterator(); } - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - return iter.hasNext() ? 
(Token) iter.next() : null; + public boolean incrementToken() throws IOException { + if(iter.hasNext()) { + Token token = (Token) iter.next(); + termAtt.setTermBuffer(token.term()); + posIncrAtt.setPositionIncrement(token.getPositionIncrement()); + offsetAtt.setOffset(token.startOffset(), token.endOffset()); + return true; + } + return false; } }; } @@ -1611,7 +1684,11 @@ class SynonymAnalyzer extends Analyzer { * java.io.Reader) */ public TokenStream tokenStream(String arg0, Reader arg1) { - return new SynonymTokenizer(new LowerCaseTokenizer(arg1), synonyms); + LowerCaseTokenizer stream = new LowerCaseTokenizer(arg1); + stream.addAttribute(TermAttribute.class); + stream.addAttribute(PositionIncrementAttribute.class); + stream.addAttribute(OffsetAttribute.class); + return new SynonymTokenizer(stream, synonyms); } } @@ -1622,47 +1699,70 @@ class SynonymAnalyzer extends Analyzer { class SynonymTokenizer extends TokenStream { private TokenStream realStream; private Token currentRealToken = null; + private org.apache.lucene.analysis.Token cRealToken = null; private Map synonyms; StringTokenizer st = null; + private TermAttribute realTermAtt; + private PositionIncrementAttribute realPosIncrAtt; + private OffsetAttribute realOffsetAtt; + private TermAttribute termAtt; + private PositionIncrementAttribute posIncrAtt; + private OffsetAttribute offsetAtt; public SynonymTokenizer(TokenStream realStream, Map synonyms) { this.realStream = realStream; this.synonyms = synonyms; + realTermAtt = (TermAttribute) realStream.getAttribute(TermAttribute.class); + realPosIncrAtt = (PositionIncrementAttribute) realStream.getAttribute(PositionIncrementAttribute.class); + realOffsetAtt = (OffsetAttribute) realStream.getAttribute(OffsetAttribute.class); + + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); } - public 
Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; + public boolean incrementToken() throws IOException { + if (currentRealToken == null) { - Token nextRealToken = realStream.next(reusableToken); - if (nextRealToken == null) { - return null; + boolean next = realStream.incrementToken(); + if (!next) { + return false; } - String expansions = (String) synonyms.get(nextRealToken.term()); + //Token nextRealToken = new Token(, offsetAtt.startOffset(), offsetAtt.endOffset()); + termAtt.setTermBuffer(realTermAtt.term()); + offsetAtt.setOffset(realOffsetAtt.startOffset(), realOffsetAtt.endOffset()); + posIncrAtt.setPositionIncrement(realPosIncrAtt.getPositionIncrement()); + + String expansions = (String) synonyms.get(realTermAtt.term()); if (expansions == null) { - return nextRealToken; + return true; } st = new StringTokenizer(expansions, ","); if (st.hasMoreTokens()) { - currentRealToken = (Token) nextRealToken.clone(); + currentRealToken = new Token(realOffsetAtt.startOffset(), realOffsetAtt.endOffset()); + currentRealToken.setTermBuffer(realTermAtt.term()); } - return currentRealToken; + + return true; } else { - reusableToken.reinit(st.nextToken(), - currentRealToken.startOffset(), - currentRealToken.endOffset()); - reusableToken.setPositionIncrement(0); + String tok = st.nextToken(); + termAtt.setTermBuffer(tok); + offsetAtt.setOffset(currentRealToken.startOffset(), currentRealToken.endOffset()); + posIncrAtt.setPositionIncrement(0); if (!st.hasMoreTokens()) { currentRealToken = null; st = null; } - return reusableToken; + return true; } + } static abstract class TestHighlightRunner { static final int STANDARD = 0; static final int SPAN = 1; int mode = STANDARD; + Fragmenter frag = new SimpleFragmenter(20); public Highlighter getHighlighter(Query query, String fieldName, TokenStream stream, Formatter formatter) { @@ -1725,7 +1825,7 @@ class SynonymTokenizer extends TokenStream { if (mode == SPAN) { ((CachingTokenFilter) 
tokenStream).reset(); } - highlighter.setTextFragmenter(new SimpleFragmenter(20)); + highlighter.setTextFragmenter(frag); String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, fragmentSeparator);