diff --git a/contrib/CHANGES.txt b/contrib/CHANGES.txt index 4546df08a3f..09d756e365b 100644 --- a/contrib/CHANGES.txt +++ b/contrib/CHANGES.txt @@ -11,7 +11,12 @@ Changes in runtime behavior API Changes - (None) + 1. LUCENE-1695: Update the Highlighter to use the new TokenStream API. This issue breaks backwards + compatibility with some public classes. If you have implemented custom Fragmenters or Scorers, + you will need to adjust them to work with the new TokenStream API. Rather than getting passed a + Token at a time, you will be given a TokenStream to init your impl with - store the Attributes + you are interested in locally and access them on each call to the method that used to pass a new + Token. Look at the included updated impls for examples. (Mark Miller) Bug fixes @@ -41,9 +46,6 @@ Bug fixes 8. LUCENE-1491: EdgeNGramTokenFilter no longer stops on tokens shorter than minimum n-gram size. (Todd Teak via Otis Gospodnetic) - - 9. LUCENE-1752: Missing highlights when terms were repeated in separate, nested, boolean or - disjunction queries. (Koji Sekiguchi, Mark Miller) New features diff --git a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java index bab5554e617..1dc6a8f3193 100644 --- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java +++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java @@ -16,24 +16,31 @@ package org.apache.lucene.search.highlight; * limitations under the License. */ -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; /** - * Implements the policy for breaking text into multiple fragments for consideration - * by the {@link Highlighter} class. A sophisticated implementation may do this on the basis - * of detecting end of sentences in the text. 
+ * Implements the policy for breaking text into multiple fragments for + * consideration by the {@link Highlighter} class. A sophisticated + * implementation may do this on the basis of detecting end of sentences in the + * text. */ -public interface Fragmenter -{ - /** - * Initializes the Fragmenter - * @param originalText - */ - public void start(String originalText); +public interface Fragmenter { - /** - * Test to see if this token from the stream should be held in a new TextFragment - * @param nextToken - */ - public boolean isNewFragment(Token nextToken); + /** + * Initializes the Fragmenter. You can grab references to the Attributes you are + * interested in from tokenStream and then access the values in isNewFragment. + * + * @param originalText + * @param tokenStream + */ + public void start(String originalText, TokenStream tokenStream); + + + /** + * Test to see if this token from the stream should be held in a new + * TextFragment. Every time this is called, the TokenStream + * passed to start(String, TokenStream) will have been incremented. 
+ * + */ + public boolean isNewFragment(); } diff --git a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java index c0473b5e617..80a53ae6da2 100644 --- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java +++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java @@ -22,8 +22,10 @@ import java.util.ArrayList; import java.util.Iterator; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.PriorityQueue; /** @@ -214,8 +216,14 @@ public class Highlighter { ArrayList docFrags = new ArrayList(); StringBuffer newText=new StringBuffer(); - + + TermAttribute termAtt = (TermAttribute) tokenStream.addAttribute(TermAttribute.class); + OffsetAttribute offsetAtt = (OffsetAttribute) tokenStream.addAttribute(OffsetAttribute.class); + tokenStream.addAttribute(PositionIncrementAttribute.class); + tokenStream.reset(); + TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size()); + fragmentScorer.init(tokenStream); fragmentScorer.startFragment(currentFrag); docFrags.add(currentFrag); @@ -223,28 +231,27 @@ public class Highlighter try { - final Token reusableToken = new Token(); + String tokenText; int startOffset; int endOffset; int lastEndOffset = 0; - textFragmenter.start(text); + textFragmenter.start(text, tokenStream); - TokenGroup tokenGroup=new TokenGroup(); - - for (Token nextToken = tokenStream.next(reusableToken); - (nextToken!= null)&&(nextToken.startOffset()< maxDocCharsToAnalyze); - nextToken = tokenStream.next(reusableToken)) + TokenGroup tokenGroup=new 
TokenGroup(tokenStream); + + for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze); + next = tokenStream.incrementToken()) { - if( (nextToken.endOffset()>text.length()) + if( (offsetAtt.endOffset()>text.length()) || - (nextToken.startOffset()>text.length()) + (offsetAtt.startOffset()>text.length()) ) { - throw new InvalidTokenOffsetsException("Token "+nextToken.toString() + throw new InvalidTokenOffsetsException("Token "+ termAtt.term() +" exceeds length of provided text sized "+text.length()); } - if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(nextToken))) + if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct())) { //the current token is distinct from previous tokens - // markup the cached token group info @@ -260,7 +267,7 @@ public class Highlighter tokenGroup.clear(); //check if current token marks the start of a new fragment - if(textFragmenter.isNewFragment(nextToken)) + if(textFragmenter.isNewFragment()) { currentFrag.setScore(fragmentScorer.getFragmentScore()); //record stats for a new fragment @@ -271,7 +278,7 @@ public class Highlighter } } - tokenGroup.addToken(nextToken,fragmentScorer.getTokenScore(nextToken)); + tokenGroup.addToken(fragmentScorer.getTokenScore()); // if(lastEndOffset>maxDocBytesToAnalyze) // { @@ -332,7 +339,7 @@ public class Highlighter //The above code caused a problem as a result of Christoph Goller's 11th Sept 03 //fix to PriorityQueue. The correct method to use here is the new "insert" method // USE ABOVE CODE IF THIS DOES NOT COMPILE! 
- fragQueue.insert(currentFrag); + fragQueue.insertWithOverflow(currentFrag); } //return the most relevant fragments diff --git a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java index 67a027655bb..2dd042c5afc 100644 --- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java +++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java @@ -16,17 +16,18 @@ package org.apache.lucene.search.highlight; * limitations under the License. */ -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; /** * {@link Fragmenter} implementation which does not fragment the text. * This is useful for highlighting the entire content of a document or field. */ public class NullFragmenter implements Fragmenter { - public void start(String s) { + public void start(String s, TokenStream tokenStream) { } - public boolean isNewFragment(Token token) { + public boolean isNewFragment() { return false; } + } diff --git a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java index c6a9545bca2..578f35189c6 100644 --- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java +++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java @@ -1,4 +1,5 @@ package org.apache.lucene.search.highlight; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with @@ -19,134 +20,142 @@ package org.apache.lucene.search.highlight; import java.util.HashMap; import java.util.HashSet; -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Query; /** - * {@link Scorer} implementation which scores text fragments by the number of unique query terms found. - * This class uses the {@link QueryTermExtractor} class to process determine the query terms and - * their boosts to be used. + * {@link Scorer} implementation which scores text fragments by the number of + * unique query terms found. This class uses the {@link QueryTermExtractor} + * class to process determine the query terms and their boosts to be used. */ -//TODO: provide option to boost score of fragments near beginning of document +// TODO: provide option to boost score of fragments near beginning of document // based on fragment.getFragNum() -public class QueryScorer implements Scorer -{ - TextFragment currentTextFragment=null; - HashSet uniqueTermsInFragment; - float totalScore=0; - float maxTermWeight=0; - private HashMap termsToFind; - +public class QueryScorer implements Scorer { + + TextFragment currentTextFragment = null; + HashSet uniqueTermsInFragment; - /** - * - * @param query a Lucene query (ideally rewritten using query.rewrite - * before being passed to this class and the searcher) - */ - public QueryScorer(Query query) - { - this(QueryTermExtractor.getTerms(query)); - } - - /** - * - * @param query a Lucene query (ideally rewritten using query.rewrite - * before being passed to this class and the searcher) - * @param fieldName the Field name which is used to match Query terms - */ - public QueryScorer(Query query, String fieldName) - { - this(QueryTermExtractor.getTerms(query, false,fieldName)); - } + float totalScore = 0; + float maxTermWeight = 0; + 
private HashMap termsToFind; - /** - * - * @param query a Lucene query (ideally rewritten using query.rewrite - * before being passed to this class and the searcher) - * @param reader used to compute IDF which can be used to a) score selected fragments better - * b) use graded highlights eg set font color intensity - * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based - */ - public QueryScorer(Query query, IndexReader reader, String fieldName) - { - this(QueryTermExtractor.getIdfWeightedTerms(query, reader, fieldName)); - } + private TermAttribute termAtt; - public QueryScorer(WeightedTerm []weightedTerms ) - { - termsToFind = new HashMap(); - for (int i = 0; i < weightedTerms.length; i++) - { - WeightedTerm existingTerm=(WeightedTerm) termsToFind.get(weightedTerms[i].term); - if( (existingTerm==null) ||(existingTerm.weight= (fragmentSize * currentNumFrags); + if (isNewFrag) { + currentNumFrags++; + } + return isNewFrag; + } - /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.TextFragmenter#start(java.lang.String) - */ - public void start(String originalText) - { - currentNumFrags=1; - } + /** + * @return size in number of characters of each fragment + */ + public int getFragmentSize() { + return fragmentSize; + } - /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.TextFragmenter#isNewFragment(org.apache.lucene.analysis.Token) - */ - public boolean isNewFragment(Token token) - { - boolean isNewFrag= token.endOffset()>=(fragmentSize*currentNumFrags); - if(isNewFrag) - { - currentNumFrags++; - } - return isNewFrag; - } - - /** - * @return size in number of characters of each fragment - */ - public int getFragmentSize() - { - return fragmentSize; - } - - /** - * @param size size in characters of each fragment - */ - public void setFragmentSize(int size) - { - fragmentSize = size; - } + /** + * @param size size in characters of each fragment + */ + public void setFragmentSize(int size) { + fragmentSize = 
size; + } } diff --git a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java index ea9adae1eae..b522260af92 100644 --- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java +++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java @@ -17,10 +17,13 @@ package org.apache.lucene.search.highlight; * See the License for the specific language governing permissions and * limitations under the License. */ -import org.apache.lucene.analysis.Token; - import java.util.List; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + /** * {@link Fragmenter} implementation which breaks text up into same-size @@ -34,6 +37,9 @@ public class SimpleSpanFragmenter implements Fragmenter { private SpanScorer spanScorer; private int waitForPos = -1; private int textSize; + private TermAttribute termAtt; + private PositionIncrementAttribute posIncAtt; + private OffsetAttribute offsetAtt; /** * @param spanscorer SpanScorer that was used to score hits @@ -50,12 +56,12 @@ public class SimpleSpanFragmenter implements Fragmenter { this.fragmentSize = fragmentSize; this.spanScorer = spanscorer; } - + /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment(org.apache.lucene.analysis.Token) + * @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment() */ - public boolean isNewFragment(Token token) { - position += token.getPositionIncrement(); + public boolean isNewFragment() { + position += posIncAtt.getPositionIncrement(); if (waitForPos == position) { waitForPos = -1; @@ -63,7 +69,7 @@ public class SimpleSpanFragmenter implements Fragmenter { return 
false; } - WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(token.term()); + WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(termAtt.term()); if (wSpanTerm != null) { List positionSpans = wSpanTerm.getPositionSpans(); @@ -76,8 +82,8 @@ public class SimpleSpanFragmenter implements Fragmenter { } } - boolean isNewFrag = token.endOffset() >= (fragmentSize * currentNumFrags) - && (textSize - token.endOffset()) >= (fragmentSize >>> 1); + boolean isNewFrag = offsetAtt.endOffset() >= (fragmentSize * currentNumFrags) + && (textSize - offsetAtt.endOffset()) >= (fragmentSize >>> 1); if (isNewFrag) { currentNumFrags++; @@ -86,12 +92,16 @@ public class SimpleSpanFragmenter implements Fragmenter { return isNewFrag; } + /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String) + * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String, org.apache.lucene.analysis.TokenStream) */ - public void start(String originalText) { + public void start(String originalText, TokenStream tokenStream) { position = -1; currentNumFrags = 1; textSize = originalText.length(); + termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class); + posIncAtt = (PositionIncrementAttribute) tokenStream.getAttribute(PositionIncrementAttribute.class); + offsetAtt = (OffsetAttribute) tokenStream.getAttribute(OffsetAttribute.class); } } diff --git a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java index 6aaa4fea4e9..2280178dd51 100644 --- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java +++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java @@ -7,9 +7,10 @@ import java.util.Map; import java.util.Set; import org.apache.lucene.analysis.CachingTokenFilter; -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import 
org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.search.ConstantScoreRangeQuery; import org.apache.lucene.search.Query; @@ -26,6 +27,8 @@ public class SpanScorer implements Scorer { private float maxTermWeight; private int position = -1; private String defaultField; + private TermAttribute termAtt; + private PositionIncrementAttribute posIncAtt; private static boolean highlightCnstScrRngQuery; /** @@ -176,9 +179,9 @@ public class SpanScorer implements Scorer { * @see org.apache.lucene.search.highlight.Scorer#getTokenScore(org.apache.lucene.analysis.Token, * int) */ - public float getTokenScore(Token token) { - position += token.getPositionIncrement(); - String termText = token.term(); + public float getTokenScore() { + position += posIncAtt.getPositionIncrement(); + String termText = termAtt.term(); WeightedSpanTerm weightedSpanTerm; @@ -203,6 +206,11 @@ public class SpanScorer implements Scorer { return score; } + public void init(TokenStream tokenStream) { + termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class); + posIncAtt = (PositionIncrementAttribute) tokenStream.getAttribute(PositionIncrementAttribute.class); + } + /** * Retrieve the WeightedSpanTerm for the specified token. Useful for passing * Span information to a Fragmenter. diff --git a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java index 280192a984e..03dc523f002 100644 --- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java +++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java @@ -1,4 +1,5 @@ package org.apache.lucene.search.highlight; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with @@ -15,118 +16,117 @@ package org.apache.lucene.search.highlight; * See the License for the specific language governing permissions and * limitations under the License. */ + import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** - * One, or several overlapping tokens, along with the score(s) and the - * scope of the original text + * One, or several overlapping tokens, along with the score(s) and the scope of + * the original text */ -public class TokenGroup -{ - - private static final int MAX_NUM_TOKENS_PER_GROUP=50; - Token [] tokens=new Token[MAX_NUM_TOKENS_PER_GROUP]; - float [] scores=new float[MAX_NUM_TOKENS_PER_GROUP]; - int numTokens=0; - int startOffset=0; - int endOffset=0; - float tot; +public class TokenGroup { + private static final int MAX_NUM_TOKENS_PER_GROUP = 50; + Token [] tokens=new Token[MAX_NUM_TOKENS_PER_GROUP]; + float[] scores = new float[MAX_NUM_TOKENS_PER_GROUP]; + int numTokens = 0; + int startOffset = 0; + int endOffset = 0; + float tot; int matchStartOffset, matchEndOffset; + private OffsetAttribute offsetAtt; + private TermAttribute termAtt; - void addToken(Token token, float score) - { - if(numTokens < MAX_NUM_TOKENS_PER_GROUP) - { - if(numTokens==0) - { - startOffset=matchStartOffset=token.startOffset(); - endOffset=matchEndOffset=token.endOffset(); - tot += score; - } - else - { - startOffset=Math.min(startOffset,token.startOffset()); - endOffset=Math.max(endOffset,token.endOffset()); - if (score>0) { - if (tot==0) { - matchStartOffset=token.startOffset(); - matchEndOffset=token.endOffset(); + public TokenGroup(TokenStream tokenStream) { + offsetAtt = (OffsetAttribute) tokenStream.getAttribute(OffsetAttribute.class); + termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class); + } + + void addToken(float score) { 
+ if (numTokens < MAX_NUM_TOKENS_PER_GROUP) { + int termStartOffset = offsetAtt.startOffset(); + int termEndOffset = offsetAtt.endOffset(); + if (numTokens == 0) { + startOffset = matchStartOffset = termStartOffset; + endOffset = matchEndOffset = termEndOffset; + tot += score; + } else { + startOffset = Math.min(startOffset, termStartOffset); + endOffset = Math.max(endOffset, termEndOffset); + if (score > 0) { + if (tot == 0) { + matchStartOffset = offsetAtt.startOffset(); + matchEndOffset = offsetAtt.endOffset(); } else { - matchStartOffset=Math.min(matchStartOffset,token.startOffset()); - matchEndOffset=Math.max(matchEndOffset,token.endOffset()); + matchStartOffset = Math.min(matchStartOffset, termStartOffset); + matchEndOffset = Math.max(matchEndOffset, termEndOffset); } - tot+=score; + tot += score; } } - tokens[numTokens]= (Token) token.clone(); - scores[numTokens]=score; - numTokens++; - } - } + Token token = new Token(termStartOffset, termEndOffset); + token.setTermBuffer(termAtt.term()); + tokens[numTokens] = token; + scores[numTokens] = score; + numTokens++; + } + } - boolean isDistinct(Token token) - { - return token.startOffset()>=endOffset; - } + boolean isDistinct() { + return offsetAtt.startOffset() >= endOffset; + } + void clear() { + numTokens = 0; + tot = 0; + } + + /* + * @param index a value between 0 and numTokens -1 + * @return the "n"th token + */ + public Token getToken(int index) + { + return tokens[index]; + } - void clear() - { - numTokens=0; - tot=0; - } - - /** - * - * @param index a value between 0 and numTokens -1 - * @return the "n"th token - */ - public Token getToken(int index) - { - return tokens[index]; - } + /** + * + * @param index a value between 0 and numTokens -1 + * @return the "n"th score + */ + public float getScore(int index) { + return scores[index]; + } - /** - * - * @param index a value between 0 and numTokens -1 - * @return the "n"th score - */ - public float getScore(int index) - { - return scores[index]; - } + /** + 
* @return the end position in the original text + */ + public int getEndOffset() { + return endOffset; + } - /** - * @return the end position in the original text - */ - public int getEndOffset() - { - return endOffset; - } + /** + * @return the number of tokens in this group + */ + public int getNumTokens() { + return numTokens; + } - /** - * @return the number of tokens in this group - */ - public int getNumTokens() - { - return numTokens; - } + /** + * @return the start position in the original text + */ + public int getStartOffset() { + return startOffset; + } - /** - * @return the start position in the original text - */ - public int getStartOffset() - { - return startOffset; - } - - /** - * @return all tokens' scores summed up - */ - public float getTotalScore() - { - return tot; - } + /** + * @return all tokens' scores summed up + */ + public float getTotalScore() { + return tot; + } } diff --git a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java index 1dba0675008..f3660e8fe6c 100644 --- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java +++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java @@ -29,6 +29,8 @@ import java.util.Comparator; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.TermFreqVector; @@ -135,32 +137,45 @@ public class TokenSources * @param tokenPositionsGuaranteedContiguous true if the token position numbers have no overlaps or gaps. If looking * to eek out the last drops of performance, set to true. If in doubt, set to false. 
*/ - public static TokenStream getTokenStream(TermPositionVector tpv, boolean tokenPositionsGuaranteedContiguous) - { + public static TokenStream getTokenStream(TermPositionVector tpv, boolean tokenPositionsGuaranteedContiguous) { //an object used to iterate across an array of tokens - class StoredTokenStream extends TokenStream - { - Token tokens[]; - int currentToken=0; - StoredTokenStream(Token tokens[]) - { - this.tokens=tokens; + class StoredTokenStream extends TokenStream { + Token tokens[]; + int currentToken = 0; + TermAttribute termAtt; + OffsetAttribute offsetAtt; + + StoredTokenStream(Token tokens[]) { + this.tokens = tokens; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + } + + public Token next(final Token reusableToken) { + System.out.println("next token"); + assert reusableToken != null; + if (currentToken >= tokens.length) { + return null; } - public Token next(final Token reusableToken) - { - assert reusableToken != null; - if(currentToken>=tokens.length) - { - return null; - } - return tokens[currentToken++]; - } - } + return tokens[currentToken++]; + } + + public boolean incrementToken() throws IOException { + System.out.println("inc token"); + if (currentToken >= tokens.length) { + return false; + } + Token token = tokens[currentToken++]; + termAtt.setTermBuffer(token.term()); + offsetAtt.setOffset(token.startOffset(), token.endOffset()); + return true; + } + } //code to reconstruct the original sequence of Tokens String[] terms=tpv.getTerms(); int[] freq=tpv.getTermFrequencies(); int totalTokens=0; - Token newToken = new Token(); + for (int t = 0; t < freq.length; t++) { totalTokens+=freq[t]; @@ -190,8 +205,9 @@ public class TokenSources } for (int tp = 0; tp < offsets.length; tp++) { - newToken.reinit(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset()); - unsortedTokens.add(newToken.clone()); + Token token = new 
Token(offsets[tp].getStartOffset(), offsets[tp].getEndOffset()); + token.setTermBuffer(terms[t]); + unsortedTokens.add(token); } } else @@ -204,8 +220,8 @@ public class TokenSources //tokens stored with positions - can use this to index straight into sorted array for (int tp = 0; tp < pos.length; tp++) { - newToken.reinit(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset()); - tokensInOriginalOrder[pos[tp]] = (Token) newToken.clone(); + Token token = new Token(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset()); + tokensInOriginalOrder[pos[tp]] = token; } } } @@ -218,7 +234,7 @@ public class TokenSources { Token t1=(Token) o1; Token t2=(Token) o2; - if(t1.startOffset()>t2.startOffset()) + if(t1.startOffset()>t2.endOffset()) return 1; if(t1.startOffset()", "" used to highlight SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(); @@ -908,10 +941,12 @@ public class HighlighterTest extends TestCase implements Formatter { Query query = parser.parse(srchkey); TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(s)); + Highlighter highlighter = getHighlighter(query, null, tokenStream, HighlighterTest.this); // Get 3 best fragments and seperate with a "..." 
tokenStream = analyzer.tokenStream(null, new StringReader(s)); + String result = highlighter.getBestFragments(tokenStream, s, 3, "..."); String expectedResult = "football-soccer in the euro 2004 footie competition"; assertTrue("overlapping analyzer should handle highlights OK, expected:" + expectedResult @@ -1075,10 +1110,11 @@ public class HighlighterTest extends TestCase implements Formatter { } public void testUnRewrittenQuery() throws Exception { - TestHighlightRunner helper = new TestHighlightRunner() { + final TestHighlightRunner helper = new TestHighlightRunner() { public void run() throws Exception { numHighlights = 0; + SpanScorer.setHighlightCnstScrRngQuery(false); // test to show how rewritten query can still be used searcher = new IndexSearcher(ramDir); Analyzer analyzer = new StandardAnalyzer(); @@ -1154,13 +1190,17 @@ public class HighlighterTest extends TestCase implements Formatter { public void startFragment(TextFragment newFragment) { } - public float getTokenScore(Token token) { + public float getTokenScore() { return 0; } public float getFragmentScore() { return 1; } + + public void init(TokenStream tokenStream) { + + } }); highlighter.setTextFragmenter(new SimpleFragmenter(2000)); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(rawDocContent)); @@ -1292,27 +1332,44 @@ public class HighlighterTest extends TestCase implements Formatter { return new TokenStream() { Iterator iter; List lst; + private TermAttribute termAtt; + private PositionIncrementAttribute posIncrAtt; + private OffsetAttribute offsetAtt; { + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); lst = new ArrayList(); Token t; t = createToken("hi", 0, 2); + t.setPositionIncrement(1); lst.add(t); t = createToken("hispeed", 0, 8); + t.setPositionIncrement(1); lst.add(t); t = 
createToken("speed", 3, 8); t.setPositionIncrement(0); lst.add(t); t = createToken("10", 8, 10); + t.setPositionIncrement(1); lst.add(t); t = createToken("foo", 11, 14); + t.setPositionIncrement(1); lst.add(t); iter = lst.iterator(); } - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - return iter.hasNext() ? (Token) iter.next() : null; + public boolean incrementToken() throws IOException { + if(iter.hasNext()) { + Token token = (Token) iter.next(); + termAtt.setTermBuffer(token.term()); + posIncrAtt.setPositionIncrement(token.getPositionIncrement()); + offsetAtt.setOffset(token.startOffset(), token.endOffset()); + return true; + } + return false; } + }; } @@ -1322,26 +1379,42 @@ public class HighlighterTest extends TestCase implements Formatter { return new TokenStream() { Iterator iter; List lst; + private TermAttribute termAtt; + private PositionIncrementAttribute posIncrAtt; + private OffsetAttribute offsetAtt; { + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); lst = new ArrayList(); Token t; t = createToken("hispeed", 0, 8); + t.setPositionIncrement(1); lst.add(t); t = createToken("hi", 0, 2); t.setPositionIncrement(0); lst.add(t); t = createToken("speed", 3, 8); + t.setPositionIncrement(1); lst.add(t); t = createToken("10", 8, 10); + t.setPositionIncrement(1); lst.add(t); t = createToken("foo", 11, 14); + t.setPositionIncrement(1); lst.add(t); iter = lst.iterator(); } - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - return iter.hasNext() ? 
(Token) iter.next() : null; + public boolean incrementToken() throws IOException { + if(iter.hasNext()) { + Token token = (Token) iter.next(); + termAtt.setTermBuffer(token.term()); + posIncrAtt.setPositionIncrement(token.getPositionIncrement()); + offsetAtt.setOffset(token.startOffset(), token.endOffset()); + return true; + } + return false; } }; } @@ -1611,7 +1684,11 @@ class SynonymAnalyzer extends Analyzer { * java.io.Reader) */ public TokenStream tokenStream(String arg0, Reader arg1) { - return new SynonymTokenizer(new LowerCaseTokenizer(arg1), synonyms); + LowerCaseTokenizer stream = new LowerCaseTokenizer(arg1); + stream.addAttribute(TermAttribute.class); + stream.addAttribute(PositionIncrementAttribute.class); + stream.addAttribute(OffsetAttribute.class); + return new SynonymTokenizer(stream, synonyms); } } @@ -1622,47 +1699,70 @@ class SynonymAnalyzer extends Analyzer { class SynonymTokenizer extends TokenStream { private TokenStream realStream; private Token currentRealToken = null; + private org.apache.lucene.analysis.Token cRealToken = null; private Map synonyms; StringTokenizer st = null; + private TermAttribute realTermAtt; + private PositionIncrementAttribute realPosIncrAtt; + private OffsetAttribute realOffsetAtt; + private TermAttribute termAtt; + private PositionIncrementAttribute posIncrAtt; + private OffsetAttribute offsetAtt; public SynonymTokenizer(TokenStream realStream, Map synonyms) { this.realStream = realStream; this.synonyms = synonyms; + realTermAtt = (TermAttribute) realStream.getAttribute(TermAttribute.class); + realPosIncrAtt = (PositionIncrementAttribute) realStream.getAttribute(PositionIncrementAttribute.class); + realOffsetAtt = (OffsetAttribute) realStream.getAttribute(OffsetAttribute.class); + + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); } - public 
Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; + public boolean incrementToken() throws IOException { + if (currentRealToken == null) { - Token nextRealToken = realStream.next(reusableToken); - if (nextRealToken == null) { - return null; + boolean next = realStream.incrementToken(); + if (!next) { + return false; } - String expansions = (String) synonyms.get(nextRealToken.term()); + //Token nextRealToken = new Token(, offsetAtt.startOffset(), offsetAtt.endOffset()); + termAtt.setTermBuffer(realTermAtt.term()); + offsetAtt.setOffset(realOffsetAtt.startOffset(), realOffsetAtt.endOffset()); + posIncrAtt.setPositionIncrement(realPosIncrAtt.getPositionIncrement()); + + String expansions = (String) synonyms.get(realTermAtt.term()); if (expansions == null) { - return nextRealToken; + return true; } st = new StringTokenizer(expansions, ","); if (st.hasMoreTokens()) { - currentRealToken = (Token) nextRealToken.clone(); + currentRealToken = new Token(realOffsetAtt.startOffset(), realOffsetAtt.endOffset()); + currentRealToken.setTermBuffer(realTermAtt.term()); } - return currentRealToken; + + return true; } else { - reusableToken.reinit(st.nextToken(), - currentRealToken.startOffset(), - currentRealToken.endOffset()); - reusableToken.setPositionIncrement(0); + String tok = st.nextToken(); + termAtt.setTermBuffer(tok); + offsetAtt.setOffset(currentRealToken.startOffset(), currentRealToken.endOffset()); + posIncrAtt.setPositionIncrement(0); if (!st.hasMoreTokens()) { currentRealToken = null; st = null; } - return reusableToken; + return true; } + } static abstract class TestHighlightRunner { static final int STANDARD = 0; static final int SPAN = 1; int mode = STANDARD; + Fragmenter frag = new SimpleFragmenter(20); public Highlighter getHighlighter(Query query, String fieldName, TokenStream stream, Formatter formatter) { @@ -1725,7 +1825,7 @@ class SynonymTokenizer extends TokenStream { if (mode == SPAN) { ((CachingTokenFilter) 
tokenStream).reset(); } - highlighter.setTextFragmenter(new SimpleFragmenter(20)); + highlighter.setTextFragmenter(frag); String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, fragmentSeparator);