diff --git a/CHANGES.txt b/CHANGES.txt index 7f9aced0962..6978e2737eb 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -356,6 +356,9 @@ API Changes 33. LUCENE-1705: Added IndexWriter.deleteAllDocuments. (Tim Smith via Mike McCandless) +34. LUCENE-1460: Changed TokenStreams/TokenFilters in contrib to + use the new TokenStream API. (Robert Muir, Michael Busch) + Bug fixes 1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals() diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java index 399b1f7ae85..4e12ab7a1c5 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java @@ -19,35 +19,33 @@ package org.apache.lucene.analysis.ar; import java.io.IOException; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * A TokenFilter that applies {@link ArabicNormalizer} to normalize the orthography. * */ -public class ArabicNormalizationFilter extends TokenFilter { +public final class ArabicNormalizationFilter extends TokenFilter { protected ArabicNormalizer normalizer = null; - + private TermAttribute termAtt; + public ArabicNormalizationFilter(TokenStream input) { super(input); normalizer = new ArabicNormalizer(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } - - - public Token next(Token reusableToken) throws IOException { - if ((reusableToken = input.next(reusableToken)) == null) { - return null; + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + int newlen = normalizer.normalize(termAtt.termBuffer(), termAtt.termLength()); + termAtt.setTermLength(newlen); + return true; } else { - int oldlen = reusableToken.termLength(); - int newlen = normalizer.normalize(reusableToken.termBuffer(), oldlen); - if (oldlen != newlen) - reusableToken.setTermLength(newlen); - return reusableToken; + return false; } } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java index 39d7afa65cd..34beb5f9fa9 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java @@ -19,43 +19,33 @@ package org.apache.lucene.analysis.ar; import java.io.IOException; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * A TokenFilter that applies {@link ArabicStemmer} to stem Arabic words.. 
* */ -public class ArabicStemFilter extends TokenFilter { +public final class ArabicStemFilter extends TokenFilter { protected ArabicStemmer stemmer = null; - + private TermAttribute termAtt; + public ArabicStemFilter(TokenStream input) { super(input); stemmer = new ArabicStemmer(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } - - - /** - * @return Returns the next token in the stream, or null at EOS - */ - public Token next(Token reusableToken) throws IOException { - /** - * The actual token in the input stream. - */ - - - if ((reusableToken = input.next(reusableToken)) == null) { - return null; + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength()); + termAtt.setTermLength(newlen); + return true; } else { - int oldlen = reusableToken.termLength(); - int newlen = stemmer.stem(reusableToken.termBuffer(), oldlen); - if (oldlen != newlen) - reusableToken.setTermLength(newlen); - return reusableToken; + return false; } } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java index 360a0df8281..3eff32f9faa 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java @@ -17,13 +17,12 @@ package org.apache.lucene.analysis.br; * limitations under the License. */ -import org.apache.lucene.analysis.Token; +import java.io.IOException; +import java.util.Set; + import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; - -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Based on GermanStemFilter @@ -36,10 +35,12 @@ public final class BrazilianStemFilter extends TokenFilter { */ private BrazilianStemmer stemmer = null; private Set exclusions = null; - + private TermAttribute termAtt; + public BrazilianStemFilter(TokenStream in) { super(in); stemmer = new BrazilianStemmer(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } public BrazilianStemFilter(TokenStream in, Set exclusiontable) { @@ -47,26 +48,20 @@ public final class BrazilianStemFilter extends TokenFilter { this.exclusions = exclusiontable; } - /** - * @return Returns the next token in the stream, or null at EOS. - */ - public final Token next(final Token reusableToken) - throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken == null) - return null; - - String term = nextToken.term(); - - // Check the exclusion table. - if (exclusions == null || !exclusions.contains(term)) { - String s = stemmer.stem(term); - // If not stemmed, don't waste the time adjusting the token. - if ((s != null) && !s.equals(term)) - nextToken.setTermBuffer(s); + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String term = termAtt.term(); + // Check the exclusion table. + if (exclusions == null || !exclusions.contains(term)) { + String s = stemmer.stem(term); + // If not stemmed, don't waste the time adjusting the token. 
+ if ((s != null) && !s.equals(term)) + termAtt.setTermBuffer(s); + } + return true; + } else { + return false; } - return nextToken; } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java index 3f218fd72a8..5b7a42287d9 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java @@ -17,11 +17,14 @@ package org.apache.lucene.analysis.cjk; * limitations under the License. */ -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.Tokenizer; - +import java.io.IOException; import java.io.Reader; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; + /** * CJKTokenizer was modified from StopTokenizer which does a decent job for @@ -88,6 +91,10 @@ public final class CJKTokenizer extends Tokenizer { */ private boolean preIsTokened = false; + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; + private TypeAttribute typeAtt; + //~ Constructors ----------------------------------------------------------- /** @@ -97,25 +104,26 @@ public final class CJKTokenizer extends Tokenizer { */ public CJKTokenizer(Reader in) { super(in); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); } //~ Methods ---------------------------------------------------------------- /** - * Returns the next token in the stream, or null at EOS. + * Returns true for the next token in the stream, or false at EOS. * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html * for detail. * - * @param reusableToken a reusable token - * @return Token + * @return false for end of stream, true otherwise * * @throws java.io.IOException - throw IOException when read error
* happened in the InputStream * */ - public final Token next(final Token reusableToken) throws java.io.IOException { + public boolean incrementToken() throws IOException { /** how many character(s) has been stored in buffer */ - assert reusableToken != null; while(true) { // loop until we find a non-empty token @@ -147,7 +155,7 @@ public final class CJKTokenizer extends Tokenizer { break; } else { - return null; + return false; } } else { //get current character @@ -252,10 +260,12 @@ public final class CJKTokenizer extends Tokenizer { } if (length > 0) { - return reusableToken.reinit - (buffer, 0, length, input.correctOffset(start), input.correctOffset(start+length), TOKEN_TYPE_NAMES[tokenType]); + termAtt.setTermBuffer(buffer, 0, length); + offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length)); + typeAtt.setType(TOKEN_TYPE_NAMES[tokenType]); + return true; } else if (dataLen == -1) { - return null; + return false; } // Cycle back and try for the next token (don't diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java index a85a2ed8842..31de4a7f0a5 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java @@ -17,12 +17,13 @@ package org.apache.lucene.analysis.cn; * limitations under the License. */ +import java.io.IOException; import java.util.HashMap; import java.util.Map; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Title: ChineseFilter @@ -56,19 +57,21 @@ public final class ChineseFilter extends TokenFilter { private Map stopTable; + private TermAttribute termAtt; + public ChineseFilter(TokenStream in) { super(in); stopTable = new HashMap(STOP_WORDS.length); for (int i = 0; i < STOP_WORDS.length; i++) stopTable.put(STOP_WORDS[i], STOP_WORDS[i]); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } - public final Token next(final Token reusableToken) throws java.io.IOException { - assert reusableToken != null; + public boolean incrementToken() throws IOException { - for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) { - String text = nextToken.term(); + while (input.incrementToken()) { + String text = termAtt.term(); // why not key off token type here assuming ChineseTokenizer comes first? if (stopTable.get(text) == null) { @@ -79,7 +82,7 @@ public final class ChineseFilter extends TokenFilter { // English word/token should larger than 1 character. if (text.length()>1) { - return nextToken; + return true; } break; case Character.OTHER_LETTER: @@ -87,13 +90,13 @@ public final class ChineseFilter extends TokenFilter { // One Chinese character as one Chinese word. // Chinese word extraction to be added later here. 
- return nextToken; + return true; } } } - return null; + return false; } } \ No newline at end of file diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java index f9a1aec8fff..cc7f7453733 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java @@ -18,10 +18,12 @@ package org.apache.lucene.analysis.cn; */ +import java.io.IOException; import java.io.Reader; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** @@ -56,6 +58,8 @@ public final class ChineseTokenizer extends Tokenizer { public ChineseTokenizer(Reader in) { super(in); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); } private int offset = 0, bufferIndex=0, dataLen=0; @@ -68,7 +72,9 @@ public final class ChineseTokenizer extends Tokenizer { private int length; private int start; - + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; + private final void push(char c) { if (length == 0) start = offset-1; // start of token @@ -76,19 +82,20 @@ public final class ChineseTokenizer extends Tokenizer { } - private final Token flush(final Token token) { + private final boolean flush() { if (length>0) { //System.out.println(new String(buffer, 0, //length)); - return token.reinit(buffer, 0, length, input.correctOffset(start), input.correctOffset(start+length)); + termAtt.setTermBuffer(buffer, 0, length); + offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length)); + return true; } else - return null; + return false; } - public final Token next(final Token reusableToken) throws java.io.IOException { - assert reusableToken != null; + public boolean incrementToken() throws IOException { length = 0; start = offset; @@ -104,7 +111,7 @@ public final class ChineseTokenizer extends Tokenizer { bufferIndex = 0; } - if (dataLen == -1) return flush(reusableToken); + if (dataLen == -1) return flush(); else c = ioBuffer[bufferIndex++]; @@ -115,20 +122,20 @@ public final class ChineseTokenizer extends Tokenizer { case Character.LOWERCASE_LETTER: case Character.UPPERCASE_LETTER: push(c); - if (length == MAX_WORD_LEN) return flush(reusableToken); + if (length == MAX_WORD_LEN) return flush(); break; case Character.OTHER_LETTER: if (length>0) { bufferIndex--; offset--; - return flush(reusableToken); + return flush(); } push(c); - return flush(reusableToken); + return flush(); default: - if (length>0) return flush(reusableToken); + if (length>0) return flush(); break; } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java index a43b35e5bf0..15cead80072 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java @@ -28,6 +28,12 @@ import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import 
org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; /** * Base class for decomposition token filters. @@ -54,6 +60,15 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter { protected final int minSubwordSize; protected final int maxSubwordSize; protected final boolean onlyLongestMatch; + + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; + private FlagsAttribute flagsAtt; + private PositionIncrementAttribute posIncAtt; + private TypeAttribute typeAtt; + private PayloadAttribute payloadAtt; + + private final Token wrapper = new Token(); protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) { this(input,makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch); @@ -90,6 +105,13 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter { this.dictionary = new CharArraySet(dictionary.size(), false); addAllLowerCase(this.dictionary, dictionary); } + + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class); + posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); } /** @@ -105,26 +127,54 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter { return dict; } - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; + private final void setToken(final Token token) throws IOException { + termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength()); + flagsAtt.setFlags(token.getFlags()); + typeAtt.setType(token.type()); + offsetAtt.setOffset(token.startOffset(), token.endOffset()); + posIncAtt.setPositionIncrement(token.getPositionIncrement()); + payloadAtt.setPayload(token.getPayload()); + } + + public final boolean incrementToken() throws IOException { if (tokens.size() > 0) { - return (Token)tokens.removeFirst(); + setToken((Token)tokens.removeFirst()); + return true; } - Token nextToken = input.next(reusableToken); - if (nextToken == null) { - return null; - } - - decompose(nextToken); + if (input.incrementToken() == false) + return false; + + wrapper.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength()); + wrapper.setStartOffset(offsetAtt.startOffset()); + wrapper.setEndOffset(offsetAtt.endOffset()); + wrapper.setFlags(flagsAtt.getFlags()); + wrapper.setType(typeAtt.type()); + wrapper.setPositionIncrement(posIncAtt.getPositionIncrement()); + wrapper.setPayload(payloadAtt.getPayload()); + + decompose(wrapper); if (tokens.size() > 0) { - return (Token)tokens.removeFirst(); + setToken((Token)tokens.removeFirst()); + return true; } else { - return null; + return false; } } + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. 
*/ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next() throws java.io.IOException { + return super.next(); + } + protected static final void addAllLowerCase(Set target, Collection col) { Iterator iter=col.iterator(); diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java index 9c90c26eaed..1929563ed71 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java @@ -17,13 +17,13 @@ package org.apache.lucene.analysis.de; * limitations under the License. */ -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; - import java.io.IOException; import java.util.Set; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + /** * A filter that stems German words. It supports a table of words that should * not be stemmed at all. The stemmer used can be changed at runtime after the @@ -40,10 +40,13 @@ public final class GermanStemFilter extends TokenFilter private GermanStemmer stemmer = null; private Set exclusionSet = null; + private TermAttribute termAtt; + public GermanStemFilter( TokenStream in ) { super(in); stemmer = new GermanStemmer(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** @@ -56,26 +59,22 @@ public final class GermanStemFilter extends TokenFilter } /** - * @return Returns the next token in the stream, or null at EOS + * @return Returns true for next token in the stream, or false at EOS */ - public final Token next(final Token reusableToken) - throws IOException - { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - - if (nextToken == null) - return null; - - String term = nextToken.term(); - // Check the exclusion table. - if (exclusionSet == null || !exclusionSet.contains(term)) { - String s = stemmer.stem(term); - // If not stemmed, don't waste the time adjusting the token. - if ((s != null) && !s.equals(term)) - nextToken.setTermBuffer(s); + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String term = termAtt.term(); + // Check the exclusion table. + if (exclusionSet == null || !exclusionSet.contains(term)) { + String s = stemmer.stem(term); + // If not stemmed, don't waste the time adjusting the token. + if ((s != null) && !s.equals(term)) + termAtt.setTermBuffer(s); + } + return true; + } else { + return false; } - return nextToken; } /** diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java index 9737a282c89..8e94ac55ef1 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java @@ -16,9 +16,11 @@ package org.apache.lucene.analysis.el; * limitations under the License. 
*/ +import java.io.IOException; + import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Normalizes token text to lower case, analyzing given ("greek") charset. @@ -28,26 +30,26 @@ public final class GreekLowerCaseFilter extends TokenFilter { char[] charset; + private TermAttribute termAtt; + public GreekLowerCaseFilter(TokenStream in, char[] charset) { super(in); this.charset = charset; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } - public final Token next(final Token reusableToken) throws java.io.IOException - { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - - if (nextToken == null) - return null; - - char[] chArray = nextToken.termBuffer(); - int chLen = nextToken.termLength(); + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + char[] chArray = termAtt.termBuffer(); + int chLen = termAtt.termLength(); for (int i = 0; i < chLen; i++) { - chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset); + chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset); } - return nextToken; + return true; + } else { + return false; + } } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java index 3efaf6bc32f..b354e4dfdb1 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java @@ -25,6 +25,7 @@ import java.util.Iterator; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Removes elisions from a token stream. 
For example, "l'avion" (the plane) will be @@ -36,7 +37,8 @@ import org.apache.lucene.analysis.TokenFilter; */ public class ElisionFilter extends TokenFilter { private Set articles = null; - + private TermAttribute termAtt; + private static char[] apostrophes = {'\'', '’'}; public void setArticles(Set articles) { @@ -54,6 +56,7 @@ public class ElisionFilter extends TokenFilter { super(input); this.articles = new HashSet(Arrays.asList(new String[] { "l", "m", "t", "qu", "n", "s", "j" })); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** @@ -62,6 +65,7 @@ public class ElisionFilter extends TokenFilter { public ElisionFilter(TokenStream input, Set articles) { super(input); setArticles(articles); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** @@ -70,39 +74,50 @@ public class ElisionFilter extends TokenFilter { public ElisionFilter(TokenStream input, String[] articles) { super(input); setArticles(new HashSet(Arrays.asList(articles))); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** * Returns the next input Token with term() without elisioned start */ - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken == null) - return null; + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + char[] termBuffer = termAtt.termBuffer(); + int termLength = termAtt.termLength(); - char[] termBuffer = nextToken.termBuffer(); - int termLength = nextToken.termLength(); - - int minPoz = Integer.MAX_VALUE; - for (int i = 0; i < apostrophes.length; i++) { - char apos = apostrophes[i]; - // The equivalent of String.indexOf(ch) - for (int poz = 0; poz < termLength ; poz++) { - if (termBuffer[poz] == apos) { + int minPoz = Integer.MAX_VALUE; + for (int i = 0; i < apostrophes.length; i++) { + char apos = apostrophes[i]; + // The equivalent of String.indexOf(ch) + for (int poz = 0; poz < termLength ; poz++) { + if (termBuffer[poz] == apos) { minPoz = Math.min(poz, minPoz); break; + } } } - } - // An apostrophe has been found. If the prefix is an article strip it off. - if (minPoz != Integer.MAX_VALUE - && articles.contains(new String(nextToken.termBuffer(), 0, minPoz).toLowerCase())) { - nextToken.setTermBuffer(nextToken.termBuffer(), minPoz + 1, nextToken.termLength() - (minPoz + 1)); - } + // An apostrophe has been found. If the prefix is an article strip it off. + if (minPoz != Integer.MAX_VALUE + && articles.contains(new String(termAtt.termBuffer(), 0, minPoz).toLowerCase())) { + termAtt.setTermBuffer(termAtt.termBuffer(), minPoz + 1, termAtt.termLength() - (minPoz + 1)); + } - return nextToken; + return true; + } else { + return false; + } + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); } + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. 
*/ + public final Token next() throws java.io.IOException { + return super.next(); + } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java index d5723db56c5..991c4ec1e5f 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java @@ -20,6 +20,7 @@ package org.apache.lucene.analysis.fr; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import java.io.IOException; import java.util.HashSet; @@ -39,10 +40,13 @@ public final class FrenchStemFilter extends TokenFilter { */ private FrenchStemmer stemmer = null; private Set exclusions = null; + + private TermAttribute termAtt; public FrenchStemFilter( TokenStream in ) { super(in); stemmer = new FrenchStemmer(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } @@ -52,25 +56,23 @@ public final class FrenchStemFilter extends TokenFilter { } /** - * @return Returns the next token in the stream, or null at EOS + * @return Returns true for the next token in the stream, or false at EOS */ - public final Token next(final Token reusableToken) - throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken == null) - return null; + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String term = termAtt.term(); - String term = nextToken.term(); - - // Check the exclusion table - if ( exclusions == null || !exclusions.contains( term ) ) { - String s = stemmer.stem( term ); - // If not stemmed, don't waste the time adjusting the token. - if ((s != null) && !s.equals( term ) ) - nextToken.setTermBuffer(s); - } - return nextToken; + // Check the exclusion table + if ( exclusions == null || !exclusions.contains( term ) ) { + String s = stemmer.stem( term ); + // If not stemmed, don't waste the time adjusting the token. + if ((s != null) && !s.equals( term ) ) + termAtt.setTermBuffer(s); + } + return true; + } else { + return false; + } } /** * Set a alternative/custom FrenchStemmer for this filter. diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java index 9a3d1b9816f..bd82911c5d9 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java @@ -27,8 +27,19 @@ import java.io.IOException; */ public class EmptyTokenStream extends TokenStream { - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - return null; + public final boolean incrementToken() throws IOException { + return false; + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. 
Delegates to the backwards compatibility layer. */ + public final Token next() throws java.io.IOException { + return super.next(); } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java index 3e62c43ce52..3065f9f7780 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java @@ -24,6 +24,7 @@ import java.io.IOException; /** * Links two PrefixAwareTokenFilter + * @deprecated */ public class PrefixAndSuffixAwareTokenFilter extends TokenStream { diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java index 72948cbe622..b49c9f8241a 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java @@ -29,6 +29,7 @@ import java.io.IOException; * to be used when updating the token values in the second stream based on that token. * * The default implementation adds last prefix token end offset to the suffix token start and end offsets. + * @deprecated */ public class PrefixAwareTokenFilter extends TokenStream { diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java index 8efcff38017..13f0eb6cd1d 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java @@ -17,10 +17,16 @@ package org.apache.lucene.analysis.miscellaneous; * limitations under the License. */ +import java.io.IOException; + import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; - -import java.io.IOException; +import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; /** * A token stream containing a single token. @@ -29,34 +35,66 @@ public class SingleTokenTokenStream extends TokenStream { private boolean exhausted = false; // The token needs to be immutable, so work with clones! 
- private Token token; + private Token singleToken; + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; + private FlagsAttribute flagsAtt; + private PositionIncrementAttribute posIncAtt; + private TypeAttribute typeAtt; + private PayloadAttribute payloadAtt; public SingleTokenTokenStream(Token token) { assert token != null; - this.token = (Token) token.clone(); + this.singleToken = (Token) token.clone(); + + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class); + posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); } - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; + public final boolean incrementToken() throws IOException { if (exhausted) { - return null; + return false; } + + Token clone = (Token) singleToken.clone(); + + termAtt.setTermBuffer(clone.termBuffer(), 0, clone.termLength()); + offsetAtt.setOffset(clone.startOffset(), clone.endOffset()); + flagsAtt.setFlags(clone.getFlags()); + typeAtt.setType(clone.type()); + posIncAtt.setPositionIncrement(clone.getPositionIncrement()); + payloadAtt.setPayload(clone.getPayload()); exhausted = true; - return (Token) token.clone(); + return true; + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); } + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next() throws java.io.IOException { + return super.next(); + } public void reset() throws IOException { exhausted = false; } public Token getToken() { - return (Token) token.clone(); + return (Token) singleToken.clone(); } public void setToken(Token token) { - this.token = (Token) token.clone(); + this.singleToken = (Token) token.clone(); } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java index 7ae055cee7c..a00e1ce633a 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java @@ -20,9 +20,10 @@ package org.apache.lucene.analysis.ngram; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import java.io.IOException; -import java.util.LinkedList; /** * Tokenizes the given token into n-grams of given size(s). 
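The same migration pattern recurs in every class touched by this patch: attribute fields are registered once in the constructor via addAttribute(), and the deprecated next(Token)/next() methods are replaced by incrementToken(), which mutates the shared attributes in place and returns true until end of stream. A minimal sketch of that pattern, assuming only the API calls already used in the hunks above (the LengthTruncateFilter class below is hypothetical and shown purely for illustration; it is not part of this change):

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    /** Illustrative only: truncates each term to at most 5 characters. */
    public final class LengthTruncateFilter extends TokenFilter {
      private TermAttribute termAtt;

      public LengthTruncateFilter(TokenStream input) {
        super(input);
        // Attributes replace the old reusable Token: register them once in the constructor.
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
      }

      // Replaces the deprecated "public Token next(Token reusableToken)":
      // return true while tokens remain, false at end of stream.
      public boolean incrementToken() throws IOException {
        if (input.incrementToken()) {
          if (termAtt.termLength() > 5) {
            termAtt.setTermLength(5);   // modify the shared attribute in place
          }
          return true;
        } else {
          return false;
        }
      }
    }

The final, deprecated next() and next(Token) overrides that several converted classes keep are thin delegates to TokenStream's backwards-compatibility layer (super.next(...)), so old-API consumers continue to work until those methods are removed in Lucene 3.0.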
@@ -66,11 +67,18 @@ public class EdgeNGramTokenFilter extends TokenFilter { private int minGram; private int maxGram; private Side side; - private LinkedList ngrams; + private char[] curTermBuffer; + private int curTermLength; + private int curGramSize; + + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; + protected EdgeNGramTokenFilter(TokenStream input) { super(input); - this.ngrams = new LinkedList(); + this.termAtt = (TermAttribute) addAttribute(TermAttribute.class); + this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); } /** @@ -99,7 +107,8 @@ public class EdgeNGramTokenFilter extends TokenFilter { this.minGram = minGram; this.maxGram = maxGram; this.side = side; - this.ngrams = new LinkedList(); + this.termAtt = (TermAttribute) addAttribute(TermAttribute.class); + this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); } /** @@ -114,54 +123,42 @@ public class EdgeNGramTokenFilter extends TokenFilter { this(input, Side.getSide(sideLabel), minGram, maxGram); } - /** Returns the next token in the stream, or null at EOS. */ - public final Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - if (!ngrams.isEmpty()) { - return (Token)ngrams.removeFirst(); - } - - Token token = null; - - while (ngrams.isEmpty() && (token = input.next()) != null) { - ngram(token); - } - - if (token == null) { - return null; - } - - if (!ngrams.isEmpty()) { - return (Token)ngrams.removeFirst(); - } else { - return null; + public final boolean incrementToken() throws IOException { + while (true) { + if (curTermBuffer == null) { + if (!input.incrementToken()) { + return false; + } else { + curTermBuffer = (char[]) termAtt.termBuffer().clone(); + curTermLength = termAtt.termLength(); + curGramSize = minGram; + } + } + if (curGramSize <= maxGram) { + if (! (curGramSize > curTermLength // if the remaining input is too short, we can't generate any n-grams + || curGramSize > maxGram)) { // if we have hit the end of our n-gram size range, quit + // grab gramSize chars from front or back + int start = side == Side.FRONT ? 0 : curTermLength - curGramSize; + int end = start + curGramSize; + offsetAtt.setOffset(start, end); + termAtt.setTermBuffer(curTermBuffer, start, curGramSize); + curGramSize++; + return true; + } + } + curTermBuffer = null; } } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); + } - private void ngram(final Token token) { - int termLength = token.termLength(); - char[] termBuffer = token.termBuffer(); - int gramSize = minGram; - while (gramSize <= maxGram) { - // if the remaining input is too short, we can't generate any n-grams - if (gramSize > termLength) { - return; - } - - // if we have hit the end of our n-gram size range, quit - if (gramSize > maxGram) { - return; - } - - // grab gramSize chars from front or back - int start = side == Side.FRONT ? 0 : termLength - gramSize; - int end = start + gramSize; - Token tok = (Token) token.clone(); - tok.setStartOffset(start); - tok.setEndOffset(end); - tok.setTermBuffer(termBuffer, start, gramSize); - ngrams.add(tok); - gramSize++; - } + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. 
*/ + public final Token next() throws java.io.IOException { + return super.next(); } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java index e6fe22b02a8..179ab33208f 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java @@ -20,6 +20,8 @@ package org.apache.lucene.analysis.ngram; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter.Side; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import java.io.IOException; import java.io.Reader; @@ -35,6 +37,9 @@ public class EdgeNGramTokenizer extends Tokenizer { public static final Side DEFAULT_SIDE = Side.FRONT; public static final int DEFAULT_MAX_GRAM_SIZE = 1; public static final int DEFAULT_MIN_GRAM_SIZE = 1; + + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; // Replace this with an enum when the Java 1.5 upgrade is made, the impl will be simplified /** Specifies which side of the input the n-gram should be generated from */ @@ -100,6 +105,9 @@ public class EdgeNGramTokenizer extends Tokenizer { this.minGram = minGram; this.maxGram = maxGram; this.side = side; + + this.termAtt = (TermAttribute) addAttribute(TermAttribute.class); + this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); } /** * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range @@ -114,8 +122,7 @@ public class EdgeNGramTokenizer extends Tokenizer { } /** Returns the next token in the stream, or null at EOS. */ - public final Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; + public final boolean incrementToken() throws IOException { // if we are just starting, read the whole input if (!started) { started = true; @@ -128,21 +135,32 @@ public class EdgeNGramTokenizer extends Tokenizer { // if the remaining input is too short, we can't generate any n-grams if (gramSize > inLen) { - return null; + return false; } // if we have hit the end of our n-gram size range, quit if (gramSize > maxGram) { - return null; + return false; } // grab gramSize chars from front or back int start = side == Side.FRONT ? 0 : inLen - gramSize; int end = start + gramSize; - reusableToken.setTermBuffer(inStr, start, gramSize); - reusableToken.setStartOffset(input.correctOffset(start)); - reusableToken.setEndOffset(input.correctOffset(end)); + termAtt.setTermBuffer(inStr, start, gramSize); + offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(end)); gramSize++; - return reusableToken; + return true; + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. 
*/ + public final Token next() throws java.io.IOException { + return super.next(); } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java index 761ec1891c8..ebf9fc0bdc0 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java @@ -17,12 +17,13 @@ package org.apache.lucene.analysis.ngram; * limitations under the License. */ +import java.io.IOException; + import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; - -import java.io.IOException; -import java.util.LinkedList; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Tokenizes the input into n-grams of the given size(s). @@ -32,7 +33,14 @@ public class NGramTokenFilter extends TokenFilter { public static final int DEFAULT_MAX_NGRAM_SIZE = 2; private int minGram, maxGram; - private LinkedList ngrams; + + private char[] curTermBuffer; + private int curTermLength; + private int curGramSize; + private int curPos; + + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; /** * Creates NGramTokenFilter with given min and max n-grams. @@ -50,7 +58,9 @@ public class NGramTokenFilter extends TokenFilter { } this.minGram = minGram; this.maxGram = maxGram; - this.ngrams = new LinkedList(); + + this.termAtt = (TermAttribute) addAttribute(TermAttribute.class); + this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); } /** @@ -62,40 +72,41 @@ public class NGramTokenFilter extends TokenFilter { } /** Returns the next token in the stream, or null at EOS. */ - public final Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - if (!ngrams.isEmpty()) { - return (Token)ngrams.removeFirst(); - } - - Token token = null; - - while (ngrams.isEmpty() && (token = input.next()) != null) { - ngram(token); - } - - if (token == null) { - return null; - } - - if (!ngrams.isEmpty()) { - return (Token)ngrams.removeFirst(); - } else { - return null; + public final boolean incrementToken() throws IOException { + while (true) { + if (curTermBuffer == null) { + if (!input.incrementToken()) { + return false; + } else { + curTermBuffer = (char[]) termAtt.termBuffer().clone(); + curTermLength = termAtt.termLength(); + curGramSize = minGram; + curPos = 0; + } + } + while (curGramSize <= maxGram) { + while (curPos+curGramSize <= curTermLength) { // while there is input + termAtt.setTermBuffer(curTermBuffer, curPos, curGramSize); + offsetAtt.setOffset(curPos, curPos+curGramSize); + curPos++; + return true; + } + curGramSize++; // increase n-gram size + curPos = 0; + } + curTermBuffer = null; } } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. 
*/ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); + } - private void ngram(Token token) { - char[] termBuffer = token.termBuffer(); - int termLength = token.termLength(); - int gramSize = minGram; - while (gramSize <= maxGram) { - int pos = 0; // reset to beginning of string - while (pos+gramSize <= termLength) { // while there is input - ngrams.add(token.clone(termBuffer, pos, gramSize, pos, pos+gramSize)); - pos++; - } - gramSize++; // increase n-gram size - } + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next() throws java.io.IOException { + return super.next(); } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java index 9bfb4d309e7..72f7d8be36f 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java @@ -19,6 +19,8 @@ package org.apache.lucene.analysis.ngram; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import java.io.IOException; import java.io.Reader; @@ -36,6 +38,9 @@ public class NGramTokenizer extends Tokenizer { private int inLen; private String inStr; private boolean started = false; + + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; /** * Creates NGramTokenizer with given min and max n-grams. @@ -53,6 +58,9 @@ public class NGramTokenizer extends Tokenizer { } this.minGram = minGram; this.maxGram = maxGram; + + this.termAtt = (TermAttribute) addAttribute(TermAttribute.class); + this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); } /** * Creates NGramTokenizer with default min and max n-grams. @@ -63,8 +71,7 @@ public class NGramTokenizer extends Tokenizer { } /** Returns the next token in the stream, or null at EOS. */ - public final Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; + public final boolean incrementToken() throws IOException { if (!started) { started = true; gramSize = minGram; @@ -78,13 +85,27 @@ public class NGramTokenizer extends Tokenizer { pos = 0; // reset to beginning of string gramSize++; // increase n-gram size if (gramSize > maxGram) // we are done - return null; + return false; if (pos+gramSize > inLen) - return null; + return false; } int oldPos = pos; pos++; - return reusableToken.reinit(inStr, oldPos, gramSize, input.correctOffset(oldPos), input.correctOffset(oldPos+gramSize)); + termAtt.setTermBuffer(inStr, oldPos, gramSize); + offsetAtt.setOffset(input.correctOffset(oldPos), input.correctOffset(oldPos+gramSize)); + return true; + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. 
*/ + public final Token next() throws java.io.IOException { + return super.next(); } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java index bacc5eec667..037ee028011 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java @@ -17,15 +17,15 @@ package org.apache.lucene.analysis.nl; * limitations under the License. */ -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; - import java.io.IOException; import java.util.HashMap; import java.util.HashSet; -import java.util.Set; import java.util.Map; +import java.util.Set; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * A filter that stems Dutch words. It supports a table of words that should @@ -39,10 +39,13 @@ public final class DutchStemFilter extends TokenFilter { */ private DutchStemmer stemmer = null; private Set exclusions = null; + + private TermAttribute termAtt; public DutchStemFilter(TokenStream _in) { super(_in); stemmer = new DutchStemmer(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** @@ -62,24 +65,23 @@ public final class DutchStemFilter extends TokenFilter { } /** - * @return Returns the next token in the stream, or null at EOS + * Returns the next token in the stream, or null at EOS */ - public Token next(Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken == null) - return null; + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String term = termAtt.term(); - String term = nextToken.term(); - - // Check the exclusion table. - if (exclusions == null || !exclusions.contains(term)) { - String s = stemmer.stem(term); - // If not stemmed, don't waste the time adjusting the token. - if ((s != null) && !s.equals(term)) - nextToken.setTermBuffer(s); + // Check the exclusion table. + if (exclusions == null || !exclusions.contains(term)) { + String s = stemmer.stem(term); + // If not stemmed, don't waste the time adjusting the token. + if ((s != null) && !s.equals(term)) + termAtt.setTermBuffer(s); + } + return true; + } else { + return false; } - return nextToken; } /** diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java index f4931153506..ab022c2297f 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java @@ -16,14 +16,13 @@ package org.apache.lucene.analysis.payloads; * limitations under the License. 
*/ +import java.io.IOException; + import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import java.io.IOException; - /** * Characters before the delimiter are the "token", those after are the payload. @@ -37,7 +36,7 @@ import java.io.IOException; * * @see PayloadEncoder */ -public class DelimitedPayloadTokenFilter extends TokenFilter { +public final class DelimitedPayloadTokenFilter extends TokenFilter { public static final char DEFAULT_DELIMITER = '|'; protected char delimiter = DEFAULT_DELIMITER; protected TermAttribute termAtt; @@ -83,27 +82,4 @@ public class DelimitedPayloadTokenFilter extends TokenFilter { } return result; } - - - public Token next(Token reusableToken) throws IOException { - Token result = input.next(reusableToken); - if (result != null) { - final char[] buffer = result.termBuffer(); - final int length = result.termLength(); - boolean seen = false; - for (int i = 0; i < length; i++) { - if (buffer[i] == delimiter) { - result.setTermBuffer(buffer, 0, i); - result.setPayload(encoder.encode(buffer, i + 1, (length - (i + 1)))); - seen = true; - break;//at this point, we know the whole piece, so we can exit. If we don't see the delimiter, then the termAtt is the same - } - } - if (seen == false) { - //no delimiter - payAtt.setPayload(null); - } - } - return result; - } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java index 2e796492448..7999ca0b7c6 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java @@ -20,6 +20,8 @@ package org.apache.lucene.analysis.payloads; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.index.Payload; import java.io.IOException; @@ -34,19 +36,37 @@ public class NumericPayloadTokenFilter extends TokenFilter { private String typeMatch; private Payload thePayload; + private PayloadAttribute payloadAtt; + private TypeAttribute typeAtt; + public NumericPayloadTokenFilter(TokenStream input, float payload, String typeMatch) { super(input); //Need to encode the payload thePayload = new Payload(PayloadHelper.encodeFloat(payload)); this.typeMatch = typeMatch; + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); } - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken != null && nextToken.type().equals(typeMatch)){ - nextToken.setPayload(thePayload); + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (typeAtt.type().equals(typeMatch)) + payloadAtt.setPayload(thePayload); + return true; + } else { + return false; } - return nextToken; + } + + /** @deprecated Will be removed in Lucene 3.0. 
This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next() throws java.io.IOException { + return super.next(); } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java index a3a56d12506..76add35b780 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java @@ -17,13 +17,15 @@ package org.apache.lucene.analysis.payloads; */ +import java.io.IOException; + import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.index.Payload; -import java.io.IOException; - /** * Adds the {@link org.apache.lucene.analysis.Token#setStartOffset(int)} @@ -32,22 +34,37 @@ import java.io.IOException; * **/ public class TokenOffsetPayloadTokenFilter extends TokenFilter { - + protected OffsetAttribute offsetAtt; + protected PayloadAttribute payAtt; public TokenOffsetPayloadTokenFilter(TokenStream input) { super(input); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + payAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); } - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken != null){ + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { byte[] data = new byte[8]; - PayloadHelper.encodeInt(nextToken.startOffset(), data, 0); - PayloadHelper.encodeInt(nextToken.endOffset(), data, 4); + PayloadHelper.encodeInt(offsetAtt.startOffset(), data, 0); + PayloadHelper.encodeInt(offsetAtt.endOffset(), data, 4); Payload payload = new Payload(data); - nextToken.setPayload(payload); + payAtt.setPayload(payload); + return true; + } else { + return false; } - return nextToken; + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. 
*/ + public final Token next() throws java.io.IOException { + return super.next(); } } \ No newline at end of file diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java index 19b191b6974..bd26e536e57 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java @@ -20,6 +20,8 @@ package org.apache.lucene.analysis.payloads; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.index.Payload; import java.io.IOException; @@ -32,19 +34,37 @@ import java.io.IOException; * **/ public class TypeAsPayloadTokenFilter extends TokenFilter { + private PayloadAttribute payloadAtt; + private TypeAttribute typeAtt; public TypeAsPayloadTokenFilter(TokenStream input) { super(input); - + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); } - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken != null && nextToken.type() != null && nextToken.type().equals("") == false){ - nextToken.setPayload(new Payload(nextToken.type().getBytes("UTF-8"))); + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String type = typeAtt.type(); + if (type != null && type.equals("") == false) { + payloadAtt.setPayload(new Payload(type.getBytes("UTF-8"))); + } + return true; + } else { + return false; } - return nextToken; + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next() throws java.io.IOException { + return super.next(); } } \ No newline at end of file diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java index d1cc144f14e..6eb66372007 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java @@ -22,6 +22,7 @@ import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; /** Set the positionIncrement of all tokens to the "positionIncrement", * except the first return token which retains its original positionIncrement value. 
@@ -34,6 +35,8 @@ public class PositionFilter extends TokenFilter { /** The first token must have non-zero positionIncrement **/ private boolean firstTokenPositioned = false; + + private PositionIncrementAttribute posIncrAtt; /** * Constructs a PositionFilter that assigns a position increment of zero to @@ -43,6 +46,7 @@ public class PositionFilter extends TokenFilter { */ public PositionFilter(final TokenStream input) { super(input); + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); } /** @@ -58,18 +62,29 @@ public class PositionFilter extends TokenFilter { this.positionIncrement = positionIncrement; } - public Token next(Token reusableToken) throws IOException { - - assert reusableToken != null; - reusableToken = input.next(reusableToken); - if (null != reusableToken) { + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { if (firstTokenPositioned) { - reusableToken.setPositionIncrement(positionIncrement); + posIncrAtt.setPositionIncrement(positionIncrement); } else { firstTokenPositioned = true; } + return true; + } else { + return false; } - return reusableToken; + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next() throws java.io.IOException { + return super.next(); } public void reset() throws IOException { diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java index 6a07a70eba6..90dc8812446 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java @@ -19,7 +19,7 @@ package org.apache.lucene.analysis.reverse; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import java.io.IOException; @@ -30,16 +30,20 @@ import java.io.IOException; */ public final class ReverseStringFilter extends TokenFilter { + private TermAttribute termAtt; + public ReverseStringFilter(TokenStream in) { super(in); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } - public final Token next(Token in) throws IOException { - assert in != null; - Token token=input.next(in); - if( token == null ) return null; - reverse( token.termBuffer(), token.termLength() ); - return token; + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + reverse( termAtt.termBuffer(), termAtt.termLength() ); + return true; + } else { + return false; + } } public static String reverse( final String input ){ diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java index d27e1800f9f..cd54f0b5712 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java +++ 
b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java @@ -17,9 +17,12 @@ package org.apache.lucene.analysis.ru; * limitations under the License. */ +import java.io.IOException; + import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Normalizes token text to lower case, analyzing given ("russian") charset. @@ -31,26 +34,27 @@ public final class RussianLowerCaseFilter extends TokenFilter { char[] charset; + private TermAttribute termAtt; + public RussianLowerCaseFilter(TokenStream in, char[] charset) { super(in); this.charset = charset; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } - public final Token next(final Token reusableToken) throws java.io.IOException + public final boolean incrementToken() throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - - if (nextToken == null) - return null; - - char[] chArray = nextToken.termBuffer(); - int chLen = nextToken.termLength(); + if (input.incrementToken()) { + char[] chArray = termAtt.termBuffer(); + int chLen = termAtt.termLength(); for (int i = 0; i < chLen; i++) { - chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset); + chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset); } - return nextToken; + return true; + } else { + return false; + } } } diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java index f39eea92444..ab87c2b2ea0 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java @@ -20,6 +20,8 @@ package org.apache.lucene.analysis.ru; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + import java.io.IOException; /** @@ -37,29 +39,32 @@ public final class RussianStemFilter extends TokenFilter */ private RussianStemmer stemmer = null; + private TermAttribute termAtt; + public RussianStemFilter(TokenStream in, char[] charset) { super(in); stemmer = new RussianStemmer(charset); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** - * @return Returns the next token in the stream, or null at EOS + * Returns the next token in the stream, or null at EOS */ - public final Token next(final Token reusableToken) throws IOException + public final boolean incrementToken() throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken == null) - return null; - - String term = nextToken.term(); + if (input.incrementToken()) { + String term = termAtt.term(); String s = stemmer.stem(term); if (s != null && !s.equals(term)) - nextToken.setTermBuffer(s); - return nextToken; + termAtt.setTermBuffer(s); + return true; + } else { + return false; + } } + /** * Set a alternative/custom RussianStemmer for this filter. 
*/ diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java index 0a5f99bcbac..055a0b1674e 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java @@ -22,6 +22,9 @@ import java.lang.Character.UnicodeBlock; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + import java.text.BreakIterator; /** @@ -32,46 +35,62 @@ import java.text.BreakIterator; public class ThaiWordFilter extends TokenFilter { private BreakIterator breaker = null; - private Token thaiToken = null; + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; + + private State thaiState = null; + public ThaiWordFilter(TokenStream input) { super(input); breaker = BreakIterator.getWordInstance(new Locale("th")); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); } - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - if (thaiToken != null) { + public final boolean incrementToken() throws IOException { + if (thaiState != null) { int start = breaker.current(); int end = breaker.next(); if (end != BreakIterator.DONE) { - reusableToken.reinit(thaiToken, thaiToken.termBuffer(), start, end - start); - reusableToken.setStartOffset(thaiToken.startOffset()+start); - reusableToken.setEndOffset(thaiToken.startOffset()+end); - return reusableToken; + restoreState(thaiState); + termAtt.setTermBuffer(termAtt.termBuffer(), start, end - start); + offsetAtt.setOffset(offsetAtt.startOffset() + start, offsetAtt.startOffset() + end); + return true; } - thaiToken = null; + thaiState = null; } - Token nextToken = input.next(reusableToken); - if (nextToken == null || nextToken.termLength() == 0) { - return null; - } + if (input.incrementToken() == false || termAtt.termLength() == 0) + return false; - String text = nextToken.term(); + String text = termAtt.term(); if (UnicodeBlock.of(text.charAt(0)) != UnicodeBlock.THAI) { - nextToken.setTermBuffer(text.toLowerCase()); - return nextToken; + termAtt.setTermBuffer(text.toLowerCase()); + return true; } + + thaiState = captureState(); - thaiToken = (Token) nextToken.clone(); breaker.setText(text); int end = breaker.next(); if (end != BreakIterator.DONE) { - nextToken.setTermBuffer(text, 0, end); - nextToken.setEndOffset(nextToken.startOffset() + end); - return nextToken; + termAtt.setTermBuffer(text, 0, end); + offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset() + end); + return true; } - return null; + return false; + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. 
*/ + public final Token next() throws java.io.IOException { + return super.next(); } } diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java index bbebe979a0c..99d170eb31e 100644 --- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java +++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java @@ -17,18 +17,12 @@ package org.apache.lucene.analysis.ar; * limitations under the License. */ -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; import java.io.IOException; -import java.io.InputStreamReader; import java.io.StringReader; import junit.framework.TestCase; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.WhitespaceTokenizer; -import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Test the Arabic Normalization Filter @@ -95,11 +89,10 @@ public class TestArabicNormalizationFilter extends TestCase { private void check(final String input, final String expected) throws IOException { ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input)); ArabicNormalizationFilter filter = new ArabicNormalizationFilter(tokenStream); - final Token reusableToken = new Token(); - Token nextToken = filter.next(reusableToken); - if (nextToken == null) - fail(); - assertEquals(expected, nextToken.term()); + TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class); + + assertTrue(filter.incrementToken()); + assertEquals(expected, termAtt.term()); filter.close(); } diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java index 01dc5449ade..9e4bcfdf53b 100644 --- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java +++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java @@ -17,17 +17,12 @@ package org.apache.lucene.analysis.ar; * limitations under the License. 
*/ -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; import java.io.IOException; -import java.io.InputStreamReader; import java.io.StringReader; import junit.framework.TestCase; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Test the Arabic Normalization Filter @@ -118,11 +113,10 @@ public class TestArabicStemFilter extends TestCase { private void check(final String input, final String expected) throws IOException { ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input)); ArabicStemFilter filter = new ArabicStemFilter(tokenStream); - final Token reusableToken = new Token(); - Token nextToken = filter.next(reusableToken); - if (nextToken == null) - fail(); - assertEquals(expected, nextToken.term()); + TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class); + + assertTrue(filter.incrementToken()); + assertEquals(expected, termAtt.term()); filter.close(); } diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java index 9c0fdc36f23..e1c9062425f 100644 --- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java +++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java @@ -23,8 +23,8 @@ import java.io.StringReader; import junit.framework.TestCase; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Test the Brazilian Stem Filter, which only modifies the term text. 
@@ -122,12 +122,10 @@ public class TestBrazilianStemmer extends TestCase { private void check(final String input, final String expected) throws IOException { Analyzer analyzer = new BrazilianAnalyzer(); TokenStream stream = analyzer.tokenStream("dummy", new StringReader(input)); - final Token reusableToken = new Token(); - Token nextToken = stream.next(reusableToken); - if (nextToken == null) - fail(); - assertEquals(expected, nextToken.term()); - assertTrue(stream.next(nextToken) == null); + TermAttribute text = (TermAttribute) stream.getAttribute(TermAttribute.class); + assertTrue(stream.incrementToken()); + assertEquals(expected, text.term()); + assertFalse(stream.incrementToken()); stream.close(); } diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java index 36268792d10..c15ea48d964 100644 --- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java +++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java @@ -21,50 +21,49 @@ import java.io.IOException; import java.io.StringReader; import junit.framework.TestCase; -import org.apache.lucene.analysis.Token; + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; public class TestCJKTokenizer extends TestCase{ + + class TestToken { + String termText; + int start; + int end; + String type; + } - public Token newToken(String termText, int start, int end, int type) { - Token token = new Token(start, end); - token.setTermBuffer(termText); - token.setType(CJKTokenizer.TOKEN_TYPE_NAMES[type]); + public TestToken newToken(String termText, int start, int end, int type) { + TestToken token = new TestToken(); + token.termText = termText; + token.type = CJKTokenizer.TOKEN_TYPE_NAMES[type]; + token.start = start; + token.end = end; return token; } - public void checkCJKToken(final String str, final Token[] out_tokens) throws IOException { + public void checkCJKToken(final String str, final TestToken[] out_tokens) throws IOException { CJKTokenizer tokenizer = new CJKTokenizer(new StringReader(str)); - int i = 0; - System.out.println("string[" + str + "]"); - System.out.print("tokens["); - final Token reusableToken = new Token(); - for (Token token = tokenizer.next(reusableToken) ; - token != null ; - token = tokenizer.next(reusableToken) ) { - if (token.term().equals(out_tokens[i].term()) - && token.startOffset() == out_tokens[i].startOffset() - && token.endOffset() == out_tokens[i].endOffset() - && token.type().equals(out_tokens[i].type()) ) { - System.out.print( token.term() + " "); - } - else { - fail(token.term() + " (start: " + token.startOffset() - + " end: " + token.endOffset() + " type: " + token.type() + ") != " - + out_tokens[i].term() + " (start: " + out_tokens[i].startOffset() - + " end: " + out_tokens[i].endOffset() - + " type: " + out_tokens[i].type() + ")"); - break; - } - ++i; + TermAttribute termAtt = (TermAttribute) tokenizer.getAttribute(TermAttribute.class); + OffsetAttribute offsetAtt = (OffsetAttribute) tokenizer.getAttribute(OffsetAttribute.class); + TypeAttribute typeAtt = (TypeAttribute) tokenizer.getAttribute(TypeAttribute.class); + for (int i = 0; i < out_tokens.length; i++) { + assertTrue(tokenizer.incrementToken()); + assertEquals(termAtt.term(), out_tokens[i].termText); + 
assertEquals(offsetAtt.startOffset(), out_tokens[i].start); + assertEquals(offsetAtt.endOffset(), out_tokens[i].end); + assertEquals(typeAtt.type(), out_tokens[i].type); } - System.out.println("]" + System.getProperty("line.separator")); + assertFalse(tokenizer.incrementToken()); } public void testJa1() throws IOException { String str = "\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341"; - Token[] out_tokens = { + TestToken[] out_tokens = { newToken("\u4e00\u4e8c", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE), newToken("\u4e8c\u4e09", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE), newToken("\u4e09\u56db", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE), @@ -81,7 +80,7 @@ public class TestCJKTokenizer extends TestCase{ public void testJa2() throws IOException { String str = "\u4e00 \u4e8c\u4e09\u56db \u4e94\u516d\u4e03\u516b\u4e5d \u5341"; - Token[] out_tokens = { + TestToken[] out_tokens = { newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE), newToken("\u4e8c\u4e09", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE), newToken("\u4e09\u56db", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE), @@ -97,7 +96,7 @@ public class TestCJKTokenizer extends TestCase{ public void testC() throws IOException { String str = "abc defgh ijklmn opqrstu vwxy z"; - Token[] out_tokens = { + TestToken[] out_tokens = { newToken("abc", 0, 3, CJKTokenizer.SINGLE_TOKEN_TYPE), newToken("defgh", 4, 9, CJKTokenizer.SINGLE_TOKEN_TYPE), newToken("ijklmn", 10, 16, CJKTokenizer.SINGLE_TOKEN_TYPE), @@ -111,7 +110,7 @@ public class TestCJKTokenizer extends TestCase{ public void testMix() throws IOException { String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053"; - Token[] out_tokens = { + TestToken[] out_tokens = { newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE), newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE), newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE), @@ -128,7 +127,7 @@ public class TestCJKTokenizer extends TestCase{ public void testMix2() throws IOException { String str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053"; - Token[] out_tokens = { + TestToken[] out_tokens = { newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE), newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE), newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE), @@ -147,7 +146,7 @@ public class TestCJKTokenizer extends TestCase{ public void testSingleChar() throws IOException { String str = "\u4e00"; - Token[] out_tokens = { + TestToken[] out_tokens = { newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE), }; checkCJKToken(str, out_tokens); diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java index 2990f40cda1..32417f26ce6 100644 --- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java +++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java @@ -22,7 +22,7 @@ import java.io.StringReader; import junit.framework.TestCase; -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; public class TestChineseTokenizer extends TestCase @@ -34,12 +34,12 @@ public class TestChineseTokenizer extends TestCase int correctStartOffset = 0; int correctEndOffset = 1; - final Token reusableToken = new Token(); - for (Token nextToken = tokenizer.next(reusableToken); nextToken != null; nextToken = 
tokenizer.next(reusableToken)) { - assertEquals(correctStartOffset, nextToken.startOffset()); - assertEquals(correctEndOffset, nextToken.endOffset()); - correctStartOffset++; - correctEndOffset++; + OffsetAttribute offsetAtt = (OffsetAttribute) tokenizer.getAttribute(OffsetAttribute.class); + while (tokenizer.incrementToken()) { + assertEquals(correctStartOffset, offsetAtt.startOffset()); + assertEquals(correctEndOffset, offsetAtt.endOffset()); + correctStartOffset++; + correctEndOffset++; } } } diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java index 581d47ebf52..d51edb064c6 100644 --- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java +++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java @@ -31,15 +31,14 @@ import java.util.List; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; -import org.apache.lucene.analysis.Token; +import junit.framework.TestCase; + import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.WhitespaceTokenizer; -import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase; -import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter; -import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter; import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; - -import junit.framework.TestCase; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; public class TestCompoundWordTokenFilter extends TestCase { private static String[] locations = { @@ -155,16 +154,18 @@ public class TestCompoundWordTokenFilter extends TestCase { private void assertFiltersTo(TokenFilter tf, String[] s, int[] startOffset, int[] endOffset, int[] posIncr) throws Exception { - final Token reusableToken = new Token(); + TermAttribute termAtt = (TermAttribute) tf.getAttribute(TermAttribute.class); + OffsetAttribute offsetAtt = (OffsetAttribute) tf.getAttribute(OffsetAttribute.class); + PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) tf.getAttribute(PositionIncrementAttribute.class); + for (int i = 0; i < s.length; ++i) { - Token nextToken = tf.next(reusableToken); - assertNotNull(nextToken); - assertEquals(s[i], nextToken.term()); - assertEquals(startOffset[i], nextToken.startOffset()); - assertEquals(endOffset[i], nextToken.endOffset()); - assertEquals(posIncr[i], nextToken.getPositionIncrement()); + assertTrue(tf.incrementToken()); + assertEquals(s[i], termAtt.term()); + assertEquals(startOffset[i], offsetAtt.startOffset()); + assertEquals(endOffset[i], offsetAtt.endOffset()); + assertEquals(posIncr[i], posIncAtt.getPositionIncrement()); } - assertNull(tf.next(reusableToken)); + assertFalse(tf.incrementToken()); } private void getHyphenationPatternFileContents() { diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java index 0848f522152..5460c95d5f7 100644 --- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java +++ 
b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java @@ -22,8 +22,8 @@ import java.io.StringReader; import junit.framework.TestCase; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Test the CzechAnalyzer @@ -39,13 +39,12 @@ public class TestCzechAnalyzer extends TestCase { private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception { TokenStream ts = a.tokenStream("dummy", new StringReader(input)); - final Token reusableToken = new Token(); + TermAttribute text = (TermAttribute) ts.getAttribute(TermAttribute.class); for (int i=0; i */ -public class SentenceTokenizer extends Tokenizer { +public final class SentenceTokenizer extends Tokenizer { /** * End of sentence punctuation: 。,!?;,!?; @@ -39,12 +41,19 @@ public class SentenceTokenizer extends Tokenizer { private final StringBuffer buffer = new StringBuffer(); private int tokenStart = 0, tokenEnd = 0; + + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; + private TypeAttribute typeAtt; public SentenceTokenizer(Reader reader) { super(reader); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); } - public Token next(final Token reusableToken) throws IOException { + public boolean incrementToken() throws IOException { buffer.setLength(0); int ci; char ch, pch; @@ -83,11 +92,12 @@ public class SentenceTokenizer extends Tokenizer { } } if (buffer.length() == 0) - return null; + return false; else { - reusableToken.clear(); - reusableToken.reinit(buffer.toString(), input.correctOffset(tokenStart), input.correctOffset(tokenEnd), "sentence"); - return reusableToken; + termAtt.setTermBuffer(buffer.toString()); + offsetAtt.setOffset(input.correctOffset(tokenStart), input.correctOffset(tokenEnd)); + typeAtt.setType("sentence"); + return true; } } diff --git a/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java b/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java index 1a79ae0508a..db3d9deb2e6 100644 --- a/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java +++ b/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java @@ -20,7 +20,6 @@ package org.apache.lucene.analysis.cn.smart; import java.util.ArrayList; import java.util.List; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter; import org.apache.lucene.analysis.cn.smart.hhmm.SegToken; import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter; @@ -37,11 +36,11 @@ class WordSegmenter { /** * Segment a sentence into words with {@link HHMMSegmenter} * - * @param sentenceToken sentence {@link Token} + * @param sentence input sentence + * @param startOffset start offset of sentence * @return {@link List} of {@link SegToken} */ - public List segmentSentence(Token sentenceToken) { - String sentence = sentenceToken.term(); + public List segmentSentence(String sentence, int startOffset) { List segTokenList = hhmmSegmenter.process(sentence); @@ -49,25 +48,25 @@ class WordSegmenter { // tokens from sentence, excluding WordType.SENTENCE_BEGIN and WordType.SENTENCE_END for (int i = 1; i < segTokenList.size() - 1; i++) { - 
result.add(convertSegToken((SegToken) segTokenList.get(i), sentence, - sentenceToken.startOffset(), "word")); + result.add(convertSegToken((SegToken) segTokenList.get(i), sentence, startOffset)); } return result; } /** - * Convert a {@link SegToken} to a Lucene {@link Token} + * Process a {@link SegToken} so that it is ready for indexing. + * + * This method calculates offsets and normalizes the token with {@link SegTokenFilter}. * * @param st input {@link SegToken} * @param sentence associated Sentence * @param sentenceStartOffset offset into sentence - * @param type token type, default is word - * @return Lucene {@link Token} + * @return Lucene {@link SegToken} */ - public Token convertSegToken(SegToken st, String sentence, - int sentenceStartOffset, String type) { - Token result; + public SegToken convertSegToken(SegToken st, String sentence, + int sentenceStartOffset) { + switch (st.wordType) { case WordType.STRING: case WordType.NUMBER: @@ -81,9 +80,8 @@ class WordSegmenter { } st = tokenFilter.filter(st); - - result = new Token(st.charArray, 0, st.charArray.length, st.startOffset - + sentenceStartOffset, st.endOffset + sentenceStartOffset); - return result; + st.startOffset += sentenceStartOffset; + st.endOffset += sentenceStartOffset; + return st; } } diff --git a/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java b/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java index df9fc845465..5882375e990 100644 --- a/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java +++ b/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java @@ -21,20 +21,27 @@ import java.io.IOException; import java.util.Iterator; import java.util.List; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.cn.smart.hhmm.SegToken; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; /** * A {@link TokenFilter} that breaks sentences into words. */ -public class WordTokenFilter extends TokenFilter { +public final class WordTokenFilter extends TokenFilter { private WordSegmenter wordSegmenter; private Iterator tokenIter; private List tokenBuffer; + + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; + private TypeAttribute typeAtt; /** * Construct a new WordTokenizer. @@ -44,32 +51,34 @@ public class WordTokenFilter extends TokenFilter { public WordTokenFilter(TokenStream in) { super(in); this.wordSegmenter = new WordSegmenter(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); } - - public Token next(final Token reusableSentenceToken) throws IOException { - if (tokenIter != null && tokenIter.hasNext()) - return (Token) tokenIter.next(); - else { - Token nextToken = input.next(reusableSentenceToken); - if (processNextSentence(nextToken)) { - return (Token) tokenIter.next(); - } else - return null; - } - } - - /** - * Process the next input sentence, placing tokens into tokenBuffer - * - * @param reusableSentenceToken input sentence - * @return true if more tokens were placed into tokenBuffer. 
- * @throws IOException - */ - private boolean processNextSentence(final Token reusableSentenceToken) throws IOException { - if (reusableSentenceToken == null) - return false; - tokenBuffer = wordSegmenter.segmentSentence(reusableSentenceToken); - tokenIter = tokenBuffer.iterator(); - return tokenBuffer != null && tokenIter.hasNext(); + + public boolean incrementToken() throws IOException { + if (tokenIter == null || !tokenIter.hasNext()) { + // there are no remaining tokens from the current sentence... are there more sentences? + if (input.incrementToken()) { + // a new sentence is available: process it. + tokenBuffer = wordSegmenter.segmentSentence(termAtt.term(), offsetAtt.startOffset()); + tokenIter = tokenBuffer.iterator(); + /* + * it should not be possible to have a sentence with 0 words, check just in case. + * returning EOS isn't the best either, but its the behavior of the original code. + */ + if (!tokenIter.hasNext()) + return false; + } else { + return false; // no more sentences, end of stream! + } + } + + // There are remaining tokens from the current sentence, return the next one. + SegToken nextWord = (SegToken) tokenIter.next(); + termAtt.setTermBuffer(nextWord.charArray, 0, nextWord.charArray.length); + offsetAtt.setOffset(nextWord.startOffset, nextWord.endOffset); + typeAtt.setType("word"); + return true; } } diff --git a/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java b/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java index b267328a5f1..732f7f1fac2 100644 --- a/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java +++ b/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java @@ -29,6 +29,9 @@ import junit.framework.TestCase; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; public class TestSmartChineseAnalyzer extends TestCase { @@ -108,22 +111,23 @@ public class TestSmartChineseAnalyzer extends TestCase { public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[]) throws Exception { - TokenStream ts = a.tokenStream("dummy", new StringReader(input)); - final Token reusableToken = new Token(); - for (int i = 0; i < output.length; i++) { - Token nextToken = ts.next(reusableToken); - assertNotNull(nextToken); - assertEquals(nextToken.term(), output[i]); + TokenStream ts = a.tokenStream("dummy", new StringReader(input)); + TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); + OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class); + TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class); + for (int i = 0; i < output.length; i++) { + assertTrue(ts.incrementToken()); + assertEquals(termAtt.term(), output[i]); if (startOffsets != null) - assertEquals(nextToken.startOffset(), startOffsets[i]); + assertEquals(offsetAtt.startOffset(), startOffsets[i]); if (endOffsets != null) - assertEquals(nextToken.endOffset(), endOffsets[i]); + assertEquals(offsetAtt.endOffset(), endOffsets[i]); if (types != null) - assertEquals(nextToken.type(), types[i]); + assertEquals(typeAtt.type(), types[i]); + } + 
assertFalse(ts.incrementToken()); + ts.close(); } - assertNull(ts.next(reusableToken)); - ts.close(); -} public void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception { assertAnalyzesTo(a, input, output, null, null, null); diff --git a/contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java b/contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java index 41aae62ab7d..54d9bc33d9d 100644 --- a/contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java +++ b/contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java @@ -21,6 +21,7 @@ package org.apache.lucene.collation; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.IndexableBinaryStringTools; import java.io.IOException; @@ -73,8 +74,9 @@ import java.text.Collator; * {@link ICUCollationKeyFilter} on the query side, or vice versa. *
*/ -public class CollationKeyFilter extends TokenFilter { +public final class CollationKeyFilter extends TokenFilter { private Collator collator = null; + private TermAttribute termAtt; /** * @param input Source token stream @@ -83,25 +85,26 @@ public class CollationKeyFilter extends TokenFilter { public CollationKeyFilter(TokenStream input, Collator collator) { super(input); this.collator = collator; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } - public final Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken != null) { - char[] termBuffer = nextToken.termBuffer(); - String termText = new String(termBuffer, 0, nextToken.termLength()); + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + char[] termBuffer = termAtt.termBuffer(); + String termText = new String(termBuffer, 0, termAtt.termLength()); byte[] collationKey = collator.getCollationKey(termText).toByteArray(); ByteBuffer collationKeyBuf = ByteBuffer.wrap(collationKey); int encodedLength = IndexableBinaryStringTools.getEncodedLength(collationKeyBuf); if (encodedLength > termBuffer.length) { - nextToken.resizeTermBuffer(encodedLength); + termAtt.resizeTermBuffer(encodedLength); } - nextToken.setTermLength(encodedLength); - CharBuffer wrappedTermBuffer = CharBuffer.wrap(nextToken.termBuffer()); + termAtt.setTermLength(encodedLength); + CharBuffer wrappedTermBuffer = CharBuffer.wrap(termAtt.termBuffer()); IndexableBinaryStringTools.encode(collationKeyBuf, wrappedTermBuffer); + return true; + } else { + return false; } - return nextToken; } } diff --git a/contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java b/contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java index 27abe24c2e0..1bd4a510b67 100644 --- a/contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java +++ b/contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java @@ -24,6 +24,7 @@ import com.ibm.icu.text.RawCollationKey; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.IndexableBinaryStringTools; import java.io.IOException; @@ -69,9 +70,10 @@ import java.nio.CharBuffer; * java.text.Collator over several languages. *
*/ -public class ICUCollationKeyFilter extends TokenFilter { +public final class ICUCollationKeyFilter extends TokenFilter { private Collator collator = null; private RawCollationKey reusableKey = new RawCollationKey(); + private TermAttribute termAtt; /** * @@ -81,25 +83,26 @@ public class ICUCollationKeyFilter extends TokenFilter { public ICUCollationKeyFilter(TokenStream input, Collator collator) { super(input); this.collator = collator; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } - public final Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken != null) { - char[] termBuffer = nextToken.termBuffer(); - String termText = new String(termBuffer, 0, nextToken.termLength()); + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + char[] termBuffer = termAtt.termBuffer(); + String termText = new String(termBuffer, 0, termAtt.termLength()); collator.getRawCollationKey(termText, reusableKey); ByteBuffer collationKeyBuf = ByteBuffer.wrap(reusableKey.bytes, 0, reusableKey.size); int encodedLength = IndexableBinaryStringTools.getEncodedLength(collationKeyBuf); if (encodedLength > termBuffer.length) { - nextToken.resizeTermBuffer(encodedLength); + termAtt.resizeTermBuffer(encodedLength); } - nextToken.setTermLength(encodedLength); - CharBuffer wrappedTermBuffer = CharBuffer.wrap(nextToken.termBuffer()); + termAtt.setTermLength(encodedLength); + CharBuffer wrappedTermBuffer = CharBuffer.wrap(termAtt.termBuffer()); IndexableBinaryStringTools.encode(collationKeyBuf, wrappedTermBuffer); + return true; + } else { + return false; } - return nextToken; } } diff --git a/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java b/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java index 9e6ee39aa73..1cdfc8cd271 100644 --- a/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java +++ b/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java @@ -28,6 +28,8 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; @@ -193,11 +195,15 @@ public abstract class AbstractTestCase extends TestCase { ch = 0; } - public Token next( Token reusableToken ) throws IOException { + TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class); + OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + public boolean incrementToken() throws IOException { if( !getNextPartialSnippet() ) - return null; - reusableToken.reinit( snippet, startTerm, lenTerm, startOffset, startOffset + lenTerm ); - return reusableToken; + return false; + + termAtt.setTermBuffer(snippet, startTerm, lenTerm); + offsetAtt.setOffset(startOffset, startOffset + lenTerm); + return true; } public int getFinalOffset() { diff --git a/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java 
b/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java index cdabf127b25..f3634379630 100644 --- a/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java +++ b/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java @@ -295,14 +295,21 @@ public class IndexTimeSynonymTest extends AbstractTestCase { public TokenArrayAnalyzer( Token... tokens ){ this.tokens = tokens; } + public TokenStream tokenStream(String fieldName, Reader reader) { - return new TokenStream(){ + final Token reusableToken = new Token(); + + TokenStream.setOnlyUseNewAPI(true); + TokenStream ts = new TokenStream(){ int p = 0; - public Token next( Token reusableToken ) throws IOException { - if( p >= tokens.length ) return null; - return tokens[p++]; + public boolean incrementToken() throws IOException { + if( p >= tokens.length ) return false; + tokens[p++].copyTo(reusableToken); + return true; } }; + ts.addAttributeImpl(reusableToken); + return ts; } } } diff --git a/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java b/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java index 62470a25ac2..496fba45e63 100644 --- a/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java +++ b/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java @@ -27,6 +27,7 @@ import junit.framework.TestCase; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; @@ -44,6 +45,7 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocCollector; import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.util.AttributeSource; /** * Asserts equality of content and behaviour of two index readers. @@ -175,23 +177,26 @@ public class TestIndicesEquals extends TestCase { t.setPayload(new Payload(new byte[]{2})); tokens.add(t); tokens.add(createToken("fin", 7, 9)); - document.add(new Field("f", new TokenStream() { + final Token reusableToken = new Token(); + TokenStream ts = new TokenStream() { Iterator it = tokens.iterator(); - - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; + + public final boolean incrementToken() throws IOException { if (!it.hasNext()) { - return null; + return false; } - // Resettable token streams need to return clones. 
- Token nextToken = (Token) it.next(); - return (Token) nextToken.clone(); + + reusableToken.reinit(it.next()); + return true; } public void reset() throws IOException { it = tokens.iterator(); } - })); + }; + ts.addAttributeImpl(reusableToken); + + document.add(new Field("f", ts)); } } } diff --git a/contrib/lucli/src/java/lucli/LuceneMethods.java b/contrib/lucli/src/java/lucli/LuceneMethods.java index b12f1508b20..5430b34447d 100644 --- a/contrib/lucli/src/java/lucli/LuceneMethods.java +++ b/contrib/lucli/src/java/lucli/LuceneMethods.java @@ -75,6 +75,8 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; @@ -317,11 +319,14 @@ class LuceneMethods { int position = 0; // Tokenize field and add to postingTable TokenStream stream = analyzer.tokenStream(fieldName, reader); + TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class); + PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class); + try { - for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { - position += (nextToken.getPositionIncrement() - 1); + while (stream.incrementToken()) { + position += (posIncrAtt.getPositionIncrement() - 1); position++; - String name = nextToken.term(); + String name = termAtt.term(); Integer Count = (Integer) tokenMap.get(name); if (Count == null) { // not in there yet tokenMap.put(name, new Integer(1)); //first one diff --git a/contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java b/contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java index 6a9a5544cda..0ec2bcaf107 100644 --- a/contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java +++ b/contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java @@ -31,9 +31,13 @@ import java.util.regex.Pattern; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.PorterStemFilter; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.util.AttributeSource; /** * Various fulltext analysis utilities avoiding redundant code in several @@ -71,21 +75,24 @@ public class AnalyzerUtil { public TokenStream tokenStream(final String fieldName, Reader reader) { return new TokenFilter(child.tokenStream(fieldName, reader)) { private int position = -1; - - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); // from filter super class - log.println(toString(nextToken)); - return nextToken; + private TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class); + private PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) 
addAttribute(PositionIncrementAttribute.class); + private OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + private TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); + + public boolean incrementToken() throws IOException { + boolean hasNext = input.incrementToken(); + log.println(toString(hasNext)); + return hasNext; } - private String toString(Token token) { - if (token == null) return "[" + logName + ":EOS:" + fieldName + "]\n"; + private String toString(boolean hasNext) { + if (!hasNext) return "[" + logName + ":EOS:" + fieldName + "]\n"; - position += token.getPositionIncrement(); + position += posIncrAtt.getPositionIncrement(); return "[" + logName + ":" + position + ":" + fieldName + ":" - + token.term() + ":" + token.startOffset() - + "-" + token.endOffset() + ":" + token.type() + + termAtt.term() + ":" + offsetAtt.startOffset() + + "-" + offsetAtt.endOffset() + ":" + typeAtt.type() + "]"; } }; @@ -121,9 +128,8 @@ public class AnalyzerUtil { return new TokenFilter(child.tokenStream(fieldName, reader)) { private int todo = maxTokens; - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - return --todo >= 0 ? input.next(reusableToken) : null; + public boolean incrementToken() throws IOException { + return --todo >= 0 ? input.incrementToken() : false; } }; } @@ -240,11 +246,10 @@ public class AnalyzerUtil { final ArrayList tokens2 = new ArrayList(); TokenStream tokenStream = new TokenFilter(child.tokenStream(fieldName, reader)) { - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); // from filter super class - if (nextToken != null) tokens2.add(nextToken.clone()); - return nextToken; + public boolean incrementToken() throws IOException { + boolean hasNext = input.incrementToken(); + if (hasNext) tokens2.add(captureState()); + return hasNext; } }; @@ -255,10 +260,10 @@ public class AnalyzerUtil { private Iterator iter = tokens.iterator(); - public Token next(Token token) { - assert token != null; - if (!iter.hasNext()) return null; - return (Token) iter.next(); + public boolean incrementToken() { + if (!iter.hasNext()) return false; + restoreState((AttributeSource.State) iter.next()); + return true; } }; } @@ -302,13 +307,13 @@ public class AnalyzerUtil { // compute frequencies of distinct terms HashMap map = new HashMap(); TokenStream stream = analyzer.tokenStream("", new StringReader(text)); + TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class); try { - final Token reusableToken = new Token(); - for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { - MutableInteger freq = (MutableInteger) map.get(nextToken.term()); + while (stream.incrementToken()) { + MutableInteger freq = (MutableInteger) map.get(termAtt.term()); if (freq == null) { freq = new MutableInteger(1); - map.put(nextToken.term(), freq); + map.put(termAtt.term(), freq); } else { freq.setValue(freq.intValue() + 1); } diff --git a/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 186be49d0d4..9b4d1a512be 100644 --- a/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -28,8 +28,10 @@ import java.util.Iterator; import java.util.Map; import 
org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.index.IndexReader; @@ -274,18 +276,21 @@ public class MemoryIndex implements Serializable { return new TokenStream() { private Iterator iter = keywords.iterator(); private int start = 0; - public Token next(final Token reusableToken) { - assert reusableToken != null; - if (!iter.hasNext()) return null; + private TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class); + private OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + + public boolean incrementToken() { + if (!iter.hasNext()) return false; Object obj = iter.next(); if (obj == null) throw new IllegalArgumentException("keyword must not be null"); String term = obj.toString(); - reusableToken.reinit(term, start, start+reusableToken.termLength()); + termAtt.setTermBuffer(term); + offsetAtt.setOffset(start, start+termAtt.termLength()); start += term.length() + 1; // separate words by 1 (blank) character - return reusableToken; + return true; } }; } @@ -350,13 +355,17 @@ public class MemoryIndex implements Serializable { int numTokens = 0; int numOverlapTokens = 0; int pos = -1; - final Token reusableToken = new Token(); - for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { - String term = nextToken.term(); + + TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class); + PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class); + OffsetAttribute offsetAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class); + + while (stream.incrementToken()) { + String term = termAtt.term(); if (term.length() == 0) continue; // nothing to do // if (DEBUG) System.err.println("token='" + term + "'"); numTokens++; - final int posIncr = nextToken.getPositionIncrement(); + final int posIncr = posIncrAttribute.getPositionIncrement(); if (posIncr == 0) numOverlapTokens++; pos += posIncr; @@ -369,7 +378,7 @@ public class MemoryIndex implements Serializable { if (stride == 1) { positions.add(pos); } else { - positions.add(pos, nextToken.startOffset(), nextToken.endOffset()); + positions.add(pos, offsetAtt.startOffset(), offsetAtt.endOffset()); } } diff --git a/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java b/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java index f2bb2a01808..a48cba815ba 100644 --- a/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java +++ b/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java @@ -30,8 +30,9 @@ import java.util.regex.Pattern; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.StopAnalyzer; import org.apache.lucene.analysis.StopFilter; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Efficient Lucene analyzer/tokenizer that preferably operates on a String rather 
than a @@ -331,6 +332,8 @@ public class PatternAnalyzer extends Analyzer { private Matcher matcher; private int pos = 0; private static final Locale locale = Locale.getDefault(); + private TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class); + private OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) { this.str = str; @@ -338,9 +341,8 @@ public class PatternAnalyzer extends Analyzer { this.toLowerCase = toLowerCase; } - public Token next(final Token reusableToken) { - assert reusableToken != null; - if (matcher == null) return null; + public final boolean incrementToken() { + if (matcher == null) return false; while (true) { // loop takes care of leading and trailing boundary cases int start = pos; @@ -357,9 +359,11 @@ public class PatternAnalyzer extends Analyzer { if (start != end) { // non-empty match (header/trailer) String text = str.substring(start, end); if (toLowerCase) text = text.toLowerCase(locale); - return reusableToken.reinit(text, start, end); + termAtt.setTermBuffer(text); + offsetAtt.setOffset(start, end); + return true; } - if (!isMatch) return null; + if (!isMatch) return false; } } @@ -381,6 +385,8 @@ public class PatternAnalyzer extends Analyzer { private final boolean toLowerCase; private final Set stopWords; private static final Locale locale = Locale.getDefault(); + private TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class); + private OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set stopWords) { this.str = str; @@ -389,8 +395,7 @@ public class PatternAnalyzer extends Analyzer { this.stopWords = stopWords; } - public Token next(final Token reusableToken) { - assert reusableToken != null; + public boolean incrementToken() { // cache loop instance vars (performance) String s = str; int len = s.length(); @@ -430,9 +435,11 @@ public class PatternAnalyzer extends Analyzer { pos = i; if (text == null) { - return null; + return false; } - return reusableToken.reinit(text, start, i); + termAtt.setTermBuffer(text); + offsetAtt.setOffset(start, i); + return true; } private boolean isTokenChar(char c, boolean isLetter) { diff --git a/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java b/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java index 9a7bad4d539..b65ff174f58 100644 --- a/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java +++ b/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java @@ -19,9 +19,12 @@ package org.apache.lucene.index.memory; import java.io.IOException; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.util.AttributeSource; /** * Injects additional tokens for synonyms of token terms fetched from the @@ -39,9 +42,13 @@ public class SynonymTokenFilter extends TokenFilter { private String[] stack = null; private int index = 0; - private Token current = null; + private AttributeSource.State current = null; private int todo = 0; + private TermAttribute termAtt; + private 
TypeAttribute typeAtt; + private PositionIncrementAttribute posIncrAtt; + /** * Creates an instance for the given underlying stream and synonym table. * @@ -64,28 +71,29 @@ public class SynonymTokenFilter extends TokenFilter { this.synonyms = synonyms; this.maxSynonyms = maxSynonyms; + + this.termAtt = (TermAttribute) addAttribute(TermAttribute.class); + this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); + this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); } /** Returns the next token in the stream, or null at EOS. */ - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; + public final boolean incrementToken() throws IOException { while (todo > 0 && index < stack.length) { // pop from stack - Token nextToken = createToken(stack[index++], current, reusableToken); - if (nextToken != null) { + if (createToken(stack[index++], current)) { todo--; - return nextToken; + return true; } } - Token nextToken = input.next(reusableToken); - if (nextToken == null) return null; // EOS; iterator exhausted + if (!input.incrementToken()) return false; // EOS; iterator exhausted - stack = synonyms.getSynonyms(nextToken.term()); // push onto stack + stack = synonyms.getSynonyms(termAtt.term()); // push onto stack if (stack.length > maxSynonyms) randomize(stack); index = 0; - current = (Token) nextToken.clone(); + current = captureState(); todo = maxSynonyms; - return nextToken; + return true; } /** @@ -101,12 +109,12 @@ public class SynonymTokenFilter extends TokenFilter { * @return a new token, or null to indicate that the given synonym should be * ignored */ - protected Token createToken(String synonym, Token current, final Token reusableToken) { - reusableToken.reinit(current, synonym); - reusableToken.setTermBuffer(synonym); - reusableToken.setType(SYNONYM_TOKEN_TYPE); - reusableToken.setPositionIncrement(0); - return reusableToken; + protected boolean createToken(String synonym, AttributeSource.State current) { + restoreState(current); + termAtt.setTermBuffer(synonym); + typeAtt.setType(SYNONYM_TOKEN_TYPE); + posIncrAtt.setPositionIncrement(0); + return true; } /** diff --git a/contrib/miscellaneous/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java b/contrib/miscellaneous/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java index 88147eafdff..c3f686a769b 100644 --- a/contrib/miscellaneous/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java +++ b/contrib/miscellaneous/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.search.Query; @@ -105,20 +106,16 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar // get Analyzer from superclass and tokenize the term TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr)); - final Token reusableToken = new Token(); - Token nextToken; - + TermAttribute termAtt = (TermAttribute) source.addAttribute(TermAttribute.class); + int countTokens = 0; while (true) { try { - nextToken = source.next(reusableToken); + if (!source.incrementToken()) break; } catch (IOException e) { - nextToken = null; - } - if (nextToken 
== null) { break; } - String term = nextToken.term(); + String term = termAtt.term(); if (!"".equals(term)) { try { tlist.set(countTokens++, term); @@ -191,19 +188,15 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar // get Analyzer from superclass and tokenize the term TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr)); List tlist = new ArrayList(); - final Token reusableToken = new Token(); - Token nextToken; - + TermAttribute termAtt = (TermAttribute) source.addAttribute(TermAttribute.class); + while (true) { try { - nextToken = source.next(reusableToken); + if (!source.incrementToken()) break; } catch (IOException e) { - nextToken = null; - } - if (nextToken == null) { break; } - tlist.add(nextToken.term()); + tlist.add(termAtt.term()); } try { @@ -241,13 +234,15 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar throws ParseException { // get Analyzer from superclass and tokenize the term TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr)); - final Token reusableToken = new Token(); - Token nextToken; + TermAttribute termAtt = (TermAttribute) source.addAttribute(TermAttribute.class); + String nextToken = null; boolean multipleTokens = false; - + try { - nextToken = source.next(reusableToken); - multipleTokens = source.next(reusableToken) != null; + if (source.incrementToken()) { + nextToken = termAtt.term(); + } + multipleTokens = source.incrementToken(); } catch (IOException e) { nextToken = null; } @@ -263,7 +258,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar + " - tokens were added"); } - return (nextToken == null) ? null : super.getFuzzyQuery(field, nextToken.term(), minSimilarity); + return (nextToken == null) ? 
null : super.getFuzzyQuery(field, nextToken, minSimilarity); } /** @@ -274,20 +269,17 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar throws ParseException { // get Analyzer from superclass and tokenize the terms TokenStream source = getAnalyzer().tokenStream(field, new StringReader(part1)); - final Token reusableToken = new Token(); - Token nextToken; - Token multipleToken; + TermAttribute termAtt = (TermAttribute) source.addAttribute(TermAttribute.class); boolean multipleTokens = false; // part1 try { - nextToken = source.next(reusableToken); - if (nextToken != null) { - part1 = nextToken.term(); + if (source.incrementToken()) { + part1 = termAtt.term(); } - multipleTokens = source.next(reusableToken) != null; + multipleTokens = source.incrementToken(); } catch (IOException e) { - nextToken = null; + // ignore } try { source.close(); @@ -301,14 +293,15 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar // part2 source = getAnalyzer().tokenStream(field, new StringReader(part2)); + termAtt = (TermAttribute) source.addAttribute(TermAttribute.class); + try { - nextToken = source.next(reusableToken); - if (nextToken != null) { - part2 = nextToken.term(); + if (source.incrementToken()) { + part2 = termAtt.term(); } - multipleTokens = source.next(reusableToken) != null; + multipleTokens = source.incrementToken(); } catch (IOException e) { - nextToken = null; + // ignore } try { source.close(); diff --git a/contrib/miscellaneous/src/test/org/apache/lucene/queryParser/precedence/TestPrecedenceQueryParser.java b/contrib/miscellaneous/src/test/org/apache/lucene/queryParser/precedence/TestPrecedenceQueryParser.java index b1d9854c21b..d85a4014283 100644 --- a/contrib/miscellaneous/src/test/org/apache/lucene/queryParser/precedence/TestPrecedenceQueryParser.java +++ b/contrib/miscellaneous/src/test/org/apache/lucene/queryParser/precedence/TestPrecedenceQueryParser.java @@ -26,6 +26,8 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.DateTools; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.FuzzyQuery; @@ -57,28 +59,27 @@ public class TestPrecedenceQueryParser extends TestCase { boolean inPhrase = false; int savedStart = 0, savedEnd = 0; - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; + TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class); + OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + + public boolean incrementToken() throws IOException { if (inPhrase) { inPhrase = false; - reusableToken.setTermBuffer("phrase2"); - reusableToken.setStartOffset(savedStart); - reusableToken.setEndOffset(savedEnd); - return reusableToken; + termAtt.setTermBuffer("phrase2"); + offsetAtt.setOffset(savedStart, savedEnd); + return true; } else - for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) { - if (nextToken.term().equals("phrase")) { + while(input.incrementToken()) + if (termAtt.term().equals("phrase")) { inPhrase = true; - savedStart = nextToken.startOffset(); - savedEnd = nextToken.endOffset(); - 
nextToken.setTermBuffer("phrase1"); - nextToken.setStartOffset(savedStart); - nextToken.setEndOffset(savedEnd); - return nextToken; - } else if (!nextToken.term().equals("stop")) - return nextToken; - } - return null; + savedStart = offsetAtt.startOffset(); + savedEnd = offsetAtt.endOffset(); + termAtt.setTermBuffer("phrase1"); + offsetAtt.setOffset(savedStart, savedEnd); + return true; + } else if (!termAtt.term().equals("stop")) + return true; + return false; } } diff --git a/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java b/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java index 7c74aea2112..b6e9446f005 100644 --- a/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java +++ b/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java @@ -27,6 +27,7 @@ import java.util.Iterator; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermEnum; @@ -181,13 +182,14 @@ public class FuzzyLikeThisQuery extends Query { if(f.queryString==null) return; TokenStream ts=analyzer.tokenStream(f.fieldName,new StringReader(f.queryString)); - final Token reusableToken = new Token(); + TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class); + int corpusNumDocs=reader.numDocs(); Term internSavingTemplateTerm =new Term(f.fieldName); //optimization to avoid constructing new Term() objects HashSet processedTerms=new HashSet(); - for (Token nextToken = ts.next(reusableToken); nextToken!=null; nextToken = ts.next(reusableToken)) + while (ts.incrementToken()) { - String term = nextToken.term(); + String term = termAtt.term(); if(!processedTerms.contains(term)) { processedTerms.add(term); diff --git a/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java b/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java index 6fed4b5be1f..ba9ed35efe6 100644 --- a/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java +++ b/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java @@ -28,9 +28,9 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.Hits; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import java.util.Set; @@ -829,9 +829,10 @@ public final class MoreLikeThis { TokenStream ts = analyzer.tokenStream(fieldName, r); int tokenCount=0; // for every token - final Token reusableToken = new Token(); - for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) { - String word = nextToken.term(); + TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class); + + while (ts.incrementToken()) { + String word = termAtt.term(); tokenCount++; if(tokenCount>maxNumTokensParsed) { diff --git a/contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java b/contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java index e3cea76d034..090d52c4c2b 100644 --- 
a/contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java +++ b/contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java @@ -21,8 +21,8 @@ import java.util.HashSet; import java.util.Set; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; @@ -86,11 +86,12 @@ public final class SimilarityQueries throws IOException { TokenStream ts = a.tokenStream( field, new StringReader( body)); + TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class); + BooleanQuery tmp = new BooleanQuery(); Set already = new HashSet(); // ignore dups - final Token reusableToken = new Token(); - for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) { - String word = nextToken.term(); + while (ts.incrementToken()) { + String word = termAtt.term(); // ignore opt stop words if ( stop != null && stop.contains( word)) continue; diff --git a/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java b/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java index 86ccc8b3215..c4047746f45 100644 --- a/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java +++ b/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java @@ -22,6 +22,7 @@ import java.io.IOException; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.tartarus.snowball.SnowballProgram; /** @@ -33,9 +34,12 @@ public class SnowballFilter extends TokenFilter { private SnowballProgram stemmer; + private TermAttribute termAtt; + public SnowballFilter(TokenStream input, SnowballProgram stemmer) { super(input); this.stemmer = stemmer; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** @@ -56,21 +60,34 @@ public class SnowballFilter extends TokenFilter { } catch (Exception e) { throw new RuntimeException(e.toString()); } + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** Returns the next input Token, after being stemmed */ - public final Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken == null) - return null; - String originalTerm = nextToken.term(); - stemmer.setCurrent(originalTerm); - stemmer.stem(); - String finalTerm = stemmer.getCurrent(); - // Don't bother updating, if it is unchanged. - if (!originalTerm.equals(finalTerm)) - nextToken.setTermBuffer(finalTerm); - return nextToken; + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String originalTerm = termAtt.term(); + stemmer.setCurrent(originalTerm); + stemmer.stem(); + String finalTerm = stemmer.getCurrent(); + // Don't bother updating, if it is unchanged. + if (!originalTerm.equals(finalTerm)) + termAtt.setTermBuffer(finalTerm); + return true; + } else { + return false; + } + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. 
*/ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next() throws java.io.IOException { + return super.next(); } } diff --git a/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java b/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java index 9cd65d625e2..45042c26140 100644 --- a/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java +++ b/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java @@ -22,9 +22,14 @@ import java.io.StringReader; import junit.framework.TestCase; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.index.Payload; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; public class TestSnowball extends TestCase { @@ -32,12 +37,12 @@ public class TestSnowball extends TestCase { String input, String[] output) throws Exception { TokenStream ts = a.tokenStream("dummy", new StringReader(input)); - final Token reusableToken = new Token(); + TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); for (int i = 0; i < output.length; i++) { - Token nextToken = ts.next(reusableToken); - assertEquals(output[i], nextToken.term()); + assertTrue(ts.incrementToken()); + assertEquals(output[i], termAtt.term()); } - assertNull(ts.next(reusableToken)); + assertFalse(ts.incrementToken()); ts.close(); } @@ -49,33 +54,51 @@ public class TestSnowball extends TestCase { public void testFilterTokens() throws Exception { - final Token tok = new Token(2, 7, "wrd"); - tok.setTermBuffer("accents"); - tok.setPositionIncrement(3); - Payload tokPayload = new Payload(new byte[]{0,1,2,3}); - tok.setPayload(tokPayload); - int tokFlags = 77; - tok.setFlags(tokFlags); + SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English"); + TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class); + OffsetAttribute offsetAtt = (OffsetAttribute) filter.getAttribute(OffsetAttribute.class); + TypeAttribute typeAtt = (TypeAttribute) filter.getAttribute(TypeAttribute.class); + PayloadAttribute payloadAtt = (PayloadAttribute) filter.getAttribute(PayloadAttribute.class); + PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) filter.getAttribute(PositionIncrementAttribute.class); + FlagsAttribute flagsAtt = (FlagsAttribute) filter.getAttribute(FlagsAttribute.class); + + filter.incrementToken(); - SnowballFilter filter = new SnowballFilter( - new TokenStream() { - public Token next(final Token reusableToken) { - assert reusableToken != null; - return tok; - } - }, - "English" - ); - - final Token reusableToken = new Token(); - Token nextToken = filter.next(reusableToken); - - assertEquals("accent", nextToken.term()); - assertEquals(2, nextToken.startOffset()); - assertEquals(7, nextToken.endOffset()); - assertEquals("wrd", 
nextToken.type()); - assertEquals(3, nextToken.getPositionIncrement()); - assertEquals(tokFlags, nextToken.getFlags()); - assertEquals(tokPayload, nextToken.getPayload()); + assertEquals("accent", termAtt.term()); + assertEquals(2, offsetAtt.startOffset()); + assertEquals(7, offsetAtt.endOffset()); + assertEquals("wrd", typeAtt.type()); + assertEquals(3, posIncAtt.getPositionIncrement()); + assertEquals(77, flagsAtt.getFlags()); + assertEquals(new Payload(new byte[]{0,1,2,3}), payloadAtt.getPayload()); + } + + private final class TestTokenStream extends TokenStream { + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; + private TypeAttribute typeAtt; + private PayloadAttribute payloadAtt; + private PositionIncrementAttribute posIncAtt; + private FlagsAttribute flagsAtt; + + TestTokenStream() { + super(); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); + posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class); + } + + public boolean incrementToken() { + termAtt.setTermBuffer("accents"); + offsetAtt.setOffset(2, 7); + typeAtt.setType("wrd"); + posIncAtt.setPositionIncrement(3); + payloadAtt.setPayload(new Payload(new byte[]{0,1,2,3})); + flagsAtt.setFlags(77); + return true; + } } } \ No newline at end of file diff --git a/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java b/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java index f8985cb1828..777df1307cf 100644 --- a/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java +++ b/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java @@ -20,6 +20,12 @@ package org.apache.lucene.wikipedia.analysis; import org.apache.lucene.analysis.CharReader; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.util.AttributeSource; import java.io.IOException; import java.io.Reader; @@ -114,6 +120,12 @@ public class WikipediaTokenizer extends Tokenizer { private int tokenOutput = TOKENS_ONLY; private Set untokenizedTypes = Collections.EMPTY_SET; private Iterator tokens = null; + + private OffsetAttribute offsetAtt; + private TypeAttribute typeAtt; + private PositionIncrementAttribute posIncrAtt; + private TermAttribute termAtt; + private FlagsAttribute flagsAtt; void setInput(Reader reader) { this.input = CharReader.get(reader); @@ -142,41 +154,59 @@ public class WikipediaTokenizer extends Tokenizer { this.tokenOutput = tokenOutput; this.scanner = new WikipediaTokenizerImpl(input); this.untokenizedTypes = untokenizedTypes; + this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); + this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + this.termAtt = (TermAttribute) 
addAttribute(TermAttribute.class); + this.flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class); } + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws java.io.IOException { + return super.next(reusableToken); + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next() throws java.io.IOException { + return super.next(); + } + /* * (non-Javadoc) * * @see org.apache.lucene.analysis.TokenStream#next() */ - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; + public final boolean incrementToken() throws IOException { if (tokens != null && tokens.hasNext()){ - return (Token)tokens.next(); + AttributeSource.State state = (AttributeSource.State) tokens.next(); + restoreState(state); + return true; } int tokenType = scanner.getNextToken(); if (tokenType == WikipediaTokenizerImpl.YYEOF) { - return null; + return false; } String type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType]; if (tokenOutput == TOKENS_ONLY || untokenizedTypes.contains(type) == false){ - setupToken(reusableToken); + setupToken(); } else if (tokenOutput == UNTOKENIZED_ONLY && untokenizedTypes.contains(type) == true){ - collapseTokens(reusableToken, tokenType); + collapseTokens(tokenType); } else if (tokenOutput == BOTH){ //collapse into a single token, add it to tokens AND output the individual tokens //output the untokenized Token first - collapseAndSaveTokens(reusableToken, tokenType, type); + collapseAndSaveTokens(tokenType, type); } - reusableToken.setPositionIncrement(scanner.getPositionIncrement()); - reusableToken.setType(type); - return reusableToken; + posIncrAtt.setPositionIncrement(scanner.getPositionIncrement()); + typeAtt.setType(type); + return true; } - private void collapseAndSaveTokens(final Token reusableToken, int tokenType, String type) throws IOException { + private void collapseAndSaveTokens(int tokenType, String type) throws IOException { //collapse StringBuffer buffer = new StringBuffer(32); int numAdded = scanner.setText(buffer); @@ -186,9 +216,8 @@ public class WikipediaTokenizer extends Tokenizer { int tmpTokType; int numSeen = 0; List tmp = new ArrayList(); - Token saved = new Token(); - setupSavedToken(saved, 0, type); - tmp.add(saved); + setupSavedToken(0, type); + tmp.add(captureState()); //while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type while ((tmpTokType = scanner.getNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.getNumWikiTokensSeen() > numSeen){ int currPos = scanner.yychar(); @@ -197,18 +226,16 @@ public class WikipediaTokenizer extends Tokenizer { buffer.append(' '); } numAdded = scanner.setText(buffer); - saved = new Token(); - setupSavedToken(saved, scanner.getPositionIncrement(), type); - tmp.add(saved); + setupSavedToken(scanner.getPositionIncrement(), type); + tmp.add(captureState()); numSeen++; lastPos = currPos + numAdded; } //trim the buffer String s = buffer.toString().trim(); - reusableToken.setTermBuffer(s.toCharArray(), 0, s.length()); - reusableToken.setStartOffset(input.correctOffset(theStart)); - reusableToken.setEndOffset(input.correctOffset(theStart + s.length())); - reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG); + 
termAtt.setTermBuffer(s.toCharArray(), 0, s.length()); + offsetAtt.setOffset(input.correctOffset(theStart), input.correctOffset(theStart + s.length())); + flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG); //The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos if (tmpTokType != WikipediaTokenizerImpl.YYEOF){ scanner.yypushback(scanner.yylength()); @@ -216,13 +243,13 @@ public class WikipediaTokenizer extends Tokenizer { tokens = tmp.iterator(); } - private void setupSavedToken(Token saved, int positionInc, String type){ - setupToken(saved); - saved.setPositionIncrement(positionInc); - saved.setType(type); + private void setupSavedToken(int positionInc, String type){ + setupToken(); + posIncrAtt.setPositionIncrement(positionInc); + typeAtt.setType(type); } - private void collapseTokens(final Token reusableToken, int tokenType) throws IOException { + private void collapseTokens(int tokenType) throws IOException { //collapse StringBuffer buffer = new StringBuffer(32); int numAdded = scanner.setText(buffer); @@ -244,10 +271,9 @@ public class WikipediaTokenizer extends Tokenizer { } //trim the buffer String s = buffer.toString().trim(); - reusableToken.setTermBuffer(s.toCharArray(), 0, s.length()); - reusableToken.setStartOffset(input.correctOffset(theStart)); - reusableToken.setEndOffset(input.correctOffset(theStart + s.length())); - reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG); + termAtt.setTermBuffer(s.toCharArray(), 0, s.length()); + offsetAtt.setOffset(input.correctOffset(theStart), input.correctOffset(theStart + s.length())); + flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG); //The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos if (tmpTokType != WikipediaTokenizerImpl.YYEOF){ scanner.yypushback(scanner.yylength()); @@ -256,11 +282,10 @@ public class WikipediaTokenizer extends Tokenizer { } } - private void setupToken(final Token reusableToken) { - scanner.getText(reusableToken); + private void setupToken() { + scanner.getText(termAtt); final int start = scanner.yychar(); - reusableToken.setStartOffset(input.correctOffset(start)); - reusableToken.setEndOffset(input.correctOffset(start + reusableToken.termLength())); + offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start + termAtt.termLength())); } /* diff --git a/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java b/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java index 0be9b161eca..e6dced9b4c7 100644 --- a/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java +++ b/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java @@ -19,7 +19,7 @@ package org.apache.lucene.wikipedia.analysis; * limitations under the License. */ -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** @@ -476,7 +476,7 @@ public final int getPositionIncrement(){ /** * Fills Lucene token with the current token text. 
*/ -final void getText(Token t) { +final void getText(TermAttribute t) { t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); } diff --git a/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java b/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java index d295aad1d4e..a594335e2fd 100644 --- a/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java +++ b/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java @@ -19,7 +19,6 @@ package org.apache.lucene.wikipedia.analysis; import junit.framework.TestCase; -import org.apache.lucene.analysis.Token; import java.io.StringReader; import java.io.IOException; @@ -28,6 +27,12 @@ import java.util.Map; import java.util.Set; import java.util.HashSet; +import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; + /** * @@ -131,23 +136,24 @@ public class WikipediaTokenizerTest extends TestCase { int numBoldItalics = 0; int numCategory = 0; int numCitation = 0; - final Token reusableToken = new Token(); - for (Token nextToken = tf.next(reusableToken); nextToken != null; nextToken = tf.next(reusableToken)) { - String tokText = nextToken.term(); + TermAttribute termAtt = (TermAttribute) tf.addAttribute(TermAttribute.class); + TypeAttribute typeAtt = (TypeAttribute) tf.addAttribute(TypeAttribute.class); + + while (tf.incrementToken()) { + String tokText = termAtt.term(); //System.out.println("Text: " + tokText + " Type: " + token.type()); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); String expectedType = (String) tcm.get(tokText); - assertTrue("expectedType is null and it shouldn't be for: " + nextToken, expectedType != null); - assertTrue(nextToken.type() + " is not equal to " + expectedType + " for " + nextToken, nextToken.type().equals(expectedType) == true); + assertTrue("expectedType is null and it shouldn't be for: " + tf.toString(), expectedType != null); + assertTrue(typeAtt.type() + " is not equal to " + expectedType + " for " + tf.toString(), typeAtt.type().equals(expectedType) == true); count++; - if (nextToken.type().equals(WikipediaTokenizer.ITALICS) == true){ + if (typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true){ numItalics++; - } else if (nextToken.type().equals(WikipediaTokenizer.BOLD_ITALICS) == true){ + } else if (typeAtt.type().equals(WikipediaTokenizer.BOLD_ITALICS) == true){ numBoldItalics++; - } else if (nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true){ + } else if (typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true){ numCategory++; } - else if (nextToken.type().equals(WikipediaTokenizer.CITATION) == true){ + else if (typeAtt.type().equals(WikipediaTokenizer.CITATION) == true){ numCitation++; } } @@ -166,106 +172,93 @@ public class WikipediaTokenizerTest extends TestCase { } private void checkLinkPhrases(WikipediaTokenizer tf) throws IOException { - final Token reusableToken = new Token(); - Token nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "click", nextToken.term().equals("click") == true); - 
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "link", nextToken.term().equals("link") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "here", - nextToken.term().equals("here") == true); + TermAttribute termAtt = (TermAttribute) tf.addAttribute(TermAttribute.class); + PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) tf.addAttribute(PositionIncrementAttribute.class); + + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "click", termAtt.term().equals("click") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "link", termAtt.term().equals("link") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "here", + termAtt.term().equals("here") == true); //The link, and here should be at the same position for phrases to work - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "again", - nextToken.term().equals("again") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "again", + termAtt.term().equals("again") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "click", - nextToken.term().equals("click") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "click", + termAtt.term().equals("click") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "http://lucene.apache.org", - nextToken.term().equals("http://lucene.apache.org") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "http://lucene.apache.org", + termAtt.term().equals("http://lucene.apache.org") == true); + 
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "here", - nextToken.term().equals("here") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "here", + termAtt.term().equals("here") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "again", - nextToken.term().equals("again") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "again", + termAtt.term().equals("again") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "a", - nextToken.term().equals("a") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "a", + termAtt.term().equals("a") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "b", - nextToken.term().equals("b") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "b", + termAtt.term().equals("b") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "c", - nextToken.term().equals("c") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "c", + termAtt.term().equals("c") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "d", - nextToken.term().equals("d") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "d", + termAtt.term().equals("d") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, 
posIncrAtt.getPositionIncrement() == 1); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is not null and it should be", nextToken == null); + assertFalse(tf.incrementToken()); } public void testLinks() throws Exception { String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here] [https://lucene.apache.org/java/docs/index.html?b=c here]"; WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test)); - final Token reusableToken = new Token(); - Token nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html#news", - nextToken.term().equals("http://lucene.apache.org/java/docs/index.html#news") == true); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, nextToken.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true); - tf.next(reusableToken);//skip here - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html?b=c", - nextToken.term().equals("http://lucene.apache.org/java/docs/index.html?b=c") == true); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, nextToken.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true); - tf.next(reusableToken);//skip here - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "https://lucene.apache.org/java/docs/index.html?b=c", - nextToken.term().equals("https://lucene.apache.org/java/docs/index.html?b=c") == true); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, nextToken.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - - nextToken = tf.next(reusableToken); - assertTrue("nextToken is not null and it should be", nextToken == null); - + TermAttribute termAtt = (TermAttribute) tf.addAttribute(TermAttribute.class); + TypeAttribute typeAtt = (TypeAttribute) tf.addAttribute(TypeAttribute.class); + + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html#news", + termAtt.term().equals("http://lucene.apache.org/java/docs/index.html#news") == true); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, typeAtt.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true); + tf.incrementToken();//skip here + + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html?b=c", + termAtt.term().equals("http://lucene.apache.org/java/docs/index.html?b=c") == true); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, typeAtt.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true); + tf.incrementToken();//skip here + + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "https://lucene.apache.org/java/docs/index.html?b=c", + termAtt.term().equals("https://lucene.apache.org/java/docs/index.html?b=c") == true); + assertTrue(typeAtt.type() + " is not equal 
to " + WikipediaTokenizer.EXTERNAL_LINK_URL, typeAtt.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true); + + assertTrue(tf.incrementToken()); + assertFalse(tf.incrementToken()); } public void testLucene1133() throws Exception { @@ -277,72 +270,73 @@ public class WikipediaTokenizerTest extends TestCase { checkLinkPhrases(tf); String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]"; tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.UNTOKENIZED_ONLY, untoks); - final Token reusableToken = new Token(); - Token nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "a b c d", - nextToken.term().equals("a b c d") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - assertTrue(nextToken.startOffset() + " does not equal: " + 11, nextToken.startOffset() == 11); - assertTrue(nextToken.endOffset() + " does not equal: " + 18, nextToken.endOffset() == 18); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "e f g", - nextToken.term().equals("e f g") == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 32, nextToken.startOffset() == 32); - assertTrue(nextToken.endOffset() + " does not equal: " + 37, nextToken.endOffset() == 37); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "link", - nextToken.term().equals("link") == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 42, nextToken.startOffset() == 42); - assertTrue(nextToken.endOffset() + " does not equal: " + 46, nextToken.endOffset() == 46); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "here", - nextToken.term().equals("here") == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 47, nextToken.startOffset() == 47); - assertTrue(nextToken.endOffset() + " does not equal: " + 51, nextToken.endOffset() == 51); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "link", - nextToken.term().equals("link") == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 56, nextToken.startOffset() == 56); - assertTrue(nextToken.endOffset() + " does not equal: " + 60, nextToken.endOffset() == 60); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "there", - nextToken.term().equals("there") == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 61, nextToken.startOffset() == 61); - assertTrue(nextToken.endOffset() + " does not equal: " + 66, nextToken.endOffset() == 66); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "italics here", - nextToken.term().equals("italics here") == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 71, nextToken.startOffset() == 71); - 
assertTrue(nextToken.endOffset() + " does not equal: " + 83, nextToken.endOffset() == 83); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "something", - nextToken.term().equals("something") == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 86, nextToken.startOffset() == 86); - assertTrue(nextToken.endOffset() + " does not equal: " + 95, nextToken.endOffset() == 95); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "more italics", - nextToken.term().equals("more italics") == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 98, nextToken.startOffset() == 98); - assertTrue(nextToken.endOffset() + " does not equal: " + 110, nextToken.endOffset() == 110); + TermAttribute termAtt = (TermAttribute) tf.addAttribute(TermAttribute.class); + PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) tf.addAttribute(PositionIncrementAttribute.class); + OffsetAttribute offsetAtt = (OffsetAttribute) tf.addAttribute(OffsetAttribute.class); + + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "a b c d", + termAtt.term().equals("a b c d") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 11, offsetAtt.startOffset() == 11); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 18, offsetAtt.endOffset() == 18); + + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "e f g", + termAtt.term().equals("e f g") == true); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 32, offsetAtt.startOffset() == 32); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 37, offsetAtt.endOffset() == 37); + + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "link", + termAtt.term().equals("link") == true); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 42, offsetAtt.startOffset() == 42); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 46, offsetAtt.endOffset() == 46); + + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "here", + termAtt.term().equals("here") == true); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 47, offsetAtt.startOffset() == 47); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 51, offsetAtt.endOffset() == 51); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "h i j", - nextToken.term().equals("h i j") == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 124, nextToken.startOffset() == 124); - assertTrue(nextToken.endOffset() + " does not equal: " + 133, nextToken.endOffset() == 133); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "link", + termAtt.term().equals("link") == true); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 56, offsetAtt.startOffset() == 56); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 60, offsetAtt.endOffset() == 60); + + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "there", + termAtt.term().equals("there") == true); - 
nextToken = tf.next(reusableToken); - assertTrue("nextToken is not null and it should be", nextToken == null); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 61, offsetAtt.startOffset() == 61); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 66, offsetAtt.endOffset() == 66); + + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "italics here", + termAtt.term().equals("italics here") == true); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 71, offsetAtt.startOffset() == 71); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 83, offsetAtt.endOffset() == 83); + + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "something", + termAtt.term().equals("something") == true); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 86, offsetAtt.startOffset() == 86); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 95, offsetAtt.endOffset() == 95); + + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "more italics", + termAtt.term().equals("more italics") == true); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 98, offsetAtt.startOffset() == 98); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 110, offsetAtt.endOffset() == 110); + + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "h i j", + termAtt.term().equals("h i j") == true); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 124, offsetAtt.startOffset() == 124); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 133, offsetAtt.endOffset() == 133); + + assertFalse(tf.incrementToken()); } public void testBoth() throws Exception { @@ -352,225 +346,211 @@ public class WikipediaTokenizerTest extends TestCase { String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]"; //should output all the indivual tokens plus the untokenized tokens as well. 
Untokenized tokens WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks); - final Token reusableToken = new Token(); - Token nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "a b c d", - nextToken.term().equals("a b c d") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true); - assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG); - assertTrue(nextToken.startOffset() + " does not equal: " + 11, nextToken.startOffset() == 11); - assertTrue(nextToken.endOffset() + " does not equal: " + 18, nextToken.endOffset() == 18); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "a", - nextToken.term().equals("a") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true); - assertTrue(nextToken.getFlags() + " equals: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG + " and it shouldn't", nextToken.getFlags() != WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG); - assertTrue(nextToken.startOffset() + " does not equal: " + 11, nextToken.startOffset() == 11); - assertTrue(nextToken.endOffset() + " does not equal: " + 12, nextToken.endOffset() == 12); + TermAttribute termAtt = (TermAttribute) tf.addAttribute(TermAttribute.class); + TypeAttribute typeAtt = (TypeAttribute) tf.addAttribute(TypeAttribute.class); + PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) tf.addAttribute(PositionIncrementAttribute.class); + OffsetAttribute offsetAtt = (OffsetAttribute) tf.addAttribute(OffsetAttribute.class); + FlagsAttribute flagsAtt = (FlagsAttribute) tf.addAttribute(FlagsAttribute.class); + + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "a b c d", + termAtt.term().equals("a b c d") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true); + assertTrue(flagsAtt.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, flagsAtt.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 11, offsetAtt.startOffset() == 11); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 18, offsetAtt.endOffset() == 18); + + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "a", + termAtt.term().equals("a") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true); + assertTrue(flagsAtt.getFlags() + " equals: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG + 
" and it shouldn't", flagsAtt.getFlags() != WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 11, offsetAtt.startOffset() == 11); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 12, offsetAtt.endOffset() == 12); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "b", - nextToken.term().equals("b") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 13, nextToken.startOffset() == 13); - assertTrue(nextToken.endOffset() + " does not equal: " + 14, nextToken.endOffset() == 14); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "b", + termAtt.term().equals("b") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 13, offsetAtt.startOffset() == 13); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 14, offsetAtt.endOffset() == 14); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "c", - nextToken.term().equals("c") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 15, nextToken.startOffset() == 15); - assertTrue(nextToken.endOffset() + " does not equal: " + 16, nextToken.endOffset() == 16); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "c", + termAtt.term().equals("c") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 15, offsetAtt.startOffset() == 15); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 16, offsetAtt.endOffset() == 16); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "d", - nextToken.term().equals("d") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 17, nextToken.startOffset() == 17); - assertTrue(nextToken.endOffset() + " does not equal: " + 18, nextToken.endOffset() == 18); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "d", + termAtt.term().equals("d") == true); + 
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 17, offsetAtt.startOffset() == 17); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 18, offsetAtt.endOffset() == 18); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "e f g", - nextToken.term().equals("e f g") == true); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true); - assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG); - assertTrue(nextToken.startOffset() + " does not equal: " + 32, nextToken.startOffset() == 32); - assertTrue(nextToken.endOffset() + " does not equal: " + 37, nextToken.endOffset() == 37); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "e f g", + termAtt.term().equals("e f g") == true); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true); + assertTrue(flagsAtt.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, flagsAtt.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 32, offsetAtt.startOffset() == 32); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 37, offsetAtt.endOffset() == 37); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "e", - nextToken.term().equals("e") == true); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0); - assertTrue(nextToken.startOffset() + " does not equal: " + 32, nextToken.startOffset() == 32); - assertTrue(nextToken.endOffset() + " does not equal: " + 33, nextToken.endOffset() == 33); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "e", + termAtt.term().equals("e") == true); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 32, offsetAtt.startOffset() == 32); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 33, offsetAtt.endOffset() == 33); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "f", - nextToken.term().equals("f") == true); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - assertTrue(nextToken.startOffset() 
+ " does not equal: " + 34, nextToken.startOffset() == 34); - assertTrue(nextToken.endOffset() + " does not equal: " + 35, nextToken.endOffset() == 35); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "f", + termAtt.term().equals("f") == true); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 34, offsetAtt.startOffset() == 34); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 35, offsetAtt.endOffset() == 35); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "g", - nextToken.term().equals("g") == true); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - assertTrue(nextToken.startOffset() + " does not equal: " + 36, nextToken.startOffset() == 36); - assertTrue(nextToken.endOffset() + " does not equal: " + 37, nextToken.endOffset() == 37); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "g", + termAtt.term().equals("g") == true); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 36, offsetAtt.startOffset() == 36); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 37, offsetAtt.endOffset() == 37); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "link", - nextToken.term().equals("link") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, nextToken.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 42, nextToken.startOffset() == 42); - assertTrue(nextToken.endOffset() + " does not equal: " + 46, nextToken.endOffset() == 46); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "here", - nextToken.term().equals("here") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, nextToken.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 47, nextToken.startOffset() == 47); - assertTrue(nextToken.endOffset() + " does not equal: " + 51, nextToken.endOffset() == 51); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "link", - nextToken.term().equals("link") == true); - 
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - assertTrue(nextToken.startOffset() + " does not equal: " + 56, nextToken.startOffset() == 56); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, nextToken.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true); - assertTrue(nextToken.endOffset() + " does not equal: " + 60, nextToken.endOffset() == 60); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "there", - nextToken.term().equals("there") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, nextToken.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 61, nextToken.startOffset() == 61); - assertTrue(nextToken.endOffset() + " does not equal: " + 66, nextToken.endOffset() == 66); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "italics here", - nextToken.term().equals("italics here") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true); - assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG); - assertTrue(nextToken.startOffset() + " does not equal: " + 71, nextToken.startOffset() == 71); - assertTrue(nextToken.endOffset() + " does not equal: " + 83, nextToken.endOffset() == 83); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "link", + termAtt.term().equals("link") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, typeAtt.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 42, offsetAtt.startOffset() == 42); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 46, offsetAtt.endOffset() == 46); + + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "here", + termAtt.term().equals("here") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, typeAtt.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 47, offsetAtt.startOffset() == 47); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 51, offsetAtt.endOffset() == 51); + + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "link", + termAtt.term().equals("link") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 56, offsetAtt.startOffset() == 56); + assertTrue(typeAtt.type() 
+ " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, typeAtt.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 60, offsetAtt.endOffset() == 60); + + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "there", + termAtt.term().equals("there") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, typeAtt.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 61, offsetAtt.startOffset() == 61); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 66, offsetAtt.endOffset() == 66); + + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "italics here", + termAtt.term().equals("italics here") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true); + assertTrue(flagsAtt.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, flagsAtt.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 71, offsetAtt.startOffset() == 71); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 83, offsetAtt.endOffset() == 83); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "italics", - nextToken.term().equals("italics") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 71, nextToken.startOffset() == 71); - assertTrue(nextToken.endOffset() + " does not equal: " + 78, nextToken.endOffset() == 78); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "italics", + termAtt.term().equals("italics") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 71, offsetAtt.startOffset() == 71); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 78, offsetAtt.endOffset() == 78); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "here", - nextToken.term().equals("here") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 79, nextToken.startOffset() == 79); - assertTrue(nextToken.endOffset() + " does not equal: " + 83, nextToken.endOffset() == 83); + assertTrue(tf.incrementToken()); + 
assertTrue(termAtt.term() + " is not equal to " + "here", + termAtt.term().equals("here") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 79, offsetAtt.startOffset() == 79); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 83, offsetAtt.endOffset() == 83); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "something", - nextToken.term().equals("something") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - assertTrue(nextToken.startOffset() + " does not equal: " + 86, nextToken.startOffset() == 86); - assertTrue(nextToken.endOffset() + " does not equal: " + 95, nextToken.endOffset() == 95); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "more italics", - nextToken.term().equals("more italics") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true); - assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG); - assertTrue(nextToken.startOffset() + " does not equal: " + 98, nextToken.startOffset() == 98); - assertTrue(nextToken.endOffset() + " does not equal: " + 110, nextToken.endOffset() == 110); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "something", + termAtt.term().equals("something") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 86, offsetAtt.startOffset() == 86); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 95, offsetAtt.endOffset() == 95); + + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "more italics", + termAtt.term().equals("more italics") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true); + assertTrue(flagsAtt.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, flagsAtt.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 98, offsetAtt.startOffset() == 98); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 110, offsetAtt.endOffset() == 110); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "more", - nextToken.term().equals("more") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0); - assertTrue(nextToken.type() + " is not equal to " + 
WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 98, nextToken.startOffset() == 98); - assertTrue(nextToken.endOffset() + " does not equal: " + 102, nextToken.endOffset() == 102); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "more", + termAtt.term().equals("more") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 98, offsetAtt.startOffset() == 98); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 102, offsetAtt.endOffset() == 102); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "italics", - nextToken.term().equals("italics") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "italics", + termAtt.term().equals("italics") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 103, nextToken.startOffset() == 103); - assertTrue(nextToken.endOffset() + " does not equal: " + 110, nextToken.endOffset() == 110); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 103, offsetAtt.startOffset() == 103); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 110, offsetAtt.endOffset() == 110); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "h i j", - nextToken.term().equals("h i j") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true); - assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG); - assertTrue(nextToken.startOffset() + " does not equal: " + 124, nextToken.startOffset() == 124); - assertTrue(nextToken.endOffset() + " does not equal: " + 133, nextToken.endOffset() == 133); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "h i j", + termAtt.term().equals("h i j") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true); + assertTrue(flagsAtt.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, flagsAtt.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG); + 
assertTrue(offsetAtt.startOffset() + " does not equal: " + 124, offsetAtt.startOffset() == 124); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 133, offsetAtt.endOffset() == 133); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "h", - nextToken.term().equals("h") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 124, nextToken.startOffset() == 124); - assertTrue(nextToken.endOffset() + " does not equal: " + 125, nextToken.endOffset() == 125); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "h", + termAtt.term().equals("h") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 124, offsetAtt.startOffset() == 124); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 125, offsetAtt.endOffset() == 125); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "i", - nextToken.term().equals("i") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 128, nextToken.startOffset() == 128); - assertTrue(nextToken.endOffset() + " does not equal: " + 129, nextToken.endOffset() == 129); - nextToken = tf.next(reusableToken); - assertTrue("nextToken is null and it shouldn't be", nextToken != null); - assertTrue(nextToken.term() + " is not equal to " + "j", - nextToken.term().equals("j") == true); - assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1); - assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true); - assertTrue(nextToken.startOffset() + " does not equal: " + 132, nextToken.startOffset() == 132); - assertTrue(nextToken.endOffset() + " does not equal: " + 133, nextToken.endOffset() == 133); - - nextToken = tf.next(reusableToken); - assertTrue("nextToken is not null and it should be", nextToken == null); + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + "i", + termAtt.term().equals("i") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 128, offsetAtt.startOffset() == 128); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 129, offsetAtt.endOffset() == 129); + + assertTrue(tf.incrementToken()); + assertTrue(termAtt.term() + " is not equal to " + 
"j", + termAtt.term().equals("j") == true); + assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1); + assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true); + assertTrue(offsetAtt.startOffset() + " does not equal: " + 132, offsetAtt.startOffset() == 132); + assertTrue(offsetAtt.endOffset() + " does not equal: " + 133, offsetAtt.endOffset() == 133); + assertFalse(tf.incrementToken()); } } diff --git a/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java b/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java index 97a70d6d9b3..e3932541d20 100755 --- a/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java +++ b/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java @@ -27,9 +27,9 @@ import java.util.List; import java.util.Set; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; @@ -114,10 +114,10 @@ public final class SynExpand { // [1] Parse query into separate words so that when we expand we can avoid dups TokenStream ts = a.tokenStream( field, new StringReader( query)); - - final Token reusableToken = new Token(); - for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) { - String word = nextToken.term(); + TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class); + + while (ts.incrementToken()) { + String word = termAtt.term(); if ( already.add( word)) top.add( word); } diff --git a/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynLookup.java b/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynLookup.java index 509bbfc7fa6..087212244b9 100644 --- a/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynLookup.java +++ b/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynLookup.java @@ -27,8 +27,8 @@ import java.util.List; import java.util.Set; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; @@ -101,9 +101,10 @@ public class SynLookup { // [1] Parse query into separate words so that when we expand we can avoid dups TokenStream ts = a.tokenStream( field, new StringReader( query)); - final Token reusableToken = new Token(); - for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) { - String word = nextToken.term(); + TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class); + + while (ts.incrementToken()) { + String word = termAtt.term(); if ( already.add( word)) top.add( word); } diff --git a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java index 431c1d13b5d..13bfdbcd443 100644 --- a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java +++ 
b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java @@ -9,8 +9,8 @@ import java.util.HashSet; import java.util.Set; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.search.similar.MoreLikeThisQuery; import org.apache.lucene.search.Query; import org.apache.lucene.xmlparser.DOMUtils; @@ -72,14 +72,14 @@ public class LikeThisQueryBuilder implements QueryBuilder { if((stopWords!=null)&&(fields!=null)) { stopWordsSet=new HashSet(); - final Token reusableToken = new Token(); for (int i = 0; i < fields.length; i++) { TokenStream ts = analyzer.tokenStream(fields[i],new StringReader(stopWords)); + TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class); try { - for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) { - stopWordsSet.add(nextToken.term()); + while(ts.incrementToken()) { + stopWordsSet.add(termAtt.term()); } } catch(IOException ioe) diff --git a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/SpanOrTermsBuilder.java b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/SpanOrTermsBuilder.java index c8ed5665b9f..1c5bdaf86e7 100644 --- a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/SpanOrTermsBuilder.java +++ b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/SpanOrTermsBuilder.java @@ -5,8 +5,8 @@ import java.io.StringReader; import java.util.ArrayList; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.index.Term; import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanQuery; @@ -52,9 +52,10 @@ public class SpanOrTermsBuilder extends SpanBuilderBase { ArrayList clausesList=new ArrayList(); TokenStream ts=analyzer.tokenStream(fieldName,new StringReader(value)); - final Token reusableToken = new Token(); - for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) { - SpanTermQuery stq=new SpanTermQuery(new Term(fieldName,nextToken.term())); + TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class); + + while (ts.incrementToken()) { + SpanTermQuery stq=new SpanTermQuery(new Term(fieldName, termAtt.term())); clausesList.add(stq); } SpanOrQuery soq=new SpanOrQuery((SpanQuery[]) clausesList.toArray(new SpanQuery[clausesList.size()])); diff --git a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsFilterBuilder.java b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsFilterBuilder.java index 52091dccb92..93e27fdd7eb 100644 --- a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsFilterBuilder.java +++ b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsFilterBuilder.java @@ -4,8 +4,8 @@ import java.io.IOException; import java.io.StringReader; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.index.Term; import org.apache.lucene.search.Filter; import 
org.apache.lucene.search.TermsFilter; @@ -54,19 +54,19 @@ public class TermsFilterBuilder implements FilterBuilder String text = DOMUtils.getNonBlankTextOrFail(e); String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName"); TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(text)); - + TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class); + try { - final Token reusableToken = new Token(); Term term = null; - for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) { + while (ts.incrementToken()) { if (term == null) { - term = new Term(fieldName, nextToken.term()); + term = new Term(fieldName, termAtt.term()); } else { // create from previous to save fieldName.intern overhead - term = term.createTerm(nextToken.term()); + term = term.createTerm(termAtt.term()); } tf.addTerm(term); } diff --git a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsQueryBuilder.java b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsQueryBuilder.java index 40e1c2ca035..7a6d1e57c23 100644 --- a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsQueryBuilder.java +++ b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsQueryBuilder.java @@ -4,8 +4,8 @@ import java.io.IOException; import java.io.StringReader; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; @@ -57,16 +57,16 @@ public class TermsQueryBuilder implements QueryBuilder { TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(text)); try { - final Token reusableToken = new Token(); + TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class); Term term = null; - for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) { + while (ts.incrementToken()) { if (term == null) { - term = new Term(fieldName, nextToken.term()); + term = new Term(fieldName, termAtt.term()); } else { // create from previous to save fieldName.intern overhead - term = term.createTerm(nextToken.term()); + term = term.createTerm(termAtt.term()); } bq.add(new BooleanClause(new TermQuery(term),BooleanClause.Occur.SHOULD)); }
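
Note on the pattern used throughout the hunks above: callers no longer pass a reusable Token to next(); they register the attributes they need once via addAttribute() and then call incrementToken(), reading the refreshed attribute values after each call. The following is a minimal, self-contained sketch of that consumption loop; the class name and the printing are illustrative only and are not part of this patch.

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class AttributeApiExample {
      /** Prints each term with its offsets, using only the attribute-based API. */
      public static void dump(TokenStream ts) throws IOException {
        // Attributes are registered once; incrementToken() refills the same
        // instances in place for every token, as the converted filters and
        // tests above rely on.
        TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
        OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(termAtt.term() + " [" + offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "]");
        }
      }
    }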