diff --git a/CHANGES.txt b/CHANGES.txt
index 7f9aced0962..6978e2737eb 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -356,6 +356,9 @@ API Changes
33. LUCENE-1705: Added IndexWriter.deleteAllDocuments. (Tim Smith via
Mike McCandless)
+34. LUCENE-1460: Changed TokenStreams/TokenFilters in contrib to
+ use the new TokenStream API. (Robert Muir, Michael Busch)
+
Bug fixes
1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals()
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java
index 399b1f7ae85..4e12ab7a1c5 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java
@@ -19,35 +19,33 @@ package org.apache.lucene.analysis.ar;
import java.io.IOException;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* A TokenFilter that applies {@link ArabicNormalizer} to normalize the orthography.
*
*/
-public class ArabicNormalizationFilter extends TokenFilter {
+public final class ArabicNormalizationFilter extends TokenFilter {
protected ArabicNormalizer normalizer = null;
-
+ private TermAttribute termAtt;
+
public ArabicNormalizationFilter(TokenStream input) {
super(input);
normalizer = new ArabicNormalizer();
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
-
-
- public Token next(Token reusableToken) throws IOException {
- if ((reusableToken = input.next(reusableToken)) == null) {
- return null;
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ int newlen = normalizer.normalize(termAtt.termBuffer(), termAtt.termLength());
+ termAtt.setTermLength(newlen);
+ return true;
} else {
- int oldlen = reusableToken.termLength();
- int newlen = normalizer.normalize(reusableToken.termBuffer(), oldlen);
- if (oldlen != newlen)
- reusableToken.setTermLength(newlen);
- return reusableToken;
+ return false;
}
}
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
index 39d7afa65cd..34beb5f9fa9 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java
@@ -19,43 +19,33 @@ package org.apache.lucene.analysis.ar;
import java.io.IOException;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* A TokenFilter that applies {@link ArabicStemmer} to stem Arabic words..
*
*/
-public class ArabicStemFilter extends TokenFilter {
+public final class ArabicStemFilter extends TokenFilter {
protected ArabicStemmer stemmer = null;
-
+ private TermAttribute termAtt;
+
public ArabicStemFilter(TokenStream input) {
super(input);
stemmer = new ArabicStemmer();
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
-
-
- /**
- * @return Returns the next token in the stream, or null at EOS
- */
- public Token next(Token reusableToken) throws IOException {
- /**
- * The actual token in the input stream.
- */
-
-
- if ((reusableToken = input.next(reusableToken)) == null) {
- return null;
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
+ termAtt.setTermLength(newlen);
+ return true;
} else {
- int oldlen = reusableToken.termLength();
- int newlen = stemmer.stem(reusableToken.termBuffer(), oldlen);
- if (oldlen != newlen)
- reusableToken.setTermLength(newlen);
- return reusableToken;
+ return false;
}
}
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java
index 360a0df8281..3eff32f9faa 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java
@@ -17,13 +17,12 @@ package org.apache.lucene.analysis.br;
* limitations under the License.
*/
-import org.apache.lucene.analysis.Token;
+import java.io.IOException;
+import java.util.Set;
+
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-
-import java.io.IOException;
-import java.util.HashSet;
-import java.util.Set;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Based on GermanStemFilter
@@ -36,10 +35,12 @@ public final class BrazilianStemFilter extends TokenFilter {
*/
private BrazilianStemmer stemmer = null;
private Set exclusions = null;
-
+ private TermAttribute termAtt;
+
public BrazilianStemFilter(TokenStream in) {
super(in);
stemmer = new BrazilianStemmer();
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
public BrazilianStemFilter(TokenStream in, Set exclusiontable) {
@@ -47,26 +48,20 @@ public final class BrazilianStemFilter extends TokenFilter {
this.exclusions = exclusiontable;
}
- /**
- * @return Returns the next token in the stream, or null at EOS.
- */
- public final Token next(final Token reusableToken)
- throws IOException {
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken);
- if (nextToken == null)
- return null;
-
- String term = nextToken.term();
-
- // Check the exclusion table.
- if (exclusions == null || !exclusions.contains(term)) {
- String s = stemmer.stem(term);
- // If not stemmed, don't waste the time adjusting the token.
- if ((s != null) && !s.equals(term))
- nextToken.setTermBuffer(s);
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ String term = termAtt.term();
+ // Check the exclusion table.
+ if (exclusions == null || !exclusions.contains(term)) {
+ String s = stemmer.stem(term);
+ // If not stemmed, don't waste the time adjusting the token.
+ if ((s != null) && !s.equals(term))
+ termAtt.setTermBuffer(s);
+ }
+ return true;
+ } else {
+ return false;
}
- return nextToken;
}
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
index 3f218fd72a8..5b7a42287d9 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
@@ -17,11 +17,14 @@ package org.apache.lucene.analysis.cjk;
* limitations under the License.
*/
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.Tokenizer;
-
+import java.io.IOException;
import java.io.Reader;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+
/**
* CJKTokenizer was modified from StopTokenizer which does a decent job for
@@ -88,6 +91,10 @@ public final class CJKTokenizer extends Tokenizer {
*/
private boolean preIsTokened = false;
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+ private TypeAttribute typeAtt;
+
//~ Constructors -----------------------------------------------------------
/**
@@ -97,25 +104,26 @@ public final class CJKTokenizer extends Tokenizer {
*/
public CJKTokenizer(Reader in) {
super(in);
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
}
//~ Methods ----------------------------------------------------------------
/**
- * Returns the next token in the stream, or null at EOS.
+ * Returns true for the next token in the stream, or false at EOS.
* See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
* for detail.
*
- * @param reusableToken a reusable token
- * @return Token
+ * @return false for end of stream, true otherwise
*
* @throws java.io.IOException - throw IOException when read error
* happened in the InputStream
*
*/
- public final Token next(final Token reusableToken) throws java.io.IOException {
+ public boolean incrementToken() throws IOException {
/** how many character(s) has been stored in buffer */
- assert reusableToken != null;
while(true) { // loop until we find a non-empty token
@@ -147,7 +155,7 @@ public final class CJKTokenizer extends Tokenizer {
break;
} else {
- return null;
+ return false;
}
} else {
//get current character
@@ -252,10 +260,12 @@ public final class CJKTokenizer extends Tokenizer {
}
if (length > 0) {
- return reusableToken.reinit
- (buffer, 0, length, input.correctOffset(start), input.correctOffset(start+length), TOKEN_TYPE_NAMES[tokenType]);
+ termAtt.setTermBuffer(buffer, 0, length);
+ offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
+ typeAtt.setType(TOKEN_TYPE_NAMES[tokenType]);
+ return true;
} else if (dataLen == -1) {
- return null;
+ return false;
}
// Cycle back and try for the next token (don't
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java
index a85a2ed8842..31de4a7f0a5 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java
@@ -17,12 +17,13 @@ package org.apache.lucene.analysis.cn;
* limitations under the License.
*/
+import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Title: ChineseFilter
@@ -56,19 +57,21 @@ public final class ChineseFilter extends TokenFilter {
private Map stopTable;
+ private TermAttribute termAtt;
+
public ChineseFilter(TokenStream in) {
super(in);
stopTable = new HashMap(STOP_WORDS.length);
for (int i = 0; i < STOP_WORDS.length; i++)
stopTable.put(STOP_WORDS[i], STOP_WORDS[i]);
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
- public final Token next(final Token reusableToken) throws java.io.IOException {
- assert reusableToken != null;
+ public boolean incrementToken() throws IOException {
- for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
- String text = nextToken.term();
+ while (input.incrementToken()) {
+ String text = termAtt.term();
// why not key off token type here assuming ChineseTokenizer comes first?
if (stopTable.get(text) == null) {
@@ -79,7 +82,7 @@ public final class ChineseFilter extends TokenFilter {
// English word/token should larger than 1 character.
if (text.length()>1) {
- return nextToken;
+ return true;
}
break;
case Character.OTHER_LETTER:
@@ -87,13 +90,13 @@ public final class ChineseFilter extends TokenFilter {
// One Chinese character as one Chinese word.
// Chinese word extraction to be added later here.
- return nextToken;
+ return true;
}
}
}
- return null;
+ return false;
}
}
\ No newline at end of file
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
index f9a1aec8fff..cc7f7453733 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
@@ -18,10 +18,12 @@ package org.apache.lucene.analysis.cn;
*/
+import java.io.IOException;
import java.io.Reader;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
@@ -56,6 +58,8 @@ public final class ChineseTokenizer extends Tokenizer {
public ChineseTokenizer(Reader in) {
super(in);
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
private int offset = 0, bufferIndex=0, dataLen=0;
@@ -68,7 +72,9 @@ public final class ChineseTokenizer extends Tokenizer {
private int length;
private int start;
-
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+
private final void push(char c) {
if (length == 0) start = offset-1; // start of token
@@ -76,19 +82,20 @@ public final class ChineseTokenizer extends Tokenizer {
}
- private final Token flush(final Token token) {
+ private final boolean flush() {
if (length>0) {
//System.out.println(new String(buffer, 0,
//length));
- return token.reinit(buffer, 0, length, input.correctOffset(start), input.correctOffset(start+length));
+ termAtt.setTermBuffer(buffer, 0, length);
+ offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
+ return true;
}
else
- return null;
+ return false;
}
- public final Token next(final Token reusableToken) throws java.io.IOException {
- assert reusableToken != null;
+ public boolean incrementToken() throws IOException {
length = 0;
start = offset;
@@ -104,7 +111,7 @@ public final class ChineseTokenizer extends Tokenizer {
bufferIndex = 0;
}
- if (dataLen == -1) return flush(reusableToken);
+ if (dataLen == -1) return flush();
else
c = ioBuffer[bufferIndex++];
@@ -115,20 +122,20 @@ public final class ChineseTokenizer extends Tokenizer {
case Character.LOWERCASE_LETTER:
case Character.UPPERCASE_LETTER:
push(c);
- if (length == MAX_WORD_LEN) return flush(reusableToken);
+ if (length == MAX_WORD_LEN) return flush();
break;
case Character.OTHER_LETTER:
if (length>0) {
bufferIndex--;
offset--;
- return flush(reusableToken);
+ return flush();
}
push(c);
- return flush(reusableToken);
+ return flush();
default:
- if (length>0) return flush(reusableToken);
+ if (length>0) return flush();
break;
}
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
index a43b35e5bf0..15cead80072 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
@@ -28,6 +28,12 @@ import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/**
* Base class for decomposition token filters.
@@ -54,6 +60,15 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
protected final int minSubwordSize;
protected final int maxSubwordSize;
protected final boolean onlyLongestMatch;
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+ private FlagsAttribute flagsAtt;
+ private PositionIncrementAttribute posIncAtt;
+ private TypeAttribute typeAtt;
+ private PayloadAttribute payloadAtt;
+
+ private final Token wrapper = new Token();
protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
this(input,makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
@@ -90,6 +105,13 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
this.dictionary = new CharArraySet(dictionary.size(), false);
addAllLowerCase(this.dictionary, dictionary);
}
+
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
+ posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+ payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
}
/**
@@ -105,26 +127,54 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
return dict;
}
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
+ private final void setToken(final Token token) throws IOException {
+ termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
+ flagsAtt.setFlags(token.getFlags());
+ typeAtt.setType(token.type());
+ offsetAtt.setOffset(token.startOffset(), token.endOffset());
+ posIncAtt.setPositionIncrement(token.getPositionIncrement());
+ payloadAtt.setPayload(token.getPayload());
+ }
+
+ public final boolean incrementToken() throws IOException {
if (tokens.size() > 0) {
- return (Token)tokens.removeFirst();
+ setToken((Token)tokens.removeFirst());
+ return true;
}
- Token nextToken = input.next(reusableToken);
- if (nextToken == null) {
- return null;
- }
-
- decompose(nextToken);
+ if (input.incrementToken() == false)
+ return false;
+
+ wrapper.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
+ wrapper.setStartOffset(offsetAtt.startOffset());
+ wrapper.setEndOffset(offsetAtt.endOffset());
+ wrapper.setFlags(flagsAtt.getFlags());
+ wrapper.setType(typeAtt.type());
+ wrapper.setPositionIncrement(posIncAtt.getPositionIncrement());
+ wrapper.setPayload(payloadAtt.getPayload());
+
+ decompose(wrapper);
if (tokens.size() > 0) {
- return (Token)tokens.removeFirst();
+ setToken((Token)tokens.removeFirst());
+ return true;
} else {
- return null;
+ return false;
}
}
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws java.io.IOException {
+ return super.next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws java.io.IOException {
+ return super.next();
+ }
+
protected static final void addAllLowerCase(Set target, Collection col) {
Iterator iter=col.iterator();
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
index 9c90c26eaed..1929563ed71 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
@@ -17,13 +17,13 @@ package org.apache.lucene.analysis.de;
* limitations under the License.
*/
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-
import java.io.IOException;
import java.util.Set;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
/**
* A filter that stems German words. It supports a table of words that should
* not be stemmed at all. The stemmer used can be changed at runtime after the
@@ -40,10 +40,13 @@ public final class GermanStemFilter extends TokenFilter
private GermanStemmer stemmer = null;
private Set exclusionSet = null;
+ private TermAttribute termAtt;
+
public GermanStemFilter( TokenStream in )
{
super(in);
stemmer = new GermanStemmer();
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
/**
@@ -56,26 +59,22 @@ public final class GermanStemFilter extends TokenFilter
}
/**
- * @return Returns the next token in the stream, or null at EOS
+     * @return Returns true for the next token in the stream, or false at EOS
*/
- public final Token next(final Token reusableToken)
- throws IOException
- {
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken);
-
- if (nextToken == null)
- return null;
-
- String term = nextToken.term();
- // Check the exclusion table.
- if (exclusionSet == null || !exclusionSet.contains(term)) {
- String s = stemmer.stem(term);
- // If not stemmed, don't waste the time adjusting the token.
- if ((s != null) && !s.equals(term))
- nextToken.setTermBuffer(s);
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ String term = termAtt.term();
+ // Check the exclusion table.
+ if (exclusionSet == null || !exclusionSet.contains(term)) {
+ String s = stemmer.stem(term);
+ // If not stemmed, don't waste the time adjusting the token.
+ if ((s != null) && !s.equals(term))
+ termAtt.setTermBuffer(s);
+ }
+ return true;
+ } else {
+ return false;
}
- return nextToken;
}
/**
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
index 9737a282c89..8e94ac55ef1 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
@@ -16,9 +16,11 @@ package org.apache.lucene.analysis.el;
* limitations under the License.
*/
+import java.io.IOException;
+
import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Normalizes token text to lower case, analyzing given ("greek") charset.
@@ -28,26 +30,26 @@ public final class GreekLowerCaseFilter extends TokenFilter
{
char[] charset;
+ private TermAttribute termAtt;
+
public GreekLowerCaseFilter(TokenStream in, char[] charset)
{
super(in);
this.charset = charset;
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
- public final Token next(final Token reusableToken) throws java.io.IOException
- {
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken);
-
- if (nextToken == null)
- return null;
-
- char[] chArray = nextToken.termBuffer();
- int chLen = nextToken.termLength();
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ char[] chArray = termAtt.termBuffer();
+ int chLen = termAtt.termLength();
for (int i = 0; i < chLen; i++)
{
- chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset);
+ chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset);
}
- return nextToken;
+ return true;
+ } else {
+ return false;
+ }
}
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
index 3efaf6bc32f..b354e4dfdb1 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
@@ -25,6 +25,7 @@ import java.util.Iterator;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Removes elisions from a token stream. For example, "l'avion" (the plane) will be
@@ -36,7 +37,8 @@ import org.apache.lucene.analysis.TokenFilter;
*/
public class ElisionFilter extends TokenFilter {
private Set articles = null;
-
+ private TermAttribute termAtt;
+
private static char[] apostrophes = {'\'', '’'};
public void setArticles(Set articles) {
@@ -54,6 +56,7 @@ public class ElisionFilter extends TokenFilter {
super(input);
this.articles = new HashSet(Arrays.asList(new String[] { "l", "m", "t",
"qu", "n", "s", "j" }));
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
/**
@@ -62,6 +65,7 @@ public class ElisionFilter extends TokenFilter {
public ElisionFilter(TokenStream input, Set articles) {
super(input);
setArticles(articles);
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
/**
@@ -70,39 +74,50 @@ public class ElisionFilter extends TokenFilter {
public ElisionFilter(TokenStream input, String[] articles) {
super(input);
setArticles(new HashSet(Arrays.asList(articles)));
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
/**
* Returns the next input Token with term() without elisioned start
*/
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken);
- if (nextToken == null)
- return null;
+ public final boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ char[] termBuffer = termAtt.termBuffer();
+ int termLength = termAtt.termLength();
- char[] termBuffer = nextToken.termBuffer();
- int termLength = nextToken.termLength();
-
- int minPoz = Integer.MAX_VALUE;
- for (int i = 0; i < apostrophes.length; i++) {
- char apos = apostrophes[i];
- // The equivalent of String.indexOf(ch)
- for (int poz = 0; poz < termLength ; poz++) {
- if (termBuffer[poz] == apos) {
+ int minPoz = Integer.MAX_VALUE;
+ for (int i = 0; i < apostrophes.length; i++) {
+ char apos = apostrophes[i];
+ // The equivalent of String.indexOf(ch)
+ for (int poz = 0; poz < termLength ; poz++) {
+ if (termBuffer[poz] == apos) {
minPoz = Math.min(poz, minPoz);
break;
+ }
}
}
- }
- // An apostrophe has been found. If the prefix is an article strip it off.
- if (minPoz != Integer.MAX_VALUE
- && articles.contains(new String(nextToken.termBuffer(), 0, minPoz).toLowerCase())) {
- nextToken.setTermBuffer(nextToken.termBuffer(), minPoz + 1, nextToken.termLength() - (minPoz + 1));
- }
+ // An apostrophe has been found. If the prefix is an article strip it off.
+ if (minPoz != Integer.MAX_VALUE
+ && articles.contains(new String(termAtt.termBuffer(), 0, minPoz).toLowerCase())) {
+ termAtt.setTermBuffer(termAtt.termBuffer(), minPoz + 1, termAtt.termLength() - (minPoz + 1));
+ }
- return nextToken;
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws java.io.IOException {
+ return super.next(reusableToken);
}
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws java.io.IOException {
+ return super.next();
+ }
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
index d5723db56c5..991c4ec1e5f 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.fr;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.IOException;
import java.util.HashSet;
@@ -39,10 +40,13 @@ public final class FrenchStemFilter extends TokenFilter {
*/
private FrenchStemmer stemmer = null;
private Set exclusions = null;
+
+ private TermAttribute termAtt;
public FrenchStemFilter( TokenStream in ) {
super(in);
stemmer = new FrenchStemmer();
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
@@ -52,25 +56,23 @@ public final class FrenchStemFilter extends TokenFilter {
}
/**
- * @return Returns the next token in the stream, or null at EOS
+ * @return Returns true for the next token in the stream, or false at EOS
*/
- public final Token next(final Token reusableToken)
- throws IOException {
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken);
- if (nextToken == null)
- return null;
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ String term = termAtt.term();
- String term = nextToken.term();
-
- // Check the exclusion table
- if ( exclusions == null || !exclusions.contains( term ) ) {
- String s = stemmer.stem( term );
- // If not stemmed, don't waste the time adjusting the token.
- if ((s != null) && !s.equals( term ) )
- nextToken.setTermBuffer(s);
- }
- return nextToken;
+ // Check the exclusion table
+ if ( exclusions == null || !exclusions.contains( term ) ) {
+ String s = stemmer.stem( term );
+ // If not stemmed, don't waste the time adjusting the token.
+ if ((s != null) && !s.equals( term ) )
+ termAtt.setTermBuffer(s);
+ }
+ return true;
+ } else {
+ return false;
+ }
}
/**
* Set a alternative/custom FrenchStemmer for this filter.
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java
index 9a3d1b9816f..bd82911c5d9 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java
@@ -27,8 +27,19 @@ import java.io.IOException;
*/
public class EmptyTokenStream extends TokenStream {
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- return null;
+ public final boolean incrementToken() throws IOException {
+ return false;
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws java.io.IOException {
+ return super.next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws java.io.IOException {
+ return super.next();
}
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java
index 3e62c43ce52..3065f9f7780 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java
@@ -24,6 +24,7 @@ import java.io.IOException;
/**
* Links two PrefixAwareTokenFilter
+ * @deprecated
*/
public class PrefixAndSuffixAwareTokenFilter extends TokenStream {
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java
index 72948cbe622..b49c9f8241a 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java
@@ -29,6 +29,7 @@ import java.io.IOException;
* to be used when updating the token values in the second stream based on that token.
*
* The default implementation adds last prefix token end offset to the suffix token start and end offsets.
+ * @deprecated This class still uses the old Token-based API rather than the new TokenStream attribute API — TODO(review): confirm intended replacement before removal.
*/
public class PrefixAwareTokenFilter extends TokenStream {
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java
index 8efcff38017..13f0eb6cd1d 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java
@@ -17,10 +17,16 @@ package org.apache.lucene.analysis.miscellaneous;
* limitations under the License.
*/
+import java.io.IOException;
+
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
-
-import java.io.IOException;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/**
* A token stream containing a single token.
@@ -29,34 +35,66 @@ public class SingleTokenTokenStream extends TokenStream {
private boolean exhausted = false;
// The token needs to be immutable, so work with clones!
- private Token token;
+ private Token singleToken;
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+ private FlagsAttribute flagsAtt;
+ private PositionIncrementAttribute posIncAtt;
+ private TypeAttribute typeAtt;
+ private PayloadAttribute payloadAtt;
public SingleTokenTokenStream(Token token) {
assert token != null;
- this.token = (Token) token.clone();
+ this.singleToken = (Token) token.clone();
+
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
+ posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+ payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
}
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
+ public final boolean incrementToken() throws IOException {
if (exhausted) {
- return null;
+ return false;
}
+
+ Token clone = (Token) singleToken.clone();
+
+ termAtt.setTermBuffer(clone.termBuffer(), 0, clone.termLength());
+ offsetAtt.setOffset(clone.startOffset(), clone.endOffset());
+ flagsAtt.setFlags(clone.getFlags());
+ typeAtt.setType(clone.type());
+ posIncAtt.setPositionIncrement(clone.getPositionIncrement());
+ payloadAtt.setPayload(clone.getPayload());
exhausted = true;
- return (Token) token.clone();
+ return true;
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws java.io.IOException {
+ return super.next(reusableToken);
}
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws java.io.IOException {
+ return super.next();
+ }
public void reset() throws IOException {
exhausted = false;
}
public Token getToken() {
- return (Token) token.clone();
+ return (Token) singleToken.clone();
}
public void setToken(Token token) {
- this.token = (Token) token.clone();
+ this.singleToken = (Token) token.clone();
}
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
index 7ae055cee7c..a00e1ce633a 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
@@ -20,9 +20,10 @@ package org.apache.lucene.analysis.ngram;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.IOException;
-import java.util.LinkedList;
/**
* Tokenizes the given token into n-grams of given size(s).
@@ -66,11 +67,18 @@ public class EdgeNGramTokenFilter extends TokenFilter {
private int minGram;
private int maxGram;
private Side side;
- private LinkedList ngrams;
+ private char[] curTermBuffer;
+ private int curTermLength;
+ private int curGramSize;
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+
protected EdgeNGramTokenFilter(TokenStream input) {
super(input);
- this.ngrams = new LinkedList();
+ this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
/**
@@ -99,7 +107,8 @@ public class EdgeNGramTokenFilter extends TokenFilter {
this.minGram = minGram;
this.maxGram = maxGram;
this.side = side;
- this.ngrams = new LinkedList();
+ this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
/**
@@ -114,54 +123,42 @@ public class EdgeNGramTokenFilter extends TokenFilter {
this(input, Side.getSide(sideLabel), minGram, maxGram);
}
- /** Returns the next token in the stream, or null at EOS. */
- public final Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- if (!ngrams.isEmpty()) {
- return (Token)ngrams.removeFirst();
- }
-
- Token token = null;
-
- while (ngrams.isEmpty() && (token = input.next()) != null) {
- ngram(token);
- }
-
- if (token == null) {
- return null;
- }
-
- if (!ngrams.isEmpty()) {
- return (Token)ngrams.removeFirst();
- } else {
- return null;
+ public final boolean incrementToken() throws IOException {
+ while (true) {
+ if (curTermBuffer == null) {
+ if (!input.incrementToken()) {
+ return false;
+ } else {
+ curTermBuffer = (char[]) termAtt.termBuffer().clone();
+ curTermLength = termAtt.termLength();
+ curGramSize = minGram;
+ }
+ }
+ if (curGramSize <= maxGram) {
+ if (! (curGramSize > curTermLength // if the remaining input is too short, we can't generate any n-grams
+ || curGramSize > maxGram)) { // if we have hit the end of our n-gram size range, quit
+ // grab gramSize chars from front or back
+ int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
+ int end = start + curGramSize;
+ offsetAtt.setOffset(start, end);
+ termAtt.setTermBuffer(curTermBuffer, start, curGramSize);
+ curGramSize++;
+ return true;
+ }
+ }
+ curTermBuffer = null;
}
}
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws java.io.IOException {
+ return super.next(reusableToken);
+ }
- private void ngram(final Token token) {
- int termLength = token.termLength();
- char[] termBuffer = token.termBuffer();
- int gramSize = minGram;
- while (gramSize <= maxGram) {
- // if the remaining input is too short, we can't generate any n-grams
- if (gramSize > termLength) {
- return;
- }
-
- // if we have hit the end of our n-gram size range, quit
- if (gramSize > maxGram) {
- return;
- }
-
- // grab gramSize chars from front or back
- int start = side == Side.FRONT ? 0 : termLength - gramSize;
- int end = start + gramSize;
- Token tok = (Token) token.clone();
- tok.setStartOffset(start);
- tok.setEndOffset(end);
- tok.setTermBuffer(termBuffer, start, gramSize);
- ngrams.add(tok);
- gramSize++;
- }
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws java.io.IOException {
+ return super.next();
}
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
index e6fe22b02a8..179ab33208f 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
@@ -20,6 +20,8 @@ package org.apache.lucene.analysis.ngram;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter.Side;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.IOException;
import java.io.Reader;
@@ -35,6 +37,9 @@ public class EdgeNGramTokenizer extends Tokenizer {
public static final Side DEFAULT_SIDE = Side.FRONT;
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
// Replace this with an enum when the Java 1.5 upgrade is made, the impl will be simplified
/** Specifies which side of the input the n-gram should be generated from */
@@ -100,6 +105,9 @@ public class EdgeNGramTokenizer extends Tokenizer {
this.minGram = minGram;
this.maxGram = maxGram;
this.side = side;
+
+ this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
@@ -114,8 +122,7 @@ public class EdgeNGramTokenizer extends Tokenizer {
}
/** Returns the next token in the stream, or null at EOS. */
- public final Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
+ public final boolean incrementToken() throws IOException {
// if we are just starting, read the whole input
if (!started) {
started = true;
@@ -128,21 +135,32 @@ public class EdgeNGramTokenizer extends Tokenizer {
// if the remaining input is too short, we can't generate any n-grams
if (gramSize > inLen) {
- return null;
+ return false;
}
// if we have hit the end of our n-gram size range, quit
if (gramSize > maxGram) {
- return null;
+ return false;
}
// grab gramSize chars from front or back
int start = side == Side.FRONT ? 0 : inLen - gramSize;
int end = start + gramSize;
- reusableToken.setTermBuffer(inStr, start, gramSize);
- reusableToken.setStartOffset(input.correctOffset(start));
- reusableToken.setEndOffset(input.correctOffset(end));
+ termAtt.setTermBuffer(inStr, start, gramSize);
+ offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(end));
gramSize++;
- return reusableToken;
+ return true;
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws java.io.IOException {
+ return super.next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws java.io.IOException {
+ return super.next();
}
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
index 761ec1891c8..ebf9fc0bdc0 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
@@ -17,12 +17,13 @@ package org.apache.lucene.analysis.ngram;
* limitations under the License.
*/
+import java.io.IOException;
+
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-
-import java.io.IOException;
-import java.util.LinkedList;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Tokenizes the input into n-grams of the given size(s).
@@ -32,7 +33,14 @@ public class NGramTokenFilter extends TokenFilter {
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
private int minGram, maxGram;
- private LinkedList ngrams;
+
+ private char[] curTermBuffer;
+ private int curTermLength;
+ private int curGramSize;
+ private int curPos;
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
/**
* Creates NGramTokenFilter with given min and max n-grams.
@@ -50,7 +58,9 @@ public class NGramTokenFilter extends TokenFilter {
}
this.minGram = minGram;
this.maxGram = maxGram;
- this.ngrams = new LinkedList();
+
+ this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
/**
@@ -62,40 +72,41 @@ public class NGramTokenFilter extends TokenFilter {
}
/** Returns the next token in the stream, or null at EOS. */
- public final Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- if (!ngrams.isEmpty()) {
- return (Token)ngrams.removeFirst();
- }
-
- Token token = null;
-
- while (ngrams.isEmpty() && (token = input.next()) != null) {
- ngram(token);
- }
-
- if (token == null) {
- return null;
- }
-
- if (!ngrams.isEmpty()) {
- return (Token)ngrams.removeFirst();
- } else {
- return null;
+ public final boolean incrementToken() throws IOException {
+ while (true) {
+ if (curTermBuffer == null) {
+ if (!input.incrementToken()) {
+ return false;
+ } else {
+ curTermBuffer = (char[]) termAtt.termBuffer().clone();
+ curTermLength = termAtt.termLength();
+ curGramSize = minGram;
+ curPos = 0;
+ }
+ }
+ while (curGramSize <= maxGram) {
+ while (curPos+curGramSize <= curTermLength) { // while there is input
+ termAtt.setTermBuffer(curTermBuffer, curPos, curGramSize);
+ offsetAtt.setOffset(curPos, curPos+curGramSize);
+ curPos++;
+ return true;
+ }
+ curGramSize++; // increase n-gram size
+ curPos = 0;
+ }
+ curTermBuffer = null;
}
}
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws java.io.IOException {
+ return super.next(reusableToken);
+ }
- private void ngram(Token token) {
- char[] termBuffer = token.termBuffer();
- int termLength = token.termLength();
- int gramSize = minGram;
- while (gramSize <= maxGram) {
- int pos = 0; // reset to beginning of string
- while (pos+gramSize <= termLength) { // while there is input
- ngrams.add(token.clone(termBuffer, pos, gramSize, pos, pos+gramSize));
- pos++;
- }
- gramSize++; // increase n-gram size
- }
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws java.io.IOException {
+ return super.next();
}
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
index 9bfb4d309e7..72f7d8be36f 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
@@ -19,6 +19,8 @@ package org.apache.lucene.analysis.ngram;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.IOException;
import java.io.Reader;
@@ -36,6 +38,9 @@ public class NGramTokenizer extends Tokenizer {
private int inLen;
private String inStr;
private boolean started = false;
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
/**
* Creates NGramTokenizer with given min and max n-grams.
@@ -53,6 +58,9 @@ public class NGramTokenizer extends Tokenizer {
}
this.minGram = minGram;
this.maxGram = maxGram;
+
+ this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
/**
* Creates NGramTokenizer with default min and max n-grams.
@@ -63,8 +71,7 @@ public class NGramTokenizer extends Tokenizer {
}
/** Returns the next token in the stream, or null at EOS. */
- public final Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
+ public final boolean incrementToken() throws IOException {
if (!started) {
started = true;
gramSize = minGram;
@@ -78,13 +85,27 @@ public class NGramTokenizer extends Tokenizer {
pos = 0; // reset to beginning of string
gramSize++; // increase n-gram size
if (gramSize > maxGram) // we are done
- return null;
+ return false;
if (pos+gramSize > inLen)
- return null;
+ return false;
}
int oldPos = pos;
pos++;
- return reusableToken.reinit(inStr, oldPos, gramSize, input.correctOffset(oldPos), input.correctOffset(oldPos+gramSize));
+ termAtt.setTermBuffer(inStr, oldPos, gramSize);
+ offsetAtt.setOffset(input.correctOffset(oldPos), input.correctOffset(oldPos+gramSize));
+ return true;
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws java.io.IOException {
+ return super.next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws java.io.IOException {
+ return super.next();
}
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java
index bacc5eec667..037ee028011 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java
@@ -17,15 +17,15 @@ package org.apache.lucene.analysis.nl;
* limitations under the License.
*/
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
-import java.util.Set;
import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* A filter that stems Dutch words. It supports a table of words that should
@@ -39,10 +39,13 @@ public final class DutchStemFilter extends TokenFilter {
*/
private DutchStemmer stemmer = null;
private Set exclusions = null;
+
+ private TermAttribute termAtt;
public DutchStemFilter(TokenStream _in) {
super(_in);
stemmer = new DutchStemmer();
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
/**
@@ -62,24 +65,23 @@ public final class DutchStemFilter extends TokenFilter {
}
/**
- * @return Returns the next token in the stream, or null at EOS
+ * Advances to the next token in the stream; returns false at EOS.
*/
- public Token next(Token reusableToken) throws IOException {
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken);
- if (nextToken == null)
- return null;
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ String term = termAtt.term();
- String term = nextToken.term();
-
- // Check the exclusion table.
- if (exclusions == null || !exclusions.contains(term)) {
- String s = stemmer.stem(term);
- // If not stemmed, don't waste the time adjusting the token.
- if ((s != null) && !s.equals(term))
- nextToken.setTermBuffer(s);
+ // Check the exclusion table.
+ if (exclusions == null || !exclusions.contains(term)) {
+ String s = stemmer.stem(term);
+ // If not stemmed, don't waste the time adjusting the token.
+ if ((s != null) && !s.equals(term))
+ termAtt.setTermBuffer(s);
+ }
+ return true;
+ } else {
+ return false;
}
- return nextToken;
}
/**
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java
index f4931153506..ab022c2297f 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.java
@@ -16,14 +16,13 @@ package org.apache.lucene.analysis.payloads;
* limitations under the License.
*/
+import java.io.IOException;
+
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import java.io.IOException;
-
/**
* Characters before the delimiter are the "token", those after are the payload.
@@ -37,7 +36,7 @@ import java.io.IOException;
*
* @see PayloadEncoder
*/
-public class DelimitedPayloadTokenFilter extends TokenFilter {
+public final class DelimitedPayloadTokenFilter extends TokenFilter {
public static final char DEFAULT_DELIMITER = '|';
protected char delimiter = DEFAULT_DELIMITER;
protected TermAttribute termAtt;
@@ -83,27 +82,4 @@ public class DelimitedPayloadTokenFilter extends TokenFilter {
}
return result;
}
-
-
- public Token next(Token reusableToken) throws IOException {
- Token result = input.next(reusableToken);
- if (result != null) {
- final char[] buffer = result.termBuffer();
- final int length = result.termLength();
- boolean seen = false;
- for (int i = 0; i < length; i++) {
- if (buffer[i] == delimiter) {
- result.setTermBuffer(buffer, 0, i);
- result.setPayload(encoder.encode(buffer, i + 1, (length - (i + 1))));
- seen = true;
- break;//at this point, we know the whole piece, so we can exit. If we don't see the delimiter, then the termAtt is the same
- }
- }
- if (seen == false) {
- //no delimiter
- payAtt.setPayload(null);
- }
- }
- return result;
- }
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java
index 2e796492448..7999ca0b7c6 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java
@@ -20,6 +20,8 @@ package org.apache.lucene.analysis.payloads;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.Payload;
import java.io.IOException;
@@ -34,19 +36,37 @@ public class NumericPayloadTokenFilter extends TokenFilter {
private String typeMatch;
private Payload thePayload;
+ private PayloadAttribute payloadAtt;
+ private TypeAttribute typeAtt;
+
public NumericPayloadTokenFilter(TokenStream input, float payload, String typeMatch) {
super(input);
//Need to encode the payload
thePayload = new Payload(PayloadHelper.encodeFloat(payload));
this.typeMatch = typeMatch;
+ payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+ typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
}
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken);
- if (nextToken != null && nextToken.type().equals(typeMatch)){
- nextToken.setPayload(thePayload);
+ public final boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (typeAtt.type().equals(typeMatch))
+ payloadAtt.setPayload(thePayload);
+ return true;
+ } else {
+ return false;
}
- return nextToken;
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws java.io.IOException {
+ return super.next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws java.io.IOException {
+ return super.next();
}
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java
index a3a56d12506..76add35b780 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java
@@ -17,13 +17,15 @@ package org.apache.lucene.analysis.payloads;
*/
+import java.io.IOException;
+
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.index.Payload;
-import java.io.IOException;
-
/**
* Adds the {@link org.apache.lucene.analysis.Token#setStartOffset(int)}
@@ -32,22 +34,37 @@ import java.io.IOException;
*
**/
public class TokenOffsetPayloadTokenFilter extends TokenFilter {
-
+ protected OffsetAttribute offsetAtt;
+ protected PayloadAttribute payAtt;
public TokenOffsetPayloadTokenFilter(TokenStream input) {
super(input);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ payAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
}
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken);
- if (nextToken != null){
+ public final boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
byte[] data = new byte[8];
- PayloadHelper.encodeInt(nextToken.startOffset(), data, 0);
- PayloadHelper.encodeInt(nextToken.endOffset(), data, 4);
+ PayloadHelper.encodeInt(offsetAtt.startOffset(), data, 0);
+ PayloadHelper.encodeInt(offsetAtt.endOffset(), data, 4);
Payload payload = new Payload(data);
- nextToken.setPayload(payload);
+ payAtt.setPayload(payload);
+ return true;
+ } else {
+ return false;
}
- return nextToken;
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws java.io.IOException {
+ return super.next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws java.io.IOException {
+ return super.next();
}
}
\ No newline at end of file
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java
index 19b191b6974..bd26e536e57 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java
@@ -20,6 +20,8 @@ package org.apache.lucene.analysis.payloads;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.Payload;
import java.io.IOException;
@@ -32,19 +34,37 @@ import java.io.IOException;
*
**/
public class TypeAsPayloadTokenFilter extends TokenFilter {
+ private PayloadAttribute payloadAtt;
+ private TypeAttribute typeAtt;
public TypeAsPayloadTokenFilter(TokenStream input) {
super(input);
-
+ payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+ typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
}
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken);
- if (nextToken != null && nextToken.type() != null && nextToken.type().equals("") == false){
- nextToken.setPayload(new Payload(nextToken.type().getBytes("UTF-8")));
+ public final boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ String type = typeAtt.type();
+ if (type != null && type.equals("") == false) {
+ payloadAtt.setPayload(new Payload(type.getBytes("UTF-8")));
+ }
+ return true;
+ } else {
+ return false;
}
- return nextToken;
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws java.io.IOException {
+ return super.next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws java.io.IOException {
+ return super.next();
}
}
\ No newline at end of file
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java
index d1cc144f14e..6eb66372007 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java
@@ -22,6 +22,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/** Set the positionIncrement of all tokens to the "positionIncrement",
* except the first return token which retains its original positionIncrement value.
@@ -34,6 +35,8 @@ public class PositionFilter extends TokenFilter {
/** The first token must have non-zero positionIncrement **/
private boolean firstTokenPositioned = false;
+
+ private PositionIncrementAttribute posIncrAtt;
/**
* Constructs a PositionFilter that assigns a position increment of zero to
@@ -43,6 +46,7 @@ public class PositionFilter extends TokenFilter {
*/
public PositionFilter(final TokenStream input) {
super(input);
+ posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
}
/**
@@ -58,18 +62,29 @@ public class PositionFilter extends TokenFilter {
this.positionIncrement = positionIncrement;
}
- public Token next(Token reusableToken) throws IOException {
-
- assert reusableToken != null;
- reusableToken = input.next(reusableToken);
- if (null != reusableToken) {
+ public final boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
if (firstTokenPositioned) {
- reusableToken.setPositionIncrement(positionIncrement);
+ posIncrAtt.setPositionIncrement(positionIncrement);
} else {
firstTokenPositioned = true;
}
+ return true;
+ } else {
+ return false;
}
- return reusableToken;
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws java.io.IOException {
+ return super.next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws java.io.IOException {
+ return super.next();
}
public void reset() throws IOException {
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java
index 6a07a70eba6..90dc8812446 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java
@@ -19,7 +19,7 @@ package org.apache.lucene.analysis.reverse;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.IOException;
@@ -30,16 +30,20 @@ import java.io.IOException;
*/
public final class ReverseStringFilter extends TokenFilter {
+ private TermAttribute termAtt;
+
public ReverseStringFilter(TokenStream in) {
super(in);
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
- public final Token next(Token in) throws IOException {
- assert in != null;
- Token token=input.next(in);
- if( token == null ) return null;
- reverse( token.termBuffer(), token.termLength() );
- return token;
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ reverse( termAtt.termBuffer(), termAtt.termLength() );
+ return true;
+ } else {
+ return false;
+ }
}
public static String reverse( final String input ){
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
index d27e1800f9f..cd54f0b5712 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
@@ -17,9 +17,12 @@ package org.apache.lucene.analysis.ru;
* limitations under the License.
*/
+import java.io.IOException;
+
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Normalizes token text to lower case, analyzing given ("russian") charset.
@@ -31,26 +34,27 @@ public final class RussianLowerCaseFilter extends TokenFilter
{
char[] charset;
+ private TermAttribute termAtt;
+
public RussianLowerCaseFilter(TokenStream in, char[] charset)
{
super(in);
this.charset = charset;
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
- public final Token next(final Token reusableToken) throws java.io.IOException
+ public final boolean incrementToken() throws IOException
{
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken);
-
- if (nextToken == null)
- return null;
-
- char[] chArray = nextToken.termBuffer();
- int chLen = nextToken.termLength();
+ if (input.incrementToken()) {
+ char[] chArray = termAtt.termBuffer();
+ int chLen = termAtt.termLength();
for (int i = 0; i < chLen; i++)
{
- chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
+ chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
}
- return nextToken;
+ return true;
+ } else {
+ return false;
+ }
}
}
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
index f39eea92444..ab87c2b2ea0 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
@@ -20,6 +20,8 @@ package org.apache.lucene.analysis.ru;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
import java.io.IOException;
/**
@@ -37,29 +39,32 @@ public final class RussianStemFilter extends TokenFilter
*/
private RussianStemmer stemmer = null;
+ private TermAttribute termAtt;
+
public RussianStemFilter(TokenStream in, char[] charset)
{
super(in);
stemmer = new RussianStemmer(charset);
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
/**
- * @return Returns the next token in the stream, or null at EOS
+ * Returns true when the next token is available, or false at EOS
*/
- public final Token next(final Token reusableToken) throws IOException
+ public final boolean incrementToken() throws IOException
{
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken);
- if (nextToken == null)
- return null;
-
- String term = nextToken.term();
+ if (input.incrementToken()) {
+ String term = termAtt.term();
String s = stemmer.stem(term);
if (s != null && !s.equals(term))
- nextToken.setTermBuffer(s);
- return nextToken;
+ termAtt.setTermBuffer(s);
+ return true;
+ } else {
+ return false;
+ }
}
+
/**
* Set a alternative/custom RussianStemmer for this filter.
*/
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
index 0a5f99bcbac..055a0b1674e 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
@@ -22,6 +22,9 @@ import java.lang.Character.UnicodeBlock;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
import java.text.BreakIterator;
/**
@@ -32,46 +35,62 @@ import java.text.BreakIterator;
public class ThaiWordFilter extends TokenFilter {
private BreakIterator breaker = null;
- private Token thaiToken = null;
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+
+ private State thaiState = null;
+
public ThaiWordFilter(TokenStream input) {
super(input);
breaker = BreakIterator.getWordInstance(new Locale("th"));
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- if (thaiToken != null) {
+ public final boolean incrementToken() throws IOException {
+ if (thaiState != null) {
int start = breaker.current();
int end = breaker.next();
if (end != BreakIterator.DONE) {
- reusableToken.reinit(thaiToken, thaiToken.termBuffer(), start, end - start);
- reusableToken.setStartOffset(thaiToken.startOffset()+start);
- reusableToken.setEndOffset(thaiToken.startOffset()+end);
- return reusableToken;
+ restoreState(thaiState);
+ termAtt.setTermBuffer(termAtt.termBuffer(), start, end - start);
+ offsetAtt.setOffset(offsetAtt.startOffset() + start, offsetAtt.startOffset() + end);
+ return true;
}
- thaiToken = null;
+ thaiState = null;
}
- Token nextToken = input.next(reusableToken);
- if (nextToken == null || nextToken.termLength() == 0) {
- return null;
- }
+ if (input.incrementToken() == false || termAtt.termLength() == 0)
+ return false;
- String text = nextToken.term();
+ String text = termAtt.term();
if (UnicodeBlock.of(text.charAt(0)) != UnicodeBlock.THAI) {
- nextToken.setTermBuffer(text.toLowerCase());
- return nextToken;
+ termAtt.setTermBuffer(text.toLowerCase());
+ return true;
}
+
+ thaiState = captureState();
- thaiToken = (Token) nextToken.clone();
breaker.setText(text);
int end = breaker.next();
if (end != BreakIterator.DONE) {
- nextToken.setTermBuffer(text, 0, end);
- nextToken.setEndOffset(nextToken.startOffset() + end);
- return nextToken;
+ termAtt.setTermBuffer(text, 0, end);
+ offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset() + end);
+ return true;
}
- return null;
+ return false;
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws java.io.IOException {
+ return super.next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws java.io.IOException {
+ return super.next();
}
}
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
index bbebe979a0c..99d170eb31e 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
@@ -17,18 +17,12 @@ package org.apache.lucene.analysis.ar;
* limitations under the License.
*/
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
import java.io.IOException;
-import java.io.InputStreamReader;
import java.io.StringReader;
import junit.framework.TestCase;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Test the Arabic Normalization Filter
@@ -95,11 +89,10 @@ public class TestArabicNormalizationFilter extends TestCase {
private void check(final String input, final String expected) throws IOException {
ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
ArabicNormalizationFilter filter = new ArabicNormalizationFilter(tokenStream);
- final Token reusableToken = new Token();
- Token nextToken = filter.next(reusableToken);
- if (nextToken == null)
- fail();
- assertEquals(expected, nextToken.term());
+ TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
+
+ assertTrue(filter.incrementToken());
+ assertEquals(expected, termAtt.term());
filter.close();
}
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
index 01dc5449ade..9e4bcfdf53b 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java
@@ -17,17 +17,12 @@ package org.apache.lucene.analysis.ar;
* limitations under the License.
*/
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
import java.io.IOException;
-import java.io.InputStreamReader;
import java.io.StringReader;
import junit.framework.TestCase;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Test the Arabic Normalization Filter
@@ -118,11 +113,10 @@ public class TestArabicStemFilter extends TestCase {
private void check(final String input, final String expected) throws IOException {
ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
ArabicStemFilter filter = new ArabicStemFilter(tokenStream);
- final Token reusableToken = new Token();
- Token nextToken = filter.next(reusableToken);
- if (nextToken == null)
- fail();
- assertEquals(expected, nextToken.term());
+ TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
+
+ assertTrue(filter.incrementToken());
+ assertEquals(expected, termAtt.term());
filter.close();
}
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
index 9c0fdc36f23..e1c9062425f 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
@@ -23,8 +23,8 @@ import java.io.StringReader;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Test the Brazilian Stem Filter, which only modifies the term text.
@@ -122,12 +122,10 @@ public class TestBrazilianStemmer extends TestCase {
private void check(final String input, final String expected) throws IOException {
Analyzer analyzer = new BrazilianAnalyzer();
TokenStream stream = analyzer.tokenStream("dummy", new StringReader(input));
- final Token reusableToken = new Token();
- Token nextToken = stream.next(reusableToken);
- if (nextToken == null)
- fail();
- assertEquals(expected, nextToken.term());
- assertTrue(stream.next(nextToken) == null);
+ TermAttribute text = (TermAttribute) stream.getAttribute(TermAttribute.class);
+ assertTrue(stream.incrementToken());
+ assertEquals(expected, text.term());
+ assertFalse(stream.incrementToken());
stream.close();
}
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
index 36268792d10..c15ea48d964 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
@@ -21,50 +21,49 @@ import java.io.IOException;
import java.io.StringReader;
import junit.framework.TestCase;
-import org.apache.lucene.analysis.Token;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
public class TestCJKTokenizer extends TestCase{
+
+ class TestToken {
+ String termText;
+ int start;
+ int end;
+ String type;
+ }
- public Token newToken(String termText, int start, int end, int type) {
- Token token = new Token(start, end);
- token.setTermBuffer(termText);
- token.setType(CJKTokenizer.TOKEN_TYPE_NAMES[type]);
+ public TestToken newToken(String termText, int start, int end, int type) {
+ TestToken token = new TestToken();
+ token.termText = termText;
+ token.type = CJKTokenizer.TOKEN_TYPE_NAMES[type];
+ token.start = start;
+ token.end = end;
return token;
}
- public void checkCJKToken(final String str, final Token[] out_tokens) throws IOException {
+ public void checkCJKToken(final String str, final TestToken[] out_tokens) throws IOException {
CJKTokenizer tokenizer = new CJKTokenizer(new StringReader(str));
- int i = 0;
- System.out.println("string[" + str + "]");
- System.out.print("tokens[");
- final Token reusableToken = new Token();
- for (Token token = tokenizer.next(reusableToken) ;
- token != null ;
- token = tokenizer.next(reusableToken) ) {
- if (token.term().equals(out_tokens[i].term())
- && token.startOffset() == out_tokens[i].startOffset()
- && token.endOffset() == out_tokens[i].endOffset()
- && token.type().equals(out_tokens[i].type()) ) {
- System.out.print( token.term() + " ");
- }
- else {
- fail(token.term() + " (start: " + token.startOffset()
- + " end: " + token.endOffset() + " type: " + token.type() + ") != "
- + out_tokens[i].term() + " (start: " + out_tokens[i].startOffset()
- + " end: " + out_tokens[i].endOffset()
- + " type: " + out_tokens[i].type() + ")");
- break;
- }
- ++i;
+ TermAttribute termAtt = (TermAttribute) tokenizer.getAttribute(TermAttribute.class);
+ OffsetAttribute offsetAtt = (OffsetAttribute) tokenizer.getAttribute(OffsetAttribute.class);
+ TypeAttribute typeAtt = (TypeAttribute) tokenizer.getAttribute(TypeAttribute.class);
+ for (int i = 0; i < out_tokens.length; i++) {
+ assertTrue(tokenizer.incrementToken());
+ assertEquals(termAtt.term(), out_tokens[i].termText);
+ assertEquals(offsetAtt.startOffset(), out_tokens[i].start);
+ assertEquals(offsetAtt.endOffset(), out_tokens[i].end);
+ assertEquals(typeAtt.type(), out_tokens[i].type);
}
- System.out.println("]" + System.getProperty("line.separator"));
+ assertFalse(tokenizer.incrementToken());
}
public void testJa1() throws IOException {
String str = "\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341";
- Token[] out_tokens = {
+ TestToken[] out_tokens = {
newToken("\u4e00\u4e8c", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u4e8c\u4e09", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u4e09\u56db", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
@@ -81,7 +80,7 @@ public class TestCJKTokenizer extends TestCase{
public void testJa2() throws IOException {
String str = "\u4e00 \u4e8c\u4e09\u56db \u4e94\u516d\u4e03\u516b\u4e5d \u5341";
- Token[] out_tokens = {
+ TestToken[] out_tokens = {
newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u4e8c\u4e09", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u4e09\u56db", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
@@ -97,7 +96,7 @@ public class TestCJKTokenizer extends TestCase{
public void testC() throws IOException {
String str = "abc defgh ijklmn opqrstu vwxy z";
- Token[] out_tokens = {
+ TestToken[] out_tokens = {
newToken("abc", 0, 3, CJKTokenizer.SINGLE_TOKEN_TYPE),
newToken("defgh", 4, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
newToken("ijklmn", 10, 16, CJKTokenizer.SINGLE_TOKEN_TYPE),
@@ -111,7 +110,7 @@ public class TestCJKTokenizer extends TestCase{
public void testMix() throws IOException {
String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";
- Token[] out_tokens = {
+ TestToken[] out_tokens = {
newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
@@ -128,7 +127,7 @@ public class TestCJKTokenizer extends TestCase{
public void testMix2() throws IOException {
String str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053";
- Token[] out_tokens = {
+ TestToken[] out_tokens = {
newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
@@ -147,7 +146,7 @@ public class TestCJKTokenizer extends TestCase{
public void testSingleChar() throws IOException {
String str = "\u4e00";
- Token[] out_tokens = {
+ TestToken[] out_tokens = {
newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
};
checkCJKToken(str, out_tokens);
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java
index 2990f40cda1..32417f26ce6 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java
@@ -22,7 +22,7 @@ import java.io.StringReader;
import junit.framework.TestCase;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
public class TestChineseTokenizer extends TestCase
@@ -34,12 +34,12 @@ public class TestChineseTokenizer extends TestCase
int correctStartOffset = 0;
int correctEndOffset = 1;
- final Token reusableToken = new Token();
- for (Token nextToken = tokenizer.next(reusableToken); nextToken != null; nextToken = tokenizer.next(reusableToken)) {
- assertEquals(correctStartOffset, nextToken.startOffset());
- assertEquals(correctEndOffset, nextToken.endOffset());
- correctStartOffset++;
- correctEndOffset++;
+ OffsetAttribute offsetAtt = (OffsetAttribute) tokenizer.getAttribute(OffsetAttribute.class);
+ while (tokenizer.incrementToken()) {
+ assertEquals(correctStartOffset, offsetAtt.startOffset());
+ assertEquals(correctEndOffset, offsetAtt.endOffset());
+ correctStartOffset++;
+ correctEndOffset++;
}
}
}
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
index 581d47ebf52..d51edb064c6 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
@@ -31,15 +31,14 @@ import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
-import org.apache.lucene.analysis.Token;
+import junit.framework.TestCase;
+
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
-import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
-import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
-
-import junit.framework.TestCase;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
public class TestCompoundWordTokenFilter extends TestCase {
private static String[] locations = {
@@ -155,16 +154,18 @@ public class TestCompoundWordTokenFilter extends TestCase {
private void assertFiltersTo(TokenFilter tf, String[] s, int[] startOffset,
int[] endOffset, int[] posIncr) throws Exception {
- final Token reusableToken = new Token();
+ TermAttribute termAtt = (TermAttribute) tf.getAttribute(TermAttribute.class);
+ OffsetAttribute offsetAtt = (OffsetAttribute) tf.getAttribute(OffsetAttribute.class);
+ PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) tf.getAttribute(PositionIncrementAttribute.class);
+
for (int i = 0; i < s.length; ++i) {
- Token nextToken = tf.next(reusableToken);
- assertNotNull(nextToken);
- assertEquals(s[i], nextToken.term());
- assertEquals(startOffset[i], nextToken.startOffset());
- assertEquals(endOffset[i], nextToken.endOffset());
- assertEquals(posIncr[i], nextToken.getPositionIncrement());
+ assertTrue(tf.incrementToken());
+ assertEquals(s[i], termAtt.term());
+ assertEquals(startOffset[i], offsetAtt.startOffset());
+ assertEquals(endOffset[i], offsetAtt.endOffset());
+ assertEquals(posIncr[i], posIncAtt.getPositionIncrement());
}
- assertNull(tf.next(reusableToken));
+ assertFalse(tf.incrementToken());
}
private void getHyphenationPatternFileContents() {
diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
index 0848f522152..5460c95d5f7 100644
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
@@ -22,8 +22,8 @@ import java.io.StringReader;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Test the CzechAnalyzer
@@ -39,13 +39,12 @@ public class TestCzechAnalyzer extends TestCase {
private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
- final Token reusableToken = new Token();
+ TermAttribute text = (TermAttribute) ts.getAttribute(TermAttribute.class);
for (int i=0; i
*/
-public class SentenceTokenizer extends Tokenizer {
+public final class SentenceTokenizer extends Tokenizer {
/**
* End of sentence punctuation: 。,!?;,!?;
@@ -39,12 +41,19 @@ public class SentenceTokenizer extends Tokenizer {
private final StringBuffer buffer = new StringBuffer();
private int tokenStart = 0, tokenEnd = 0;
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+ private TypeAttribute typeAtt;
public SentenceTokenizer(Reader reader) {
super(reader);
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
}
- public Token next(final Token reusableToken) throws IOException {
+ public boolean incrementToken() throws IOException {
buffer.setLength(0);
int ci;
char ch, pch;
@@ -83,11 +92,12 @@ public class SentenceTokenizer extends Tokenizer {
}
}
if (buffer.length() == 0)
- return null;
+ return false;
else {
- reusableToken.clear();
- reusableToken.reinit(buffer.toString(), input.correctOffset(tokenStart), input.correctOffset(tokenEnd), "sentence");
- return reusableToken;
+ termAtt.setTermBuffer(buffer.toString());
+ offsetAtt.setOffset(input.correctOffset(tokenStart), input.correctOffset(tokenEnd));
+ typeAtt.setType("sentence");
+ return true;
}
}
diff --git a/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java b/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java
index 1a79ae0508a..db3d9deb2e6 100644
--- a/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java
+++ b/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.cn.smart;
import java.util.ArrayList;
import java.util.List;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter;
import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;
import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter;
@@ -37,11 +36,11 @@ class WordSegmenter {
/**
* Segment a sentence into words with {@link HHMMSegmenter}
*
- * @param sentenceToken sentence {@link Token}
+ * @param sentence input sentence
+ * @param startOffset start offset of sentence
* @return {@link List} of {@link SegToken}
*/
- public List segmentSentence(Token sentenceToken) {
- String sentence = sentenceToken.term();
+ public List segmentSentence(String sentence, int startOffset) {
List segTokenList = hhmmSegmenter.process(sentence);
@@ -49,25 +48,25 @@ class WordSegmenter {
// tokens from sentence, excluding WordType.SENTENCE_BEGIN and WordType.SENTENCE_END
for (int i = 1; i < segTokenList.size() - 1; i++) {
- result.add(convertSegToken((SegToken) segTokenList.get(i), sentence,
- sentenceToken.startOffset(), "word"));
+ result.add(convertSegToken((SegToken) segTokenList.get(i), sentence, startOffset));
}
return result;
}
/**
- * Convert a {@link SegToken} to a Lucene {@link Token}
+ * Process a {@link SegToken} so that it is ready for indexing.
+ *
+ * This method calculates offsets and normalizes the token with {@link SegTokenFilter}.
*
* @param st input {@link SegToken}
* @param sentence associated Sentence
* @param sentenceStartOffset offset into sentence
- * @param type token type, default is word
- * @return Lucene {@link Token}
+ * @return the processed {@link SegToken}, normalized and with offsets adjusted
*/
- public Token convertSegToken(SegToken st, String sentence,
- int sentenceStartOffset, String type) {
- Token result;
+ public SegToken convertSegToken(SegToken st, String sentence,
+ int sentenceStartOffset) {
+
switch (st.wordType) {
case WordType.STRING:
case WordType.NUMBER:
@@ -81,9 +80,8 @@ class WordSegmenter {
}
st = tokenFilter.filter(st);
-
- result = new Token(st.charArray, 0, st.charArray.length, st.startOffset
- + sentenceStartOffset, st.endOffset + sentenceStartOffset);
- return result;
+ st.startOffset += sentenceStartOffset;
+ st.endOffset += sentenceStartOffset;
+ return st;
}
}
diff --git a/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java b/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java
index df9fc845465..5882375e990 100644
--- a/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java
+++ b/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java
@@ -21,20 +21,27 @@ import java.io.IOException;
import java.util.Iterator;
import java.util.List;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/**
* A {@link TokenFilter} that breaks sentences into words.
*/
-public class WordTokenFilter extends TokenFilter {
+public final class WordTokenFilter extends TokenFilter {
private WordSegmenter wordSegmenter;
private Iterator tokenIter;
private List tokenBuffer;
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+ private TypeAttribute typeAtt;
/**
* Construct a new WordTokenizer.
@@ -44,32 +51,34 @@ public class WordTokenFilter extends TokenFilter {
public WordTokenFilter(TokenStream in) {
super(in);
this.wordSegmenter = new WordSegmenter();
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
}
-
- public Token next(final Token reusableSentenceToken) throws IOException {
- if (tokenIter != null && tokenIter.hasNext())
- return (Token) tokenIter.next();
- else {
- Token nextToken = input.next(reusableSentenceToken);
- if (processNextSentence(nextToken)) {
- return (Token) tokenIter.next();
- } else
- return null;
- }
- }
-
- /**
- * Process the next input sentence, placing tokens into tokenBuffer
- *
- * @param reusableSentenceToken input sentence
- * @return true if more tokens were placed into tokenBuffer.
- * @throws IOException
- */
- private boolean processNextSentence(final Token reusableSentenceToken) throws IOException {
- if (reusableSentenceToken == null)
- return false;
- tokenBuffer = wordSegmenter.segmentSentence(reusableSentenceToken);
- tokenIter = tokenBuffer.iterator();
- return tokenBuffer != null && tokenIter.hasNext();
+
+ public boolean incrementToken() throws IOException {
+ if (tokenIter == null || !tokenIter.hasNext()) {
+ // there are no remaining tokens from the current sentence... are there more sentences?
+ if (input.incrementToken()) {
+ // a new sentence is available: process it.
+ tokenBuffer = wordSegmenter.segmentSentence(termAtt.term(), offsetAtt.startOffset());
+ tokenIter = tokenBuffer.iterator();
+ /*
+ * it should not be possible to have a sentence with 0 words, check just in case.
+ * returning EOS isn't the best either, but its the behavior of the original code.
+ */
+ if (!tokenIter.hasNext())
+ return false;
+ } else {
+ return false; // no more sentences, end of stream!
+ }
+ }
+
+ // There are remaining tokens from the current sentence, return the next one.
+ SegToken nextWord = (SegToken) tokenIter.next();
+ termAtt.setTermBuffer(nextWord.charArray, 0, nextWord.charArray.length);
+ offsetAtt.setOffset(nextWord.startOffset, nextWord.endOffset);
+ typeAtt.setType("word");
+ return true;
}
}
diff --git a/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java b/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java
index b267328a5f1..732f7f1fac2 100644
--- a/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java
+++ b/contrib/analyzers/smartcn/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java
@@ -29,6 +29,9 @@ import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
public class TestSmartChineseAnalyzer extends TestCase {
@@ -108,22 +111,23 @@ public class TestSmartChineseAnalyzer extends TestCase {
public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[])
throws Exception {
- TokenStream ts = a.tokenStream("dummy", new StringReader(input));
- final Token reusableToken = new Token();
- for (int i = 0; i < output.length; i++) {
- Token nextToken = ts.next(reusableToken);
- assertNotNull(nextToken);
- assertEquals(nextToken.term(), output[i]);
+ TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+ TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
+ OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
+ TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
+ for (int i = 0; i < output.length; i++) {
+ assertTrue(ts.incrementToken());
+ assertEquals(termAtt.term(), output[i]);
if (startOffsets != null)
- assertEquals(nextToken.startOffset(), startOffsets[i]);
+ assertEquals(offsetAtt.startOffset(), startOffsets[i]);
if (endOffsets != null)
- assertEquals(nextToken.endOffset(), endOffsets[i]);
+ assertEquals(offsetAtt.endOffset(), endOffsets[i]);
if (types != null)
- assertEquals(nextToken.type(), types[i]);
+ assertEquals(typeAtt.type(), types[i]);
+ }
+ assertFalse(ts.incrementToken());
+ ts.close();
}
- assertNull(ts.next(reusableToken));
- ts.close();
-}
public void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
assertAnalyzesTo(a, input, output, null, null, null);
diff --git a/contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java b/contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java
index 41aae62ab7d..54d9bc33d9d 100644
--- a/contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java
+++ b/contrib/collation/src/java/org/apache/lucene/collation/CollationKeyFilter.java
@@ -21,6 +21,7 @@ package org.apache.lucene.collation;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.IndexableBinaryStringTools;
import java.io.IOException;
@@ -73,8 +74,9 @@ import java.text.Collator;
* {@link ICUCollationKeyFilter} on the query side, or vice versa.
*
*/
-public class CollationKeyFilter extends TokenFilter {
+public final class CollationKeyFilter extends TokenFilter {
private Collator collator = null;
+ private TermAttribute termAtt;
/**
* @param input Source token stream
@@ -83,25 +85,26 @@ public class CollationKeyFilter extends TokenFilter {
public CollationKeyFilter(TokenStream input, Collator collator) {
super(input);
this.collator = collator;
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
- public final Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken);
- if (nextToken != null) {
- char[] termBuffer = nextToken.termBuffer();
- String termText = new String(termBuffer, 0, nextToken.termLength());
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ char[] termBuffer = termAtt.termBuffer();
+ String termText = new String(termBuffer, 0, termAtt.termLength());
byte[] collationKey = collator.getCollationKey(termText).toByteArray();
ByteBuffer collationKeyBuf = ByteBuffer.wrap(collationKey);
int encodedLength
= IndexableBinaryStringTools.getEncodedLength(collationKeyBuf);
if (encodedLength > termBuffer.length) {
- nextToken.resizeTermBuffer(encodedLength);
+ termAtt.resizeTermBuffer(encodedLength);
}
- nextToken.setTermLength(encodedLength);
- CharBuffer wrappedTermBuffer = CharBuffer.wrap(nextToken.termBuffer());
+ termAtt.setTermLength(encodedLength);
+ CharBuffer wrappedTermBuffer = CharBuffer.wrap(termAtt.termBuffer());
IndexableBinaryStringTools.encode(collationKeyBuf, wrappedTermBuffer);
+ return true;
+ } else {
+ return false;
}
- return nextToken;
}
}
diff --git a/contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java b/contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java
index 27abe24c2e0..1bd4a510b67 100644
--- a/contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java
+++ b/contrib/collation/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java
@@ -24,6 +24,7 @@ import com.ibm.icu.text.RawCollationKey;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.IndexableBinaryStringTools;
import java.io.IOException;
@@ -69,9 +70,10 @@ import java.nio.CharBuffer;
* java.text.Collator over several languages.
*
*/
-public class ICUCollationKeyFilter extends TokenFilter {
+public final class ICUCollationKeyFilter extends TokenFilter {
private Collator collator = null;
private RawCollationKey reusableKey = new RawCollationKey();
+ private TermAttribute termAtt;
/**
*
@@ -81,25 +83,26 @@ public class ICUCollationKeyFilter extends TokenFilter {
public ICUCollationKeyFilter(TokenStream input, Collator collator) {
super(input);
this.collator = collator;
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
- public final Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken);
- if (nextToken != null) {
- char[] termBuffer = nextToken.termBuffer();
- String termText = new String(termBuffer, 0, nextToken.termLength());
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ char[] termBuffer = termAtt.termBuffer();
+ String termText = new String(termBuffer, 0, termAtt.termLength());
collator.getRawCollationKey(termText, reusableKey);
ByteBuffer collationKeyBuf = ByteBuffer.wrap(reusableKey.bytes, 0, reusableKey.size);
int encodedLength
= IndexableBinaryStringTools.getEncodedLength(collationKeyBuf);
if (encodedLength > termBuffer.length) {
- nextToken.resizeTermBuffer(encodedLength);
+ termAtt.resizeTermBuffer(encodedLength);
}
- nextToken.setTermLength(encodedLength);
- CharBuffer wrappedTermBuffer = CharBuffer.wrap(nextToken.termBuffer());
+ termAtt.setTermLength(encodedLength);
+ CharBuffer wrappedTermBuffer = CharBuffer.wrap(termAtt.termBuffer());
IndexableBinaryStringTools.encode(collationKeyBuf, wrappedTermBuffer);
+ return true;
+ } else {
+ return false;
}
- return nextToken;
}
}
diff --git a/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java b/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java
index 9e6ee39aa73..1cdfc8cd271 100644
--- a/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java
+++ b/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java
@@ -28,6 +28,8 @@ import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
@@ -193,11 +195,15 @@ public abstract class AbstractTestCase extends TestCase {
ch = 0;
}
- public Token next( Token reusableToken ) throws IOException {
+ TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ public boolean incrementToken() throws IOException {
if( !getNextPartialSnippet() )
- return null;
- reusableToken.reinit( snippet, startTerm, lenTerm, startOffset, startOffset + lenTerm );
- return reusableToken;
+ return false;
+
+ termAtt.setTermBuffer(snippet, startTerm, lenTerm);
+ offsetAtt.setOffset(startOffset, startOffset + lenTerm);
+ return true;
}
public int getFinalOffset() {
diff --git a/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java b/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java
index cdabf127b25..f3634379630 100644
--- a/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java
+++ b/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java
@@ -295,14 +295,21 @@ public class IndexTimeSynonymTest extends AbstractTestCase {
public TokenArrayAnalyzer( Token... tokens ){
this.tokens = tokens;
}
+
public TokenStream tokenStream(String fieldName, Reader reader) {
- return new TokenStream(){
+ final Token reusableToken = new Token();
+
+ TokenStream.setOnlyUseNewAPI(true);
+ TokenStream ts = new TokenStream(){
int p = 0;
- public Token next( Token reusableToken ) throws IOException {
- if( p >= tokens.length ) return null;
- return tokens[p++];
+ public boolean incrementToken() throws IOException {
+ if( p >= tokens.length ) return false;
+ tokens[p++].copyTo(reusableToken);
+ return true;
}
};
+ ts.addAttributeImpl(reusableToken);
+ return ts;
}
}
}
diff --git a/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java b/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java
index 62470a25ac2..496fba45e63 100644
--- a/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java
+++ b/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java
@@ -27,6 +27,7 @@ import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
@@ -44,6 +45,7 @@ import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.search.TopScoreDocCollector;
+import org.apache.lucene.util.AttributeSource;
/**
* Asserts equality of content and behaviour of two index readers.
@@ -175,23 +177,26 @@ public class TestIndicesEquals extends TestCase {
t.setPayload(new Payload(new byte[]{2}));
tokens.add(t);
tokens.add(createToken("fin", 7, 9));
- document.add(new Field("f", new TokenStream() {
+ final Token reusableToken = new Token();
+ TokenStream ts = new TokenStream() {
Iterator it = tokens.iterator();
-
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
+
+ public final boolean incrementToken() throws IOException {
if (!it.hasNext()) {
- return null;
+ return false;
}
- // Resettable token streams need to return clones.
- Token nextToken = (Token) it.next();
- return (Token) nextToken.clone();
+
+ reusableToken.reinit(it.next());
+ return true;
}
public void reset() throws IOException {
it = tokens.iterator();
}
- }));
+ };
+ ts.addAttributeImpl(reusableToken);
+
+ document.add(new Field("f", ts));
}
}
}
diff --git a/contrib/lucli/src/java/lucli/LuceneMethods.java b/contrib/lucli/src/java/lucli/LuceneMethods.java
index b12f1508b20..5430b34447d 100644
--- a/contrib/lucli/src/java/lucli/LuceneMethods.java
+++ b/contrib/lucli/src/java/lucli/LuceneMethods.java
@@ -75,6 +75,8 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
@@ -317,11 +319,14 @@ class LuceneMethods {
int position = 0;
// Tokenize field and add to postingTable
TokenStream stream = analyzer.tokenStream(fieldName, reader);
+ TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
+ PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);
+
try {
- for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
- position += (nextToken.getPositionIncrement() - 1);
+ while (stream.incrementToken()) {
+ position += (posIncrAtt.getPositionIncrement() - 1);
position++;
- String name = nextToken.term();
+ String name = termAtt.term();
Integer Count = (Integer) tokenMap.get(name);
if (Count == null) { // not in there yet
tokenMap.put(name, new Integer(1)); //first one
diff --git a/contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java b/contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java
index 6a9a5544cda..0ec2bcaf107 100644
--- a/contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java
+++ b/contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java
@@ -31,9 +31,13 @@ import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.PorterStemFilter;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
/**
* Various fulltext analysis utilities avoiding redundant code in several
@@ -71,21 +75,24 @@ public class AnalyzerUtil {
public TokenStream tokenStream(final String fieldName, Reader reader) {
return new TokenFilter(child.tokenStream(fieldName, reader)) {
private int position = -1;
-
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken); // from filter super class
- log.println(toString(nextToken));
- return nextToken;
+ private TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ private PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ private OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ private TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+
+ public boolean incrementToken() throws IOException {
+ boolean hasNext = input.incrementToken();
+ log.println(toString(hasNext));
+ return hasNext;
}
- private String toString(Token token) {
- if (token == null) return "[" + logName + ":EOS:" + fieldName + "]\n";
+ private String toString(boolean hasNext) {
+ if (!hasNext) return "[" + logName + ":EOS:" + fieldName + "]\n";
- position += token.getPositionIncrement();
+ position += posIncrAtt.getPositionIncrement();
return "[" + logName + ":" + position + ":" + fieldName + ":"
- + token.term() + ":" + token.startOffset()
- + "-" + token.endOffset() + ":" + token.type()
+ + termAtt.term() + ":" + offsetAtt.startOffset()
+ + "-" + offsetAtt.endOffset() + ":" + typeAtt.type()
+ "]";
}
};
@@ -121,9 +128,8 @@ public class AnalyzerUtil {
return new TokenFilter(child.tokenStream(fieldName, reader)) {
private int todo = maxTokens;
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- return --todo >= 0 ? input.next(reusableToken) : null;
+ public boolean incrementToken() throws IOException {
+ return --todo >= 0 ? input.incrementToken() : false;
}
};
}
@@ -240,11 +246,10 @@ public class AnalyzerUtil {
final ArrayList tokens2 = new ArrayList();
TokenStream tokenStream = new TokenFilter(child.tokenStream(fieldName, reader)) {
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken); // from filter super class
- if (nextToken != null) tokens2.add(nextToken.clone());
- return nextToken;
+ public boolean incrementToken() throws IOException {
+ boolean hasNext = input.incrementToken();
+ if (hasNext) tokens2.add(captureState());
+ return hasNext;
}
};
@@ -255,10 +260,10 @@ public class AnalyzerUtil {
private Iterator iter = tokens.iterator();
- public Token next(Token token) {
- assert token != null;
- if (!iter.hasNext()) return null;
- return (Token) iter.next();
+ public boolean incrementToken() {
+ if (!iter.hasNext()) return false;
+ restoreState((AttributeSource.State) iter.next());
+ return true;
}
};
}
@@ -302,13 +307,13 @@ public class AnalyzerUtil {
// compute frequencies of distinct terms
HashMap map = new HashMap();
TokenStream stream = analyzer.tokenStream("", new StringReader(text));
+ TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
try {
- final Token reusableToken = new Token();
- for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
- MutableInteger freq = (MutableInteger) map.get(nextToken.term());
+ while (stream.incrementToken()) {
+ MutableInteger freq = (MutableInteger) map.get(termAtt.term());
if (freq == null) {
freq = new MutableInteger(1);
- map.put(nextToken.term(), freq);
+ map.put(termAtt.term(), freq);
} else {
freq.setValue(freq.intValue() + 1);
}
diff --git a/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
index 186be49d0d4..9b4d1a512be 100644
--- a/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
+++ b/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
@@ -28,8 +28,10 @@ import java.util.Iterator;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.index.IndexReader;
@@ -274,18 +276,21 @@ public class MemoryIndex implements Serializable {
return new TokenStream() {
private Iterator iter = keywords.iterator();
private int start = 0;
- public Token next(final Token reusableToken) {
- assert reusableToken != null;
- if (!iter.hasNext()) return null;
+ private TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ private OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+
+ public boolean incrementToken() {
+ if (!iter.hasNext()) return false;
Object obj = iter.next();
if (obj == null)
throw new IllegalArgumentException("keyword must not be null");
String term = obj.toString();
- reusableToken.reinit(term, start, start+reusableToken.termLength());
+ termAtt.setTermBuffer(term);
+ offsetAtt.setOffset(start, start+termAtt.termLength());
start += term.length() + 1; // separate words by 1 (blank) character
- return reusableToken;
+ return true;
}
};
}
@@ -350,13 +355,17 @@ public class MemoryIndex implements Serializable {
int numTokens = 0;
int numOverlapTokens = 0;
int pos = -1;
- final Token reusableToken = new Token();
- for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
- String term = nextToken.term();
+
+ TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
+ PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);
+ OffsetAttribute offsetAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);
+
+ while (stream.incrementToken()) {
+ String term = termAtt.term();
if (term.length() == 0) continue; // nothing to do
// if (DEBUG) System.err.println("token='" + term + "'");
numTokens++;
- final int posIncr = nextToken.getPositionIncrement();
+ final int posIncr = posIncrAttribute.getPositionIncrement();
if (posIncr == 0)
numOverlapTokens++;
pos += posIncr;
@@ -369,7 +378,7 @@ public class MemoryIndex implements Serializable {
if (stride == 1) {
positions.add(pos);
} else {
- positions.add(pos, nextToken.startOffset(), nextToken.endOffset());
+ positions.add(pos, offsetAtt.startOffset(), offsetAtt.endOffset());
}
}
diff --git a/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java b/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java
index f2bb2a01808..a48cba815ba 100644
--- a/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java
+++ b/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java
@@ -30,8 +30,9 @@ import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a
@@ -331,6 +332,8 @@ public class PatternAnalyzer extends Analyzer {
private Matcher matcher;
private int pos = 0;
private static final Locale locale = Locale.getDefault();
+ private TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ private OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) {
this.str = str;
@@ -338,9 +341,8 @@ public class PatternAnalyzer extends Analyzer {
this.toLowerCase = toLowerCase;
}
- public Token next(final Token reusableToken) {
- assert reusableToken != null;
- if (matcher == null) return null;
+ public final boolean incrementToken() {
+ if (matcher == null) return false;
while (true) { // loop takes care of leading and trailing boundary cases
int start = pos;
@@ -357,9 +359,11 @@ public class PatternAnalyzer extends Analyzer {
if (start != end) { // non-empty match (header/trailer)
String text = str.substring(start, end);
if (toLowerCase) text = text.toLowerCase(locale);
- return reusableToken.reinit(text, start, end);
+ termAtt.setTermBuffer(text);
+ offsetAtt.setOffset(start, end);
+ return true;
}
- if (!isMatch) return null;
+ if (!isMatch) return false;
}
}
@@ -381,6 +385,8 @@ public class PatternAnalyzer extends Analyzer {
private final boolean toLowerCase;
private final Set stopWords;
private static final Locale locale = Locale.getDefault();
+ private TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ private OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set stopWords) {
this.str = str;
@@ -389,8 +395,7 @@ public class PatternAnalyzer extends Analyzer {
this.stopWords = stopWords;
}
- public Token next(final Token reusableToken) {
- assert reusableToken != null;
+ public boolean incrementToken() {
// cache loop instance vars (performance)
String s = str;
int len = s.length();
@@ -430,9 +435,11 @@ public class PatternAnalyzer extends Analyzer {
pos = i;
if (text == null)
{
- return null;
+ return false;
}
- return reusableToken.reinit(text, start, i);
+ termAtt.setTermBuffer(text);
+ offsetAtt.setOffset(start, i);
+ return true;
}
private boolean isTokenChar(char c, boolean isLetter) {
diff --git a/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java b/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java
index 9a7bad4d539..b65ff174f58 100644
--- a/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java
+++ b/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java
@@ -19,9 +19,12 @@ package org.apache.lucene.index.memory;
import java.io.IOException;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
/**
* Injects additional tokens for synonyms of token terms fetched from the
@@ -39,9 +42,13 @@ public class SynonymTokenFilter extends TokenFilter {
private String[] stack = null;
private int index = 0;
- private Token current = null;
+ private AttributeSource.State current = null;
private int todo = 0;
+ private TermAttribute termAtt;
+ private TypeAttribute typeAtt;
+ private PositionIncrementAttribute posIncrAtt;
+
/**
* Creates an instance for the given underlying stream and synonym table.
*
@@ -64,28 +71,29 @@ public class SynonymTokenFilter extends TokenFilter {
this.synonyms = synonyms;
this.maxSynonyms = maxSynonyms;
+
+ this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+ this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
}
/** Returns the next token in the stream, or null at EOS. */
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
+ public final boolean incrementToken() throws IOException {
while (todo > 0 && index < stack.length) { // pop from stack
- Token nextToken = createToken(stack[index++], current, reusableToken);
- if (nextToken != null) {
+ if (createToken(stack[index++], current)) {
todo--;
- return nextToken;
+ return true;
}
}
- Token nextToken = input.next(reusableToken);
- if (nextToken == null) return null; // EOS; iterator exhausted
+ if (!input.incrementToken()) return false; // EOS; iterator exhausted
- stack = synonyms.getSynonyms(nextToken.term()); // push onto stack
+ stack = synonyms.getSynonyms(termAtt.term()); // push onto stack
if (stack.length > maxSynonyms) randomize(stack);
index = 0;
- current = (Token) nextToken.clone();
+ current = captureState();
todo = maxSynonyms;
- return nextToken;
+ return true;
}
/**
@@ -101,12 +109,12 @@ public class SynonymTokenFilter extends TokenFilter {
* @return a new token, or null to indicate that the given synonym should be
* ignored
*/
- protected Token createToken(String synonym, Token current, final Token reusableToken) {
- reusableToken.reinit(current, synonym);
- reusableToken.setTermBuffer(synonym);
- reusableToken.setType(SYNONYM_TOKEN_TYPE);
- reusableToken.setPositionIncrement(0);
- return reusableToken;
+ protected boolean createToken(String synonym, AttributeSource.State current) {
+ restoreState(current);
+ termAtt.setTermBuffer(synonym);
+ typeAtt.setType(SYNONYM_TOKEN_TYPE);
+ posIncrAtt.setPositionIncrement(0);
+ return true;
}
/**
diff --git a/contrib/miscellaneous/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java b/contrib/miscellaneous/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java
index 88147eafdff..c3f686a769b 100644
--- a/contrib/miscellaneous/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java
+++ b/contrib/miscellaneous/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java
@@ -25,6 +25,7 @@ import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.Query;
@@ -105,20 +106,16 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
// get Analyzer from superclass and tokenize the term
TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
- final Token reusableToken = new Token();
- Token nextToken;
-
+ TermAttribute termAtt = (TermAttribute) source.addAttribute(TermAttribute.class);
+
int countTokens = 0;
while (true) {
try {
- nextToken = source.next(reusableToken);
+ if (!source.incrementToken()) break;
} catch (IOException e) {
- nextToken = null;
- }
- if (nextToken == null) {
break;
}
- String term = nextToken.term();
+ String term = termAtt.term();
if (!"".equals(term)) {
try {
tlist.set(countTokens++, term);
@@ -191,19 +188,15 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
// get Analyzer from superclass and tokenize the term
TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
List tlist = new ArrayList();
- final Token reusableToken = new Token();
- Token nextToken;
-
+ TermAttribute termAtt = (TermAttribute) source.addAttribute(TermAttribute.class);
+
while (true) {
try {
- nextToken = source.next(reusableToken);
+ if (!source.incrementToken()) break;
} catch (IOException e) {
- nextToken = null;
- }
- if (nextToken == null) {
break;
}
- tlist.add(nextToken.term());
+ tlist.add(termAtt.term());
}
try {
@@ -241,13 +234,15 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
throws ParseException {
// get Analyzer from superclass and tokenize the term
TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
- final Token reusableToken = new Token();
- Token nextToken;
+ TermAttribute termAtt = (TermAttribute) source.addAttribute(TermAttribute.class);
+ String nextToken = null;
boolean multipleTokens = false;
-
+
try {
- nextToken = source.next(reusableToken);
- multipleTokens = source.next(reusableToken) != null;
+ if (source.incrementToken()) {
+ nextToken = termAtt.term();
+ }
+ multipleTokens = source.incrementToken();
} catch (IOException e) {
nextToken = null;
}
@@ -263,7 +258,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
+ " - tokens were added");
}
- return (nextToken == null) ? null : super.getFuzzyQuery(field, nextToken.term(), minSimilarity);
+ return (nextToken == null) ? null : super.getFuzzyQuery(field, nextToken, minSimilarity);
}
/**
@@ -274,20 +269,17 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
throws ParseException {
// get Analyzer from superclass and tokenize the terms
TokenStream source = getAnalyzer().tokenStream(field, new StringReader(part1));
- final Token reusableToken = new Token();
- Token nextToken;
- Token multipleToken;
+ TermAttribute termAtt = (TermAttribute) source.addAttribute(TermAttribute.class);
boolean multipleTokens = false;
// part1
try {
- nextToken = source.next(reusableToken);
- if (nextToken != null) {
- part1 = nextToken.term();
+ if (source.incrementToken()) {
+ part1 = termAtt.term();
}
- multipleTokens = source.next(reusableToken) != null;
+ multipleTokens = source.incrementToken();
} catch (IOException e) {
- nextToken = null;
+ // ignore
}
try {
source.close();
@@ -301,14 +293,15 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
// part2
source = getAnalyzer().tokenStream(field, new StringReader(part2));
+ termAtt = (TermAttribute) source.addAttribute(TermAttribute.class);
+
try {
- nextToken = source.next(reusableToken);
- if (nextToken != null) {
- part2 = nextToken.term();
+ if (source.incrementToken()) {
+ part2 = termAtt.term();
}
- multipleTokens = source.next(reusableToken) != null;
+ multipleTokens = source.incrementToken();
} catch (IOException e) {
- nextToken = null;
+ // ignore
}
try {
source.close();
diff --git a/contrib/miscellaneous/src/test/org/apache/lucene/queryParser/precedence/TestPrecedenceQueryParser.java b/contrib/miscellaneous/src/test/org/apache/lucene/queryParser/precedence/TestPrecedenceQueryParser.java
index b1d9854c21b..d85a4014283 100644
--- a/contrib/miscellaneous/src/test/org/apache/lucene/queryParser/precedence/TestPrecedenceQueryParser.java
+++ b/contrib/miscellaneous/src/test/org/apache/lucene/queryParser/precedence/TestPrecedenceQueryParser.java
@@ -26,6 +26,8 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
@@ -57,28 +59,27 @@ public class TestPrecedenceQueryParser extends TestCase {
boolean inPhrase = false;
int savedStart = 0, savedEnd = 0;
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
+ TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+
+ public boolean incrementToken() throws IOException {
if (inPhrase) {
inPhrase = false;
- reusableToken.setTermBuffer("phrase2");
- reusableToken.setStartOffset(savedStart);
- reusableToken.setEndOffset(savedEnd);
- return reusableToken;
+ termAtt.setTermBuffer("phrase2");
+ offsetAtt.setOffset(savedStart, savedEnd);
+ return true;
} else
- for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
- if (nextToken.term().equals("phrase")) {
+ while(input.incrementToken())
+ if (termAtt.term().equals("phrase")) {
inPhrase = true;
- savedStart = nextToken.startOffset();
- savedEnd = nextToken.endOffset();
- nextToken.setTermBuffer("phrase1");
- nextToken.setStartOffset(savedStart);
- nextToken.setEndOffset(savedEnd);
- return nextToken;
- } else if (!nextToken.term().equals("stop"))
- return nextToken;
- }
- return null;
+ savedStart = offsetAtt.startOffset();
+ savedEnd = offsetAtt.endOffset();
+ termAtt.setTermBuffer("phrase1");
+ offsetAtt.setOffset(savedStart, savedEnd);
+ return true;
+ } else if (!termAtt.term().equals("stop"))
+ return true;
+ return false;
}
}
diff --git a/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java b/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java
index 7c74aea2112..b6e9446f005 100644
--- a/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java
+++ b/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java
@@ -27,6 +27,7 @@ import java.util.Iterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
@@ -181,13 +182,14 @@ public class FuzzyLikeThisQuery extends Query
{
if(f.queryString==null) return;
TokenStream ts=analyzer.tokenStream(f.fieldName,new StringReader(f.queryString));
- final Token reusableToken = new Token();
+ TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+
int corpusNumDocs=reader.numDocs();
Term internSavingTemplateTerm =new Term(f.fieldName); //optimization to avoid constructing new Term() objects
HashSet processedTerms=new HashSet();
- for (Token nextToken = ts.next(reusableToken); nextToken!=null; nextToken = ts.next(reusableToken))
+ while (ts.incrementToken())
{
- String term = nextToken.term();
+ String term = termAtt.term();
if(!processedTerms.contains(term))
{
processedTerms.add(term);
diff --git a/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java b/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java
index 6fed4b5be1f..ba9ed35efe6 100644
--- a/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java
+++ b/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java
@@ -28,9 +28,9 @@ import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Hits;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import java.util.Set;
@@ -829,9 +829,10 @@ public final class MoreLikeThis {
TokenStream ts = analyzer.tokenStream(fieldName, r);
int tokenCount=0;
// for every token
- final Token reusableToken = new Token();
- for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
- String word = nextToken.term();
+ TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+
+ while (ts.incrementToken()) {
+ String word = termAtt.term();
tokenCount++;
if(tokenCount>maxNumTokensParsed)
{
diff --git a/contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java b/contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java
index e3cea76d034..090d52c4c2b 100644
--- a/contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java
+++ b/contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java
@@ -21,8 +21,8 @@ import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
@@ -86,11 +86,12 @@ public final class SimilarityQueries
throws IOException
{
TokenStream ts = a.tokenStream( field, new StringReader( body));
+ TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+
BooleanQuery tmp = new BooleanQuery();
Set already = new HashSet(); // ignore dups
- final Token reusableToken = new Token();
- for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
- String word = nextToken.term();
+ while (ts.incrementToken()) {
+ String word = termAtt.term();
// ignore opt stop words
if ( stop != null &&
stop.contains( word)) continue;
diff --git a/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java b/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java
index 86ccc8b3215..c4047746f45 100644
--- a/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java
+++ b/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java
@@ -22,6 +22,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.tartarus.snowball.SnowballProgram;
/**
@@ -33,9 +34,12 @@ public class SnowballFilter extends TokenFilter {
private SnowballProgram stemmer;
+ private TermAttribute termAtt;
+
public SnowballFilter(TokenStream input, SnowballProgram stemmer) {
super(input);
this.stemmer = stemmer;
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
/**
@@ -56,21 +60,34 @@ public class SnowballFilter extends TokenFilter {
} catch (Exception e) {
throw new RuntimeException(e.toString());
}
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
/** Returns the next input Token, after being stemmed */
- public final Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken);
- if (nextToken == null)
- return null;
- String originalTerm = nextToken.term();
- stemmer.setCurrent(originalTerm);
- stemmer.stem();
- String finalTerm = stemmer.getCurrent();
- // Don't bother updating, if it is unchanged.
- if (!originalTerm.equals(finalTerm))
- nextToken.setTermBuffer(finalTerm);
- return nextToken;
+ public final boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ String originalTerm = termAtt.term();
+ stemmer.setCurrent(originalTerm);
+ stemmer.stem();
+ String finalTerm = stemmer.getCurrent();
+ // Don't bother updating, if it is unchanged.
+ if (!originalTerm.equals(finalTerm))
+ termAtt.setTermBuffer(finalTerm);
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws java.io.IOException {
+ return super.next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws java.io.IOException {
+ return super.next();
}
}
diff --git a/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java b/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java
index 9cd65d625e2..45042c26140 100644
--- a/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java
+++ b/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java
@@ -22,9 +22,14 @@ import java.io.StringReader;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.index.Payload;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
public class TestSnowball extends TestCase {
@@ -32,12 +37,12 @@ public class TestSnowball extends TestCase {
String input,
String[] output) throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
- final Token reusableToken = new Token();
+ TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
for (int i = 0; i < output.length; i++) {
- Token nextToken = ts.next(reusableToken);
- assertEquals(output[i], nextToken.term());
+ assertTrue(ts.incrementToken());
+ assertEquals(output[i], termAtt.term());
}
- assertNull(ts.next(reusableToken));
+ assertFalse(ts.incrementToken());
ts.close();
}
@@ -49,33 +54,51 @@ public class TestSnowball extends TestCase {
public void testFilterTokens() throws Exception {
- final Token tok = new Token(2, 7, "wrd");
- tok.setTermBuffer("accents");
- tok.setPositionIncrement(3);
- Payload tokPayload = new Payload(new byte[]{0,1,2,3});
- tok.setPayload(tokPayload);
- int tokFlags = 77;
- tok.setFlags(tokFlags);
+ SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
+ TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
+ OffsetAttribute offsetAtt = (OffsetAttribute) filter.getAttribute(OffsetAttribute.class);
+ TypeAttribute typeAtt = (TypeAttribute) filter.getAttribute(TypeAttribute.class);
+ PayloadAttribute payloadAtt = (PayloadAttribute) filter.getAttribute(PayloadAttribute.class);
+ PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) filter.getAttribute(PositionIncrementAttribute.class);
+ FlagsAttribute flagsAtt = (FlagsAttribute) filter.getAttribute(FlagsAttribute.class);
+
+ filter.incrementToken();
- SnowballFilter filter = new SnowballFilter(
- new TokenStream() {
- public Token next(final Token reusableToken) {
- assert reusableToken != null;
- return tok;
- }
- },
- "English"
- );
-
- final Token reusableToken = new Token();
- Token nextToken = filter.next(reusableToken);
-
- assertEquals("accent", nextToken.term());
- assertEquals(2, nextToken.startOffset());
- assertEquals(7, nextToken.endOffset());
- assertEquals("wrd", nextToken.type());
- assertEquals(3, nextToken.getPositionIncrement());
- assertEquals(tokFlags, nextToken.getFlags());
- assertEquals(tokPayload, nextToken.getPayload());
+ assertEquals("accent", termAtt.term());
+ assertEquals(2, offsetAtt.startOffset());
+ assertEquals(7, offsetAtt.endOffset());
+ assertEquals("wrd", typeAtt.type());
+ assertEquals(3, posIncAtt.getPositionIncrement());
+ assertEquals(77, flagsAtt.getFlags());
+ assertEquals(new Payload(new byte[]{0,1,2,3}), payloadAtt.getPayload());
+ }
+
+ private final class TestTokenStream extends TokenStream {
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+ private TypeAttribute typeAtt;
+ private PayloadAttribute payloadAtt;
+ private PositionIncrementAttribute posIncAtt;
+ private FlagsAttribute flagsAtt;
+
+ TestTokenStream() {
+ super();
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+ payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+ posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
+ }
+
+ public boolean incrementToken() {
+ termAtt.setTermBuffer("accents");
+ offsetAtt.setOffset(2, 7);
+ typeAtt.setType("wrd");
+ posIncAtt.setPositionIncrement(3);
+ payloadAtt.setPayload(new Payload(new byte[]{0,1,2,3}));
+ flagsAtt.setFlags(77);
+ return true;
+ }
}
}
\ No newline at end of file
diff --git a/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java b/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
index f8985cb1828..777df1307cf 100644
--- a/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
+++ b/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
@@ -20,6 +20,12 @@ package org.apache.lucene.wikipedia.analysis;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
import java.io.IOException;
import java.io.Reader;
@@ -114,6 +120,12 @@ public class WikipediaTokenizer extends Tokenizer {
private int tokenOutput = TOKENS_ONLY;
private Set untokenizedTypes = Collections.EMPTY_SET;
private Iterator tokens = null;
+
+ private OffsetAttribute offsetAtt;
+ private TypeAttribute typeAtt;
+ private PositionIncrementAttribute posIncrAtt;
+ private TermAttribute termAtt;
+ private FlagsAttribute flagsAtt;
void setInput(Reader reader) {
this.input = CharReader.get(reader);
@@ -142,41 +154,59 @@ public class WikipediaTokenizer extends Tokenizer {
this.tokenOutput = tokenOutput;
this.scanner = new WikipediaTokenizerImpl(input);
this.untokenizedTypes = untokenizedTypes;
+ this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+ this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ this.flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
}
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws java.io.IOException {
+ return super.next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws java.io.IOException {
+ return super.next();
+ }
+
/*
* (non-Javadoc)
*
* @see org.apache.lucene.analysis.TokenStream#next()
*/
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
+ public final boolean incrementToken() throws IOException {
if (tokens != null && tokens.hasNext()){
- return (Token)tokens.next();
+ AttributeSource.State state = (AttributeSource.State) tokens.next();
+ restoreState(state);
+ return true;
}
int tokenType = scanner.getNextToken();
if (tokenType == WikipediaTokenizerImpl.YYEOF) {
- return null;
+ return false;
}
String type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType];
if (tokenOutput == TOKENS_ONLY || untokenizedTypes.contains(type) == false){
- setupToken(reusableToken);
+ setupToken();
} else if (tokenOutput == UNTOKENIZED_ONLY && untokenizedTypes.contains(type) == true){
- collapseTokens(reusableToken, tokenType);
+ collapseTokens(tokenType);
}
else if (tokenOutput == BOTH){
//collapse into a single token, add it to tokens AND output the individual tokens
//output the untokenized Token first
- collapseAndSaveTokens(reusableToken, tokenType, type);
+ collapseAndSaveTokens(tokenType, type);
}
- reusableToken.setPositionIncrement(scanner.getPositionIncrement());
- reusableToken.setType(type);
- return reusableToken;
+ posIncrAtt.setPositionIncrement(scanner.getPositionIncrement());
+ typeAtt.setType(type);
+ return true;
}
- private void collapseAndSaveTokens(final Token reusableToken, int tokenType, String type) throws IOException {
+ private void collapseAndSaveTokens(int tokenType, String type) throws IOException {
//collapse
StringBuffer buffer = new StringBuffer(32);
int numAdded = scanner.setText(buffer);
@@ -186,9 +216,8 @@ public class WikipediaTokenizer extends Tokenizer {
int tmpTokType;
int numSeen = 0;
List tmp = new ArrayList();
- Token saved = new Token();
- setupSavedToken(saved, 0, type);
- tmp.add(saved);
+ setupSavedToken(0, type);
+ tmp.add(captureState());
//while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
while ((tmpTokType = scanner.getNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.getNumWikiTokensSeen() > numSeen){
int currPos = scanner.yychar();
@@ -197,18 +226,16 @@ public class WikipediaTokenizer extends Tokenizer {
buffer.append(' ');
}
numAdded = scanner.setText(buffer);
- saved = new Token();
- setupSavedToken(saved, scanner.getPositionIncrement(), type);
- tmp.add(saved);
+ setupSavedToken(scanner.getPositionIncrement(), type);
+ tmp.add(captureState());
numSeen++;
lastPos = currPos + numAdded;
}
//trim the buffer
String s = buffer.toString().trim();
- reusableToken.setTermBuffer(s.toCharArray(), 0, s.length());
- reusableToken.setStartOffset(input.correctOffset(theStart));
- reusableToken.setEndOffset(input.correctOffset(theStart + s.length()));
- reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG);
+ termAtt.setTermBuffer(s.toCharArray(), 0, s.length());
+ offsetAtt.setOffset(input.correctOffset(theStart), input.correctOffset(theStart + s.length()));
+ flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
scanner.yypushback(scanner.yylength());
@@ -216,13 +243,13 @@ public class WikipediaTokenizer extends Tokenizer {
tokens = tmp.iterator();
}
- private void setupSavedToken(Token saved, int positionInc, String type){
- setupToken(saved);
- saved.setPositionIncrement(positionInc);
- saved.setType(type);
+ private void setupSavedToken(int positionInc, String type){
+ setupToken();
+ posIncrAtt.setPositionIncrement(positionInc);
+ typeAtt.setType(type);
}
- private void collapseTokens(final Token reusableToken, int tokenType) throws IOException {
+ private void collapseTokens(int tokenType) throws IOException {
//collapse
StringBuffer buffer = new StringBuffer(32);
int numAdded = scanner.setText(buffer);
@@ -244,10 +271,9 @@ public class WikipediaTokenizer extends Tokenizer {
}
//trim the buffer
String s = buffer.toString().trim();
- reusableToken.setTermBuffer(s.toCharArray(), 0, s.length());
- reusableToken.setStartOffset(input.correctOffset(theStart));
- reusableToken.setEndOffset(input.correctOffset(theStart + s.length()));
- reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG);
+ termAtt.setTermBuffer(s.toCharArray(), 0, s.length());
+ offsetAtt.setOffset(input.correctOffset(theStart), input.correctOffset(theStart + s.length()));
+ flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
scanner.yypushback(scanner.yylength());
@@ -256,11 +282,10 @@ public class WikipediaTokenizer extends Tokenizer {
}
}
- private void setupToken(final Token reusableToken) {
- scanner.getText(reusableToken);
+ private void setupToken() {
+ scanner.getText(termAtt);
final int start = scanner.yychar();
- reusableToken.setStartOffset(input.correctOffset(start));
- reusableToken.setEndOffset(input.correctOffset(start + reusableToken.termLength()));
+ offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start + termAtt.termLength()));
}
/*
diff --git a/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java b/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java
index 0be9b161eca..e6dced9b4c7 100644
--- a/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java
+++ b/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java
@@ -19,7 +19,7 @@ package org.apache.lucene.wikipedia.analysis;
* limitations under the License.
*/
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
@@ -476,7 +476,7 @@ public final int getPositionIncrement(){
/**
* Fills Lucene token with the current token text.
*/
-final void getText(Token t) {
+final void getText(TermAttribute t) {
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
diff --git a/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java b/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java
index d295aad1d4e..a594335e2fd 100644
--- a/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java
+++ b/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java
@@ -19,7 +19,6 @@
package org.apache.lucene.wikipedia.analysis;
import junit.framework.TestCase;
-import org.apache.lucene.analysis.Token;
import java.io.StringReader;
import java.io.IOException;
@@ -28,6 +27,12 @@ import java.util.Map;
import java.util.Set;
import java.util.HashSet;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+
/**
*
@@ -131,23 +136,24 @@ public class WikipediaTokenizerTest extends TestCase {
int numBoldItalics = 0;
int numCategory = 0;
int numCitation = 0;
- final Token reusableToken = new Token();
- for (Token nextToken = tf.next(reusableToken); nextToken != null; nextToken = tf.next(reusableToken)) {
- String tokText = nextToken.term();
+ TermAttribute termAtt = (TermAttribute) tf.addAttribute(TermAttribute.class);
+ TypeAttribute typeAtt = (TypeAttribute) tf.addAttribute(TypeAttribute.class);
+
+ while (tf.incrementToken()) {
+ String tokText = termAtt.term();
//System.out.println("Text: " + tokText + " Type: " + token.type());
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
String expectedType = (String) tcm.get(tokText);
- assertTrue("expectedType is null and it shouldn't be for: " + nextToken, expectedType != null);
- assertTrue(nextToken.type() + " is not equal to " + expectedType + " for " + nextToken, nextToken.type().equals(expectedType) == true);
+ assertTrue("expectedType is null and it shouldn't be for: " + tf.toString(), expectedType != null);
+ assertTrue(typeAtt.type() + " is not equal to " + expectedType + " for " + tf.toString(), typeAtt.type().equals(expectedType) == true);
count++;
- if (nextToken.type().equals(WikipediaTokenizer.ITALICS) == true){
+ if (typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true){
numItalics++;
- } else if (nextToken.type().equals(WikipediaTokenizer.BOLD_ITALICS) == true){
+ } else if (typeAtt.type().equals(WikipediaTokenizer.BOLD_ITALICS) == true){
numBoldItalics++;
- } else if (nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true){
+ } else if (typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true){
numCategory++;
}
- else if (nextToken.type().equals(WikipediaTokenizer.CITATION) == true){
+ else if (typeAtt.type().equals(WikipediaTokenizer.CITATION) == true){
numCitation++;
}
}
@@ -166,106 +172,93 @@ public class WikipediaTokenizerTest extends TestCase {
}
private void checkLinkPhrases(WikipediaTokenizer tf) throws IOException {
- final Token reusableToken = new Token();
- Token nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "click", nextToken.term().equals("click") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "link", nextToken.term().equals("link") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "here",
- nextToken.term().equals("here") == true);
+ TermAttribute termAtt = (TermAttribute) tf.addAttribute(TermAttribute.class);
+ PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) tf.addAttribute(PositionIncrementAttribute.class);
+
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "click", termAtt.term().equals("click") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "link", termAtt.term().equals("link") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "here",
+ termAtt.term().equals("here") == true);
//The link, and here should be at the same position for phrases to work
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "again",
- nextToken.term().equals("again") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "again",
+ termAtt.term().equals("again") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "click",
- nextToken.term().equals("click") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "click",
+ termAtt.term().equals("click") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "http://lucene.apache.org",
- nextToken.term().equals("http://lucene.apache.org") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "http://lucene.apache.org",
+ termAtt.term().equals("http://lucene.apache.org") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "here",
- nextToken.term().equals("here") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "here",
+ termAtt.term().equals("here") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "again",
- nextToken.term().equals("again") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "again",
+ termAtt.term().equals("again") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "a",
- nextToken.term().equals("a") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "a",
+ termAtt.term().equals("a") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "b",
- nextToken.term().equals("b") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "b",
+ termAtt.term().equals("b") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "c",
- nextToken.term().equals("c") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "c",
+ termAtt.term().equals("c") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "d",
- nextToken.term().equals("d") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "d",
+ termAtt.term().equals("d") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is not null and it should be", nextToken == null);
+ assertFalse(tf.incrementToken());
}
public void testLinks() throws Exception {
String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here] [https://lucene.apache.org/java/docs/index.html?b=c here]";
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
- final Token reusableToken = new Token();
- Token nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html#news",
- nextToken.term().equals("http://lucene.apache.org/java/docs/index.html#news") == true);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, nextToken.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
- tf.next(reusableToken);//skip here
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html?b=c",
- nextToken.term().equals("http://lucene.apache.org/java/docs/index.html?b=c") == true);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, nextToken.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
- tf.next(reusableToken);//skip here
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "https://lucene.apache.org/java/docs/index.html?b=c",
- nextToken.term().equals("https://lucene.apache.org/java/docs/index.html?b=c") == true);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, nextToken.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is not null and it should be", nextToken == null);
-
+ TermAttribute termAtt = (TermAttribute) tf.addAttribute(TermAttribute.class);
+ TypeAttribute typeAtt = (TypeAttribute) tf.addAttribute(TypeAttribute.class);
+
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html#news",
+ termAtt.term().equals("http://lucene.apache.org/java/docs/index.html#news") == true);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, typeAtt.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
+ tf.incrementToken();//skip here
+
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html?b=c",
+ termAtt.term().equals("http://lucene.apache.org/java/docs/index.html?b=c") == true);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, typeAtt.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
+ tf.incrementToken();//skip here
+
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "https://lucene.apache.org/java/docs/index.html?b=c",
+ termAtt.term().equals("https://lucene.apache.org/java/docs/index.html?b=c") == true);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, typeAtt.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
+
+ assertTrue(tf.incrementToken());
+ assertFalse(tf.incrementToken());
}
public void testLucene1133() throws Exception {
@@ -277,72 +270,73 @@ public class WikipediaTokenizerTest extends TestCase {
checkLinkPhrases(tf);
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.UNTOKENIZED_ONLY, untoks);
- final Token reusableToken = new Token();
- Token nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "a b c d",
- nextToken.term().equals("a b c d") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- assertTrue(nextToken.startOffset() + " does not equal: " + 11, nextToken.startOffset() == 11);
- assertTrue(nextToken.endOffset() + " does not equal: " + 18, nextToken.endOffset() == 18);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "e f g",
- nextToken.term().equals("e f g") == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 32, nextToken.startOffset() == 32);
- assertTrue(nextToken.endOffset() + " does not equal: " + 37, nextToken.endOffset() == 37);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "link",
- nextToken.term().equals("link") == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 42, nextToken.startOffset() == 42);
- assertTrue(nextToken.endOffset() + " does not equal: " + 46, nextToken.endOffset() == 46);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "here",
- nextToken.term().equals("here") == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 47, nextToken.startOffset() == 47);
- assertTrue(nextToken.endOffset() + " does not equal: " + 51, nextToken.endOffset() == 51);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "link",
- nextToken.term().equals("link") == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 56, nextToken.startOffset() == 56);
- assertTrue(nextToken.endOffset() + " does not equal: " + 60, nextToken.endOffset() == 60);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "there",
- nextToken.term().equals("there") == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 61, nextToken.startOffset() == 61);
- assertTrue(nextToken.endOffset() + " does not equal: " + 66, nextToken.endOffset() == 66);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "italics here",
- nextToken.term().equals("italics here") == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 71, nextToken.startOffset() == 71);
- assertTrue(nextToken.endOffset() + " does not equal: " + 83, nextToken.endOffset() == 83);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "something",
- nextToken.term().equals("something") == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 86, nextToken.startOffset() == 86);
- assertTrue(nextToken.endOffset() + " does not equal: " + 95, nextToken.endOffset() == 95);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "more italics",
- nextToken.term().equals("more italics") == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 98, nextToken.startOffset() == 98);
- assertTrue(nextToken.endOffset() + " does not equal: " + 110, nextToken.endOffset() == 110);
+ TermAttribute termAtt = (TermAttribute) tf.addAttribute(TermAttribute.class);
+ PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) tf.addAttribute(PositionIncrementAttribute.class);
+ OffsetAttribute offsetAtt = (OffsetAttribute) tf.addAttribute(OffsetAttribute.class);
+
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "a b c d",
+ termAtt.term().equals("a b c d") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 11, offsetAtt.startOffset() == 11);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 18, offsetAtt.endOffset() == 18);
+
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "e f g",
+ termAtt.term().equals("e f g") == true);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 32, offsetAtt.startOffset() == 32);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 37, offsetAtt.endOffset() == 37);
+
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "link",
+ termAtt.term().equals("link") == true);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 42, offsetAtt.startOffset() == 42);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 46, offsetAtt.endOffset() == 46);
+
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "here",
+ termAtt.term().equals("here") == true);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 47, offsetAtt.startOffset() == 47);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 51, offsetAtt.endOffset() == 51);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "h i j",
- nextToken.term().equals("h i j") == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 124, nextToken.startOffset() == 124);
- assertTrue(nextToken.endOffset() + " does not equal: " + 133, nextToken.endOffset() == 133);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "link",
+ termAtt.term().equals("link") == true);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 56, offsetAtt.startOffset() == 56);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 60, offsetAtt.endOffset() == 60);
+
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "there",
+ termAtt.term().equals("there") == true);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is not null and it should be", nextToken == null);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 61, offsetAtt.startOffset() == 61);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 66, offsetAtt.endOffset() == 66);
+
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "italics here",
+ termAtt.term().equals("italics here") == true);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 71, offsetAtt.startOffset() == 71);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 83, offsetAtt.endOffset() == 83);
+
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "something",
+ termAtt.term().equals("something") == true);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 86, offsetAtt.startOffset() == 86);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 95, offsetAtt.endOffset() == 95);
+
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "more italics",
+ termAtt.term().equals("more italics") == true);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 98, offsetAtt.startOffset() == 98);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 110, offsetAtt.endOffset() == 110);
+
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "h i j",
+ termAtt.term().equals("h i j") == true);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 124, offsetAtt.startOffset() == 124);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 133, offsetAtt.endOffset() == 133);
+
+ assertFalse(tf.incrementToken());
}
public void testBoth() throws Exception {
@@ -352,225 +346,211 @@ public class WikipediaTokenizerTest extends TestCase {
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
//should output all the indivual tokens plus the untokenized tokens as well. Untokenized tokens
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks);
- final Token reusableToken = new Token();
- Token nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "a b c d",
- nextToken.term().equals("a b c d") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
- assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
- assertTrue(nextToken.startOffset() + " does not equal: " + 11, nextToken.startOffset() == 11);
- assertTrue(nextToken.endOffset() + " does not equal: " + 18, nextToken.endOffset() == 18);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "a",
- nextToken.term().equals("a") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
- assertTrue(nextToken.getFlags() + " equals: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG + " and it shouldn't", nextToken.getFlags() != WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
- assertTrue(nextToken.startOffset() + " does not equal: " + 11, nextToken.startOffset() == 11);
- assertTrue(nextToken.endOffset() + " does not equal: " + 12, nextToken.endOffset() == 12);
+ TermAttribute termAtt = (TermAttribute) tf.addAttribute(TermAttribute.class);
+ TypeAttribute typeAtt = (TypeAttribute) tf.addAttribute(TypeAttribute.class);
+ PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) tf.addAttribute(PositionIncrementAttribute.class);
+ OffsetAttribute offsetAtt = (OffsetAttribute) tf.addAttribute(OffsetAttribute.class);
+ FlagsAttribute flagsAtt = (FlagsAttribute) tf.addAttribute(FlagsAttribute.class);
+
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "a b c d",
+ termAtt.term().equals("a b c d") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+ assertTrue(flagsAtt.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, flagsAtt.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 11, offsetAtt.startOffset() == 11);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 18, offsetAtt.endOffset() == 18);
+
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "a",
+ termAtt.term().equals("a") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+ assertTrue(flagsAtt.getFlags() + " equals: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG + " and it shouldn't", flagsAtt.getFlags() != WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 11, offsetAtt.startOffset() == 11);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 12, offsetAtt.endOffset() == 12);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "b",
- nextToken.term().equals("b") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 13, nextToken.startOffset() == 13);
- assertTrue(nextToken.endOffset() + " does not equal: " + 14, nextToken.endOffset() == 14);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "b",
+ termAtt.term().equals("b") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 13, offsetAtt.startOffset() == 13);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 14, offsetAtt.endOffset() == 14);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "c",
- nextToken.term().equals("c") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 15, nextToken.startOffset() == 15);
- assertTrue(nextToken.endOffset() + " does not equal: " + 16, nextToken.endOffset() == 16);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "c",
+ termAtt.term().equals("c") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 15, offsetAtt.startOffset() == 15);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 16, offsetAtt.endOffset() == 16);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "d",
- nextToken.term().equals("d") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 17, nextToken.startOffset() == 17);
- assertTrue(nextToken.endOffset() + " does not equal: " + 18, nextToken.endOffset() == 18);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "d",
+ termAtt.term().equals("d") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 17, offsetAtt.startOffset() == 17);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 18, offsetAtt.endOffset() == 18);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "e f g",
- nextToken.term().equals("e f g") == true);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
- assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
- assertTrue(nextToken.startOffset() + " does not equal: " + 32, nextToken.startOffset() == 32);
- assertTrue(nextToken.endOffset() + " does not equal: " + 37, nextToken.endOffset() == 37);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "e f g",
+ termAtt.term().equals("e f g") == true);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+ assertTrue(flagsAtt.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, flagsAtt.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 32, offsetAtt.startOffset() == 32);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 37, offsetAtt.endOffset() == 37);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "e",
- nextToken.term().equals("e") == true);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
- assertTrue(nextToken.startOffset() + " does not equal: " + 32, nextToken.startOffset() == 32);
- assertTrue(nextToken.endOffset() + " does not equal: " + 33, nextToken.endOffset() == 33);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "e",
+ termAtt.term().equals("e") == true);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 32, offsetAtt.startOffset() == 32);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 33, offsetAtt.endOffset() == 33);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "f",
- nextToken.term().equals("f") == true);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- assertTrue(nextToken.startOffset() + " does not equal: " + 34, nextToken.startOffset() == 34);
- assertTrue(nextToken.endOffset() + " does not equal: " + 35, nextToken.endOffset() == 35);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "f",
+ termAtt.term().equals("f") == true);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 34, offsetAtt.startOffset() == 34);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 35, offsetAtt.endOffset() == 35);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "g",
- nextToken.term().equals("g") == true);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- assertTrue(nextToken.startOffset() + " does not equal: " + 36, nextToken.startOffset() == 36);
- assertTrue(nextToken.endOffset() + " does not equal: " + 37, nextToken.endOffset() == 37);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "g",
+ termAtt.term().equals("g") == true);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 36, offsetAtt.startOffset() == 36);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 37, offsetAtt.endOffset() == 37);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "link",
- nextToken.term().equals("link") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, nextToken.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 42, nextToken.startOffset() == 42);
- assertTrue(nextToken.endOffset() + " does not equal: " + 46, nextToken.endOffset() == 46);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "here",
- nextToken.term().equals("here") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, nextToken.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 47, nextToken.startOffset() == 47);
- assertTrue(nextToken.endOffset() + " does not equal: " + 51, nextToken.endOffset() == 51);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "link",
- nextToken.term().equals("link") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- assertTrue(nextToken.startOffset() + " does not equal: " + 56, nextToken.startOffset() == 56);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, nextToken.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
- assertTrue(nextToken.endOffset() + " does not equal: " + 60, nextToken.endOffset() == 60);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "there",
- nextToken.term().equals("there") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, nextToken.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 61, nextToken.startOffset() == 61);
- assertTrue(nextToken.endOffset() + " does not equal: " + 66, nextToken.endOffset() == 66);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "italics here",
- nextToken.term().equals("italics here") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
- assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
- assertTrue(nextToken.startOffset() + " does not equal: " + 71, nextToken.startOffset() == 71);
- assertTrue(nextToken.endOffset() + " does not equal: " + 83, nextToken.endOffset() == 83);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "link",
+ termAtt.term().equals("link") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, typeAtt.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 42, offsetAtt.startOffset() == 42);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 46, offsetAtt.endOffset() == 46);
+
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "here",
+ termAtt.term().equals("here") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, typeAtt.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 47, offsetAtt.startOffset() == 47);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 51, offsetAtt.endOffset() == 51);
+
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "link",
+ termAtt.term().equals("link") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 56, offsetAtt.startOffset() == 56);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, typeAtt.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 60, offsetAtt.endOffset() == 60);
+
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "there",
+ termAtt.term().equals("there") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, typeAtt.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 61, offsetAtt.startOffset() == 61);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 66, offsetAtt.endOffset() == 66);
+
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "italics here",
+ termAtt.term().equals("italics here") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true);
+ assertTrue(flagsAtt.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, flagsAtt.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 71, offsetAtt.startOffset() == 71);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 83, offsetAtt.endOffset() == 83);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "italics",
- nextToken.term().equals("italics") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 71, nextToken.startOffset() == 71);
- assertTrue(nextToken.endOffset() + " does not equal: " + 78, nextToken.endOffset() == 78);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "italics",
+ termAtt.term().equals("italics") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 71, offsetAtt.startOffset() == 71);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 78, offsetAtt.endOffset() == 78);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "here",
- nextToken.term().equals("here") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 79, nextToken.startOffset() == 79);
- assertTrue(nextToken.endOffset() + " does not equal: " + 83, nextToken.endOffset() == 83);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "here",
+ termAtt.term().equals("here") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 79, offsetAtt.startOffset() == 79);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 83, offsetAtt.endOffset() == 83);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "something",
- nextToken.term().equals("something") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- assertTrue(nextToken.startOffset() + " does not equal: " + 86, nextToken.startOffset() == 86);
- assertTrue(nextToken.endOffset() + " does not equal: " + 95, nextToken.endOffset() == 95);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "more italics",
- nextToken.term().equals("more italics") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
- assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
- assertTrue(nextToken.startOffset() + " does not equal: " + 98, nextToken.startOffset() == 98);
- assertTrue(nextToken.endOffset() + " does not equal: " + 110, nextToken.endOffset() == 110);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "something",
+ termAtt.term().equals("something") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 86, offsetAtt.startOffset() == 86);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 95, offsetAtt.endOffset() == 95);
+
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "more italics",
+ termAtt.term().equals("more italics") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true);
+ assertTrue(flagsAtt.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, flagsAtt.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 98, offsetAtt.startOffset() == 98);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 110, offsetAtt.endOffset() == 110);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "more",
- nextToken.term().equals("more") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 98, nextToken.startOffset() == 98);
- assertTrue(nextToken.endOffset() + " does not equal: " + 102, nextToken.endOffset() == 102);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "more",
+ termAtt.term().equals("more") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 98, offsetAtt.startOffset() == 98);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 102, offsetAtt.endOffset() == 102);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "italics",
- nextToken.term().equals("italics") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "italics",
+ termAtt.term().equals("italics") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 103, nextToken.startOffset() == 103);
- assertTrue(nextToken.endOffset() + " does not equal: " + 110, nextToken.endOffset() == 110);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 103, offsetAtt.startOffset() == 103);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 110, offsetAtt.endOffset() == 110);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "h i j",
- nextToken.term().equals("h i j") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
- assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
- assertTrue(nextToken.startOffset() + " does not equal: " + 124, nextToken.startOffset() == 124);
- assertTrue(nextToken.endOffset() + " does not equal: " + 133, nextToken.endOffset() == 133);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "h i j",
+ termAtt.term().equals("h i j") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+ assertTrue(flagsAtt.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, flagsAtt.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 124, offsetAtt.startOffset() == 124);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 133, offsetAtt.endOffset() == 133);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "h",
- nextToken.term().equals("h") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 124, nextToken.startOffset() == 124);
- assertTrue(nextToken.endOffset() + " does not equal: " + 125, nextToken.endOffset() == 125);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "h",
+ termAtt.term().equals("h") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 124, offsetAtt.startOffset() == 124);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 125, offsetAtt.endOffset() == 125);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "i",
- nextToken.term().equals("i") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 128, nextToken.startOffset() == 128);
- assertTrue(nextToken.endOffset() + " does not equal: " + 129, nextToken.endOffset() == 129);
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is null and it shouldn't be", nextToken != null);
- assertTrue(nextToken.term() + " is not equal to " + "j",
- nextToken.term().equals("j") == true);
- assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
- assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
- assertTrue(nextToken.startOffset() + " does not equal: " + 132, nextToken.startOffset() == 132);
- assertTrue(nextToken.endOffset() + " does not equal: " + 133, nextToken.endOffset() == 133);
-
- nextToken = tf.next(reusableToken);
- assertTrue("nextToken is not null and it should be", nextToken == null);
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "i",
+ termAtt.term().equals("i") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 128, offsetAtt.startOffset() == 128);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 129, offsetAtt.endOffset() == 129);
+
+ assertTrue(tf.incrementToken());
+ assertTrue(termAtt.term() + " is not equal to " + "j",
+ termAtt.term().equals("j") == true);
+ assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+ assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+ assertTrue(offsetAtt.startOffset() + " does not equal: " + 132, offsetAtt.startOffset() == 132);
+ assertTrue(offsetAtt.endOffset() + " does not equal: " + 133, offsetAtt.endOffset() == 133);
+ assertFalse(tf.incrementToken());
}
}
diff --git a/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java b/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java
index 97a70d6d9b3..e3932541d20 100755
--- a/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java
+++ b/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java
@@ -27,9 +27,9 @@ import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
@@ -114,10 +114,10 @@ public final class SynExpand {
// [1] Parse query into separate words so that when we expand we can avoid dups
TokenStream ts = a.tokenStream( field, new StringReader( query));
-
- final Token reusableToken = new Token();
- for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
- String word = nextToken.term();
+ TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+
+ while (ts.incrementToken()) {
+ String word = termAtt.term();
if ( already.add( word))
top.add( word);
}
diff --git a/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynLookup.java b/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynLookup.java
index 509bbfc7fa6..087212244b9 100644
--- a/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynLookup.java
+++ b/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynLookup.java
@@ -27,8 +27,8 @@ import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
@@ -101,9 +101,10 @@ public class SynLookup {
// [1] Parse query into separate words so that when we expand we can avoid dups
TokenStream ts = a.tokenStream( field, new StringReader( query));
- final Token reusableToken = new Token();
- for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
- String word = nextToken.term();
+ TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+
+ while (ts.incrementToken()) {
+ String word = termAtt.term();
if ( already.add( word))
top.add( word);
}
diff --git a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java
index 431c1d13b5d..13bfdbcd443 100644
--- a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java
+++ b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java
@@ -9,8 +9,8 @@ import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.search.similar.MoreLikeThisQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.DOMUtils;
@@ -72,14 +72,14 @@ public class LikeThisQueryBuilder implements QueryBuilder {
if((stopWords!=null)&&(fields!=null))
{
stopWordsSet=new HashSet();
- final Token reusableToken = new Token();
for (int i = 0; i < fields.length; i++)
{
TokenStream ts = analyzer.tokenStream(fields[i],new StringReader(stopWords));
+ TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
try
{
- for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
- stopWordsSet.add(nextToken.term());
+ while(ts.incrementToken()) {
+ stopWordsSet.add(termAtt.term());
}
}
catch(IOException ioe)
diff --git a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/SpanOrTermsBuilder.java b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/SpanOrTermsBuilder.java
index c8ed5665b9f..1c5bdaf86e7 100644
--- a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/SpanOrTermsBuilder.java
+++ b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/SpanOrTermsBuilder.java
@@ -5,8 +5,8 @@ import java.io.StringReader;
import java.util.ArrayList;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
@@ -52,9 +52,10 @@ public class SpanOrTermsBuilder extends SpanBuilderBase
{
ArrayList clausesList=new ArrayList();
TokenStream ts=analyzer.tokenStream(fieldName,new StringReader(value));
- final Token reusableToken = new Token();
- for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
- SpanTermQuery stq=new SpanTermQuery(new Term(fieldName,nextToken.term()));
+ TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+
+ while (ts.incrementToken()) {
+ SpanTermQuery stq=new SpanTermQuery(new Term(fieldName, termAtt.term()));
clausesList.add(stq);
}
SpanOrQuery soq=new SpanOrQuery((SpanQuery[]) clausesList.toArray(new SpanQuery[clausesList.size()]));
diff --git a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsFilterBuilder.java b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsFilterBuilder.java
index 52091dccb92..93e27fdd7eb 100644
--- a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsFilterBuilder.java
+++ b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsFilterBuilder.java
@@ -4,8 +4,8 @@ import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.TermsFilter;
@@ -54,19 +54,19 @@ public class TermsFilterBuilder implements FilterBuilder
String text = DOMUtils.getNonBlankTextOrFail(e);
String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");
TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(text));
-
+ TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+
try
{
- final Token reusableToken = new Token();
Term term = null;
- for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
+ while (ts.incrementToken()) {
if (term == null)
{
- term = new Term(fieldName, nextToken.term());
+ term = new Term(fieldName, termAtt.term());
} else
{
// create from previous to save fieldName.intern overhead
- term = term.createTerm(nextToken.term());
+ term = term.createTerm(termAtt.term());
}
tf.addTerm(term);
}
diff --git a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsQueryBuilder.java b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsQueryBuilder.java
index 40e1c2ca035..7a6d1e57c23 100644
--- a/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsQueryBuilder.java
+++ b/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsQueryBuilder.java
@@ -4,8 +4,8 @@ import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
@@ -57,16 +57,16 @@ public class TermsQueryBuilder implements QueryBuilder {
TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(text));
try
{
- final Token reusableToken = new Token();
+ TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
Term term = null;
- for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
+ while (ts.incrementToken()) {
if (term == null)
{
- term = new Term(fieldName, nextToken.term());
+ term = new Term(fieldName, termAtt.term());
} else
{
// create from previous to save fieldName.intern overhead
- term = term.createTerm(nextToken.term());
+ term = term.createTerm(termAtt.term());
}
bq.add(new BooleanClause(new TermQuery(term),BooleanClause.Occur.SHOULD));
}