From 898cfe87cdb7dab5e754d777e5685ec31ff6527b Mon Sep 17 00:00:00 2001 From: Michael Busch Date: Tue, 18 Nov 2008 23:41:49 +0000 Subject: [PATCH] LUCENE-1422: New TokenStream API that uses a new class called AttributeSource instead of the now deprecated Token class. All attributes that the Token class had have been moved into separate classes: TermAttribute, OffsetAttribute, PositionIncrementAttribute, PayloadAttribute, TypeAttribute and FlagsAttribute. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@718798 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 9 + .../lucene/analysis/CachingTokenFilter.java | 32 +- .../apache/lucene/analysis/CharTokenizer.java | 52 +++ .../analysis/ISOLatin1AccentFilter.java | 28 +- .../lucene/analysis/KeywordTokenizer.java | 31 +- .../apache/lucene/analysis/LengthFilter.java | 22 ++ .../lucene/analysis/LowerCaseFilter.java | 19 + .../lucene/analysis/PorterStemFilter.java | 14 + .../apache/lucene/analysis/SinkTokenizer.java | 26 +- .../apache/lucene/analysis/StopFilter.java | 33 ++ .../lucene/analysis/TeeTokenFilter.java | 14 + .../org/apache/lucene/analysis/Token.java | 8 +- .../apache/lucene/analysis/TokenFilter.java | 29 +- .../apache/lucene/analysis/TokenStream.java | 165 +++++++- .../org/apache/lucene/analysis/Tokenizer.java | 17 +- .../org/apache/lucene/analysis/package.html | 367 +++++++++++++++++- .../analysis/standard/StandardFilter.java | 43 +- .../analysis/standard/StandardTokenizer.java | 76 +++- .../standard/StandardTokenizerImpl.java | 8 + .../standard/StandardTokenizerImpl.jflex | 9 + .../tokenattributes/FlagsAttribute.java | 86 ++++ .../tokenattributes/OffsetAttribute.java | 98 +++++ .../tokenattributes/PayloadAttribute.java | 109 ++++++ .../PositionIncrementAttribute.java | 106 +++++ .../tokenattributes/TermAttribute.java | 242 ++++++++++++ .../tokenattributes/TypeAttribute.java | 83 ++++ .../org/apache/lucene/index/DocInverter.java | 8 +- .../lucene/index/DocInverterPerField.java | 45 ++- .../lucene/index/DocInverterPerThread.java | 96 +++++ .../apache/lucene/index/FieldInvertState.java | 7 + .../index/FreqProxTermsWriterPerField.java | 34 +- .../index/InvertedDocConsumerPerField.java | 9 +- src/java/org/apache/lucene/index/Payload.java | 3 +- .../index/TermVectorsTermsWriterPerField.java | 32 +- .../index/TermsHashConsumerPerField.java | 9 +- .../lucene/index/TermsHashPerField.java | 35 +- .../lucene/queryParser/QueryParser.java | 202 ++++++++-- .../apache/lucene/queryParser/QueryParser.jj | 194 +++++++-- .../queryParser/QueryParserTokenManager.java | 5 +- .../apache/lucene/search/QueryTermVector.java | 15 +- .../org/apache/lucene/util/Attribute.java | 95 +++++ .../apache/lucene/util/AttributeSource.java | 274 +++++++++++++ src/test/org/apache/lucene/AnalysisTest.java | 30 +- .../lucene/analysis/TeeSinkTokenTest.java | 126 +++--- .../apache/lucene/analysis/TestAnalyzers.java | 55 +-- .../analysis/TestCachingTokenFilter.java | 22 +- .../analysis/TestISOLatin1AccentFilter.java | 160 ++++---- .../lucene/analysis/TestKeywordAnalyzer.java | 9 +- .../lucene/analysis/TestLengthFilter.java | 15 +- .../analysis/TestPerFieldAnalzyerWrapper.java | 16 +- .../lucene/analysis/TestStandardAnalyzer.java | 24 +- .../lucene/analysis/TestStopAnalyzer.java | 29 +- .../lucene/analysis/TestStopFilter.java | 42 +- .../org/apache/lucene/analysis/TestToken.java | 1 + .../lucene/index/TestDocumentWriter.java | 52 ++- .../apache/lucene/index/TestIndexWriter.java | 90 +++-- .../lucene/index/TestMultiLevelSkipList.java | 21 +- 
.../org/apache/lucene/index/TestPayloads.java | 39 +- .../lucene/index/TestTermVectorsReader.java | 48 ++- .../apache/lucene/index/TestTermdocPerf.java | 32 +- .../lucene/queryParser/TestMultiAnalyzer.java | 77 ++-- .../TestMultiFieldQueryParser.java | 1 - .../lucene/queryParser/TestQueryParser.java | 38 +- .../lucene/search/TestPositionIncrement.java | 37 +- .../apache/lucene/search/TestRangeQuery.java | 18 +- .../lucene/search/payloads/PayloadHelper.java | 21 +- .../payloads/TestBoostingTermQuery.java | 27 +- .../lucene/search/spans/TestPayloadSpans.java | 33 +- .../apache/lucene/util/LuceneTestCase.java | 2 + 69 files changed, 3226 insertions(+), 628 deletions(-) create mode 100644 src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java create mode 100644 src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java create mode 100644 src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java create mode 100644 src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java create mode 100644 src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java create mode 100644 src/java/org/apache/lucene/analysis/tokenattributes/TypeAttribute.java create mode 100644 src/java/org/apache/lucene/util/Attribute.java create mode 100644 src/java/org/apache/lucene/util/AttributeSource.java diff --git a/CHANGES.txt b/CHANGES.txt index 667cc632080..849f45e3504 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -25,6 +25,15 @@ API Changes and deprecate FSDirectory.getDirectory(). FSDirectory instances are not required to be singletons per path. (yonik) +4. LUCENE-1422: New TokenStream API that uses a new class called + AttributeSource instead of the now deprecated Token class. All attributes + that the Token class had have been moved into separate classes: + TermAttribute, OffsetAttribute, PositionIncrementAttribute, + PayloadAttribute, TypeAttribute and FlagsAttribute. The new API + is much more flexible; it allows to combine the Attributes arbitrarily + and also to define custom Attributes. The new API has the same performance + as the old next(Token) approach. (Michael Busch) + Bug fixes 1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals() diff --git a/src/java/org/apache/lucene/analysis/CachingTokenFilter.java b/src/java/org/apache/lucene/analysis/CachingTokenFilter.java index d91074a2653..3a4ab989fa5 100644 --- a/src/java/org/apache/lucene/analysis/CachingTokenFilter.java +++ b/src/java/org/apache/lucene/analysis/CachingTokenFilter.java @@ -22,6 +22,8 @@ import java.util.Iterator; import java.util.LinkedList; import java.util.List; +import org.apache.lucene.util.AttributeSource; + /** * This class can be used if the Tokens of a TokenStream * are intended to be consumed more than once. It caches @@ -34,12 +36,31 @@ import java.util.List; */ public class CachingTokenFilter extends TokenFilter { private List cache; - private Iterator iterator; + private Iterator iterator; public CachingTokenFilter(TokenStream input) { super(input); } + public boolean incrementToken() throws IOException { + if (cache == null) { + // fill cache lazily + cache = new LinkedList(); + fillCache(); + iterator = cache.iterator(); + } + + if (!iterator.hasNext()) { + // the cache is exhausted, return null + return false; + } + // Since the TokenFilter can be reset, the tokens need to be preserved as immutable. 
+ AttributeSource state = (AttributeSource) iterator.next(); + state.restoreState(this); + return true; + } + + /** @deprecated */ public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; if (cache == null) { @@ -60,10 +81,17 @@ public class CachingTokenFilter extends TokenFilter { public void reset() throws IOException { if(cache != null) { - iterator = cache.iterator(); + iterator = cache.iterator(); } } + private void fillCache() throws IOException { + while(input.incrementToken()) { + cache.add(captureState()); + } + } + + /** @deprecated */ private void fillCache(final Token reusableToken) throws IOException { for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) { cache.add(nextToken.clone()); diff --git a/src/java/org/apache/lucene/analysis/CharTokenizer.java b/src/java/org/apache/lucene/analysis/CharTokenizer.java index d4356651fdf..5d090e71640 100644 --- a/src/java/org/apache/lucene/analysis/CharTokenizer.java +++ b/src/java/org/apache/lucene/analysis/CharTokenizer.java @@ -20,16 +20,24 @@ package org.apache.lucene.analysis; import java.io.IOException; import java.io.Reader; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + /** An abstract base class for simple, character-oriented tokenizers.*/ public abstract class CharTokenizer extends Tokenizer { public CharTokenizer(Reader input) { super(input); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } private int offset = 0, bufferIndex = 0, dataLen = 0; private static final int MAX_WORD_LEN = 255; private static final int IO_BUFFER_SIZE = 4096; private final char[] ioBuffer = new char[IO_BUFFER_SIZE]; + + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; /** Returns true iff a character should be included in a token. This * tokenizer generates as tokens adjacent sequences of characters which @@ -44,6 +52,50 @@ public abstract class CharTokenizer extends Tokenizer { return c; } + public final boolean incrementToken() throws IOException { + clearAttributes(); + int length = 0; + int start = bufferIndex; + char[] buffer = termAtt.termBuffer(); + while (true) { + + if (bufferIndex >= dataLen) { + offset += dataLen; + dataLen = input.read(ioBuffer); + if (dataLen == -1) { + if (length > 0) + break; + else + return false; + } + bufferIndex = 0; + } + + final char c = ioBuffer[bufferIndex++]; + + if (isTokenChar(c)) { // if it's a token char + + if (length == 0) // start of token + start = offset + bufferIndex - 1; + else if (length == buffer.length) + buffer = termAtt.resizeTermBuffer(1+length); + + buffer[length++] = normalize(c); // buffer it, normalized + + if (length == MAX_WORD_LEN) // buffer overflow! 
+ break; + + } else if (length > 0) // at non-Letter w/ chars + break; // return 'em + } + + termAtt.setTermLength(length); + offsetAtt.setStartOffset(start); + offsetAtt.setEndOffset(start+length); + return true; + } + + /** @deprecated */ public final Token next(final Token reusableToken) throws IOException { assert reusableToken != null; reusableToken.clear(); diff --git a/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java b/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java index 8f10e984702..3a5a1170ece 100644 --- a/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java +++ b/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java @@ -1,5 +1,7 @@ package org.apache.lucene.analysis; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -27,11 +29,33 @@ package org.apache.lucene.analysis; public class ISOLatin1AccentFilter extends TokenFilter { public ISOLatin1AccentFilter(TokenStream input) { super(input); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } private char[] output = new char[256]; private int outputPos; - + private TermAttribute termAtt; + + public final boolean incrementToken() throws java.io.IOException { + if (input.incrementToken()) { + final char[] buffer = termAtt.termBuffer(); + final int length = termAtt.termLength(); + // If no characters actually require rewriting then we + // just return token as-is: + for(int i=0;i= '\u00c0' && c <= '\uFB06') { + removeAccents(buffer, length); + termAtt.setTermBuffer(output, 0, outputPos); + break; + } + } + return true; + } else + return false; + } + + /** @deprecated */ public final Token next(final Token reusableToken) throws java.io.IOException { assert reusableToken != null; Token nextToken = input.next(reusableToken); @@ -241,7 +265,7 @@ public class ISOLatin1AccentFilter extends TokenFilter { case '\uFB06': // st output[outputPos++] = 's'; output[outputPos++] = 't'; - break; + break; default : output[outputPos++] = c; break; diff --git a/src/java/org/apache/lucene/analysis/KeywordTokenizer.java b/src/java/org/apache/lucene/analysis/KeywordTokenizer.java index 5b1cbf5f17f..3576ac15472 100644 --- a/src/java/org/apache/lucene/analysis/KeywordTokenizer.java +++ b/src/java/org/apache/lucene/analysis/KeywordTokenizer.java @@ -20,6 +20,9 @@ package org.apache.lucene.analysis; import java.io.IOException; import java.io.Reader; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + /** * Emits the entire input as a single token. 
*/ @@ -28,7 +31,9 @@ public class KeywordTokenizer extends Tokenizer { private static final int DEFAULT_BUFFER_SIZE = 256; private boolean done; - + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; + public KeywordTokenizer(Reader input) { this(input, DEFAULT_BUFFER_SIZE); } @@ -36,8 +41,32 @@ public class KeywordTokenizer extends Tokenizer { public KeywordTokenizer(Reader input, int bufferSize) { super(input); this.done = false; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + } + + public boolean incrementToken() throws IOException { + if (!done) { + done = true; + int upto = 0; + termAtt.clear(); + char[] buffer = termAtt.termBuffer(); + while (true) { + final int length = input.read(buffer, upto, buffer.length-upto); + if (length == -1) break; + upto += length; + if (upto == buffer.length) + buffer = termAtt.resizeTermBuffer(1+buffer.length); + } + termAtt.setTermLength(upto); + offsetAtt.setStartOffset(0); + offsetAtt.setEndOffset(upto); + return true; + } + return false; } + /** @deprecated */ public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; if (!done) { diff --git a/src/java/org/apache/lucene/analysis/LengthFilter.java b/src/java/org/apache/lucene/analysis/LengthFilter.java index 8176c86b182..b090cd23d9c 100644 --- a/src/java/org/apache/lucene/analysis/LengthFilter.java +++ b/src/java/org/apache/lucene/analysis/LengthFilter.java @@ -19,6 +19,8 @@ package org.apache.lucene.analysis; import java.io.IOException; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + /** * Removes words that are too long and too short from the stream. * @@ -29,6 +31,8 @@ public final class LengthFilter extends TokenFilter { final int min; final int max; + + private TermAttribute termAtt; /** * Build a filter that removes words that are too long or too @@ -39,10 +43,28 @@ public final class LengthFilter extends TokenFilter { super(in); this.min = min; this.max = max; + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + } + + /** + * Returns the next input Token whose term() is the right len + */ + public final boolean incrementToken() throws IOException { + // return the first non-stop word found + while (input.incrementToken()) { + int len = termAtt.termLength(); + if (len >= min && len <= max) { + return true; + } + // note: else we ignore it but should we index each part of it? + } + // reached EOS -- return null + return false; } /** * Returns the next input Token whose term() is the right len + * @deprecated */ public final Token next(final Token reusableToken) throws IOException { diff --git a/src/java/org/apache/lucene/analysis/LowerCaseFilter.java b/src/java/org/apache/lucene/analysis/LowerCaseFilter.java index 1e6316db1bd..0c146e2a64d 100644 --- a/src/java/org/apache/lucene/analysis/LowerCaseFilter.java +++ b/src/java/org/apache/lucene/analysis/LowerCaseFilter.java @@ -19,6 +19,8 @@ package org.apache.lucene.analysis; import java.io.IOException; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + /** * Normalizes token text to lower case. 
* @@ -27,8 +29,25 @@ import java.io.IOException; public final class LowerCaseFilter extends TokenFilter { public LowerCaseFilter(TokenStream in) { super(in); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } + private TermAttribute termAtt; + + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + + final char[] buffer = termAtt.termBuffer(); + final int length = termAtt.termLength(); + for(int i=0;i*/ lst = new ArrayList/**/(); protected Iterator/**/ iter; - + public SinkTokenizer(List/**/ input) { this.lst = input; if (this.lst == null) this.lst = new ArrayList/**/(); @@ -61,10 +63,30 @@ public class SinkTokenizer extends Tokenizer { return lst; } + /** + * Increments this stream to the next token out of the list of cached tokens + * @throws IOException + */ + public boolean incrementToken() throws IOException { + if (iter == null) iter = lst.iterator(); + // Since this TokenStream can be reset we have to maintain the tokens as immutable + if (iter.hasNext()) { + AttributeSource state = (AttributeSource) iter.next(); + state.restoreState(this); + return true; + } + return false; + } + + public void add(AttributeSource source) throws IOException { + lst.add(source); + } + /** * Returns the next token out of the list of cached tokens * @return The next {@link org.apache.lucene.analysis.Token} in the Sink. * @throws IOException + * @deprecated */ public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; @@ -77,8 +99,6 @@ public class SinkTokenizer extends Tokenizer { return null; } - - /** * Override this method to cache only certain tokens, or new tokens based * on the old tokens. diff --git a/src/java/org/apache/lucene/analysis/StopFilter.java b/src/java/org/apache/lucene/analysis/StopFilter.java index 2fdd86c3036..b5fd0e9bf99 100644 --- a/src/java/org/apache/lucene/analysis/StopFilter.java +++ b/src/java/org/apache/lucene/analysis/StopFilter.java @@ -21,6 +21,9 @@ import java.io.IOException; import java.util.Arrays; import java.util.Set; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + /** * Removes stop words from a token stream. */ @@ -32,6 +35,9 @@ public final class StopFilter extends TokenFilter { private final CharArraySet stopWords; private boolean enablePositionIncrements = ENABLE_POSITION_INCREMENTS_DEFAULT; + private TermAttribute termAtt; + private PositionIncrementAttribute posIncrAtt; + /** * Construct a token stream filtering the given input. 
*/ @@ -47,6 +53,7 @@ public final class StopFilter extends TokenFilter { public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) { super(in); this.stopWords = (CharArraySet)makeStopSet(stopWords, ignoreCase); + init(); } @@ -74,6 +81,7 @@ public final class StopFilter extends TokenFilter { this.stopWords = new CharArraySet(stopWords.size(), ignoreCase); this.stopWords.addAll(stopWords); } + init(); } /** @@ -85,6 +93,11 @@ public final class StopFilter extends TokenFilter { public StopFilter(TokenStream in, Set stopWords) { this(in, stopWords, false); } + + public void init() { + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + } /** * Builds a Set from an array of stop words, @@ -109,9 +122,29 @@ public final class StopFilter extends TokenFilter { stopSet.addAll(Arrays.asList(stopWords)); return stopSet; } + + /** + * Returns the next input Token whose term() is not a stop word. + */ + public final boolean incrementToken() throws IOException { + // return the first non-stop word found + int skippedPositions = 0; + while (input.incrementToken()) { + if (!stopWords.contains(termAtt.termBuffer(), 0, termAtt.termLength())) { + if (enablePositionIncrements) { + posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); + } + return true; + } + skippedPositions += posIncrAtt.getPositionIncrement(); + } + // reached EOS -- return null + return false; + } /** * Returns the next input Token whose term() is not a stop word. + * @deprecated */ public final Token next(final Token reusableToken) throws IOException { assert reusableToken != null; diff --git a/src/java/org/apache/lucene/analysis/TeeTokenFilter.java b/src/java/org/apache/lucene/analysis/TeeTokenFilter.java index 0a3ea04ad28..ec2606c1a00 100644 --- a/src/java/org/apache/lucene/analysis/TeeTokenFilter.java +++ b/src/java/org/apache/lucene/analysis/TeeTokenFilter.java @@ -18,6 +18,7 @@ package org.apache.lucene.analysis; import java.io.IOException; +import java.util.Iterator; /** @@ -60,8 +61,21 @@ public class TeeTokenFilter extends TokenFilter { public TeeTokenFilter(TokenStream input, SinkTokenizer sink) { super(input); this.sink = sink; + Iterator it = getAttributesIterator(); + while (it.hasNext()) { + sink.addAttribute(it.next().getClass()); + } + } + + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + sink.add(captureState()); + return true; + } + return false; } + /** @deprecated */ public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; Token nextToken = input.next(reusableToken); diff --git a/src/java/org/apache/lucene/analysis/Token.java b/src/java/org/apache/lucene/analysis/Token.java index e831f383c46..f5d48516898 100644 --- a/src/java/org/apache/lucene/analysis/Token.java +++ b/src/java/org/apache/lucene/analysis/Token.java @@ -21,7 +21,11 @@ import org.apache.lucene.index.Payload; import org.apache.lucene.index.TermPositions; // for javadoc import org.apache.lucene.util.ArrayUtil; -/** A Token is an occurrence of a term from the text of a field. It consists of +/** + This class is now deprecated and a new TokenStream API was introduced with Lucene 2.9. + See Javadocs in {@link TokenStream} for further details. +

+ A Token is an occurrence of a term from the text of a field. It consists of a term's text, the start and end offset of the term in the text of the field, and a type string.

@@ -114,6 +118,8 @@ import org.apache.lucene.util.ArrayUtil;

@see org.apache.lucene.index.Payload + @deprecated A new TokenStream API was introduced with Lucene 2.9. + See javadocs in {@link TokenStream} for further details. */ public class Token implements Cloneable { diff --git a/src/java/org/apache/lucene/analysis/TokenFilter.java b/src/java/org/apache/lucene/analysis/TokenFilter.java index 300cb550a86..6988e332cc1 100644 --- a/src/java/org/apache/lucene/analysis/TokenFilter.java +++ b/src/java/org/apache/lucene/analysis/TokenFilter.java @@ -22,9 +22,16 @@ import java.io.IOException; /** A TokenFilter is a TokenStream whose input is another token stream.

This is an abstract class. - NOTE: subclasses must override {@link #next(Token)}. It's - also OK to instead override {@link #next()} but that - method is now deprecated in favor of {@link #next(Token)}. + NOTE: subclasses must override + {@link #incrementToken()} if the new TokenStream API is used + and {@link #next(Token)} or {@link #next()} if the old + TokenStream API is used. + *

+ * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. +

+ See {@link TokenStream} */ public abstract class TokenFilter extends TokenStream { /** The source of tokens for this filter. */ @@ -32,9 +39,10 @@ public abstract class TokenFilter extends TokenStream { /** Construct a token stream filtering the given input. */ protected TokenFilter(TokenStream input) { + super(input); this.input = input; } - + /** Close the input TokenStream. */ public void close() throws IOException { input.close(); @@ -45,4 +53,17 @@ public abstract class TokenFilter extends TokenStream { super.reset(); input.reset(); } + + public boolean useNewAPI() { + return input.useNewAPI(); + } + + /** + * Sets whether or not to use the new TokenStream API. Settings this + * will apply to this Filter and all TokenStream/Filters upstream. + */ + public void setUseNewAPI(boolean use) { + input.setUseNewAPI(use); + } + } diff --git a/src/java/org/apache/lucene/analysis/TokenStream.java b/src/java/org/apache/lucene/analysis/TokenStream.java index 604f4a27cd5..6a9161e8ae3 100644 --- a/src/java/org/apache/lucene/analysis/TokenStream.java +++ b/src/java/org/apache/lucene/analysis/TokenStream.java @@ -17,9 +17,12 @@ package org.apache.lucene.analysis; * limitations under the License. */ -import org.apache.lucene.index.Payload; - import java.io.IOException; +import java.util.Iterator; + +import org.apache.lucene.index.Payload; +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeSource; /** A TokenStream enumerates the sequence of tokens, either from fields of a document or from query text. @@ -31,13 +34,140 @@ import java.io.IOException;

  • {@link TokenFilter}, a TokenStream whose input is another TokenStream. - NOTE: subclasses must override {@link #next(Token)}. It's - also OK to instead override {@link #next()} but that - method is now deprecated in favor of {@link #next(Token)}. + A new TokenStream API is introduced with Lucene 2.9. Since + 2.9 Token is deprecated and the preferred way to store + the information of a token is to use {@link Attribute}s. +

    + For that reason TokenStream extends {@link AttributeSource} + now. Note that only one instance per {@link Attribute} is + created and reused for every token. This approach reduces + object creations and allows local caching of references to + the {@link Attribute}s. See {@link #incrementToken()} for further details. +

    + The workflow of the new TokenStream API is as follows: +

      +
    1. Instantiation of TokenStream/TokenFilters which add/get attributes + to/from the {@link AttributeSource}. +
    2. The consumer calls {@link TokenStream#reset()}. +
3. The consumer retrieves attributes from the + stream and stores local references to all attributes it wants to access.
    4. The consumer calls {@link #incrementToken()} until it returns false and + consumes the attributes after each call. +
+ To make sure that filters and consumers know which attributes are available, + the attributes must be added during instantiation. Filters and + consumers are not required to check for availability of attributes in {@link #incrementToken()}.

+ Sometimes it is desirable to capture the current state of a + TokenStream, e.g. for buffering purposes (see {@link CachingTokenFilter}, + {@link TeeTokenFilter}/{@link SinkTokenizer}). For this use case + {@link AttributeSource#captureState()} and {@link AttributeSource#restoreState(AttributeSource)} can be used.

+ NOTE: In order to enable the new API, the method + {@link #setUseNewAPI(boolean)} has to be called with useNewAPI=true. + Otherwise the deprecated method {@link #next(Token)} will + be used by Lucene consumers (indexer and queryparser) to + consume the tokens. {@link #next(Token)} will be removed + in Lucene 3.0.

    + NOTE: To use the old API subclasses must override {@link #next(Token)}. + It's also OK to instead override {@link #next()} but that + method is slower compared to {@link #next(Token)}. + *
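     A minimal consumer following the workflow above might look like the sketch below
     (illustrative only; it assumes an existing Analyzer instance and that only the term
     text is of interest):

       TokenStream stream = analyzer.tokenStream("myfield", new StringReader("some text goes here"));
       stream.reset();
       TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
       while (stream.incrementToken()) {
         System.out.println(termAtt.term());
       }
       stream.close();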

    + * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. */ -public abstract class TokenStream { +public abstract class TokenStream extends AttributeSource { + private static boolean useNewAPIDefault = false; + private boolean useNewAPI = useNewAPIDefault; + + protected TokenStream() { + super(); + } + + protected TokenStream(AttributeSource input) { + super(input); + } + /** + * Returns whether or not the new TokenStream APIs are used + * by default. + * (see {@link #incrementToken()}, {@link AttributeSource}). + */ + public static boolean useNewAPIDefault() { + return useNewAPIDefault; + } + + /** + * Use this API to enable or disable the new TokenStream API. + * by default. Can be overridden by calling {@link #setUseNewAPI(boolean)}. + * (see {@link #incrementToken()}, {@link AttributeSource}). + *

    + * If set to true, the indexer will call {@link #incrementToken()} + * to consume Tokens from this stream. + *

    + * If set to false, the indexer will call {@link #next(Token)} + * instead. + */ + public static void setUseNewAPIDefault(boolean use) { + useNewAPIDefault = use; + } + + /** + * Returns whether or not the new TokenStream APIs are used + * for this stream. + * (see {@link #incrementToken()}, {@link AttributeSource}). + */ + public boolean useNewAPI() { + return useNewAPI; + } + + /** + * Use this API to enable or disable the new TokenStream API + * for this stream. Overrides {@link #setUseNewAPIDefault(boolean)}. + * (see {@link #incrementToken()}, {@link AttributeSource}). + *

    + * If set to true, the indexer will call {@link #incrementToken()} + * to consume Tokens from this stream. + *

    + * If set to false, the indexer will call {@link #next(Token)} + * instead. + *

    + * NOTE: All streams and filters in one chain must use the + * same API. + */ + public void setUseNewAPI(boolean use) { + useNewAPI = use; + } + + /** + * Consumers (e. g. the indexer) use this method to advance the stream + * to the next token. Implementing classes must implement this method + * and update the appropriate {@link Attribute}s with content of the + * next token. + *

    + * This method is called for every token of a document, so an efficient + * implementation is crucial for good performance. To avoid calls to + * {@link #addAttribute(Class)} and {@link #getAttribute(Class)} and + * downcasts, references to all {@link Attribute}s that this stream uses + * should be retrieved during instantiation. + *

    + * To make sure that filters and consumers know which attributes are available + * the attributes must be added during instantiation. Filters and + * consumers are not required to check for availability of attributes in {@link #incrementToken()}. + * + * @return false for end of stream; true otherwise + * + *

    + * Note that this method will be defined abstract in Lucene 3.0. + */ + public boolean incrementToken() throws IOException { + // subclasses must implement this method; will be made abstract in Lucene 3.0 + return false; + } + /** Returns the next token in the stream, or null at EOS. * @deprecated The returned Token is a "full private copy" (not * re-used across calls to next()) but will be slower @@ -84,6 +214,8 @@ public abstract class TokenStream { * is not required to check for null before using it, but it is a * good idea to assert that it is not null.) * @return next token in the stream or null if end-of-stream was hit + * @deprecated The new {@link #incrementToken()} and {@link AttributeSource} + * APIs should be used instead. See also {@link #useNewAPI()}. */ public Token next(final Token reusableToken) throws IOException { // We don't actually use inputToken, but still add this assert @@ -107,4 +239,25 @@ public abstract class TokenStream { /** Releases resources associated with this stream. */ public void close() throws IOException {} + + public String toString() { + StringBuffer sb = new StringBuffer(); + sb.append('('); + + if (hasAttributes()) { + // TODO Java 1.5 + //Iterator it = attributes.values().iterator(); + Iterator it = getAttributesIterator(); + if (it.hasNext()) { + sb.append(it.next().toString()); + } + while (it.hasNext()) { + sb.append(','); + sb.append(it.next().toString()); + } + } + sb.append(')'); + return sb.toString(); + } + } diff --git a/src/java/org/apache/lucene/analysis/Tokenizer.java b/src/java/org/apache/lucene/analysis/Tokenizer.java index 4c6dc4772a2..1222e73761d 100644 --- a/src/java/org/apache/lucene/analysis/Tokenizer.java +++ b/src/java/org/apache/lucene/analysis/Tokenizer.java @@ -24,12 +24,23 @@ import java.io.IOException;

    This is an abstract class.

- NOTE: subclasses must override {@link #next(Token)}. It's - also OK to instead override {@link #next()} but that - method is now deprecated in favor of {@link #next(Token)}. + NOTE: In order to enable the new API, the method + {@link #setUseNewAPI(boolean)} has to be called with useNewAPI=true. + Otherwise the deprecated method {@link #next(Token)} will + be used by Lucene consumers (indexer and queryparser) to + consume the tokens. {@link #next(Token)} will be removed + in Lucene 3.0.

    + NOTE: To use the old API subclasses must override {@link #next(Token)}. + It's also OK to instead override {@link #next()} but that + method is slower compared to {@link #next(Token)}. +

    NOTE: subclasses overriding {@link #next(Token)} must call {@link Token#clear()}. + *

    + * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. */ public abstract class Tokenizer extends TokenStream { diff --git a/src/java/org/apache/lucene/analysis/package.html b/src/java/org/apache/lucene/analysis/package.html index d5fe91c0041..3ae9fb0f627 100644 --- a/src/java/org/apache/lucene/analysis/package.html +++ b/src/java/org/apache/lucene/analysis/package.html @@ -35,8 +35,7 @@ application using Lucene to use an appropriate Parser to convert the orig

    Tokenization

Plain text passed to Lucene for indexing goes through a process generally called tokenization – namely breaking of the -input text into small indexing elements – -{@link org.apache.lucene.analysis.Token Tokens}. +input text into small indexing elements – tokens. The way input text is broken into tokens very much dictates further capabilities of search upon that text. For instance, sentence beginnings and endings can be identified to provide for more accurate phrase

  • {@link org.apache.lucene.analysis.Analyzer} – An Analyzer is responsible for building a {@link org.apache.lucene.analysis.TokenStream} which can be consumed by the indexing and searching processes. See below for more information on implementing your own Analyzer.
  • {@link org.apache.lucene.analysis.Tokenizer} – A Tokenizer is a {@link org.apache.lucene.analysis.TokenStream} and is responsible for breaking - up incoming text into {@link org.apache.lucene.analysis.Token}s. In most cases, an Analyzer will use a Tokenizer as the first step in + up incoming text into tokens. In most cases, an Analyzer will use a Tokenizer as the first step in the analysis process.
• {@link org.apache.lucene.analysis.TokenFilter} – A TokenFilter is also a {@link org.apache.lucene.analysis.TokenStream} and is responsible - for modifying {@link org.apache.lucene.analysis.Token}s that have been created by the Tokenizer. Common modifications performed by a + for modifying tokens that have been created by the Tokenizer. Common modifications performed by a TokenFilter are: deletion, stemming, synonym injection, and down casing. Not all Analyzers require TokenFilters.
  • + Since Lucene 2.9 the TokenStream API was changed. Please see section "New TokenStream API" below for details.

    Hints, Tips and Traps

    @@ -140,9 +140,8 @@ providing for several functions, including (but not limited to):

           Analyzer analyzer = new StandardAnalyzer(); // or any other analyzer
           TokenStream ts = analyzer.tokenStream("myfield",new StringReader("some text goes here"));
    -      Token t = ts.next();
    -      while (t!=null) {
    -        System.out.println("token: "+t));
    +      while (ts.incrementToken()) {
+        System.out.println("token: "+ts);
-        t = ts.next();
           }
       
    @@ -179,7 +178,7 @@ the source code of any one of the many samples located in this package.

    The following sections discuss some aspects of implementing your own analyzer.

    -

    Field Section Boundaries

    +

    Field Section Boundaries

    When {@link org.apache.lucene.document.Document#add(org.apache.lucene.document.Fieldable) document.add(field)} is called multiple times for the same field name, we could say that each such call creates a new @@ -208,10 +207,10 @@ the source code of any one of the many samples located in this package. };

    -

    Token Position Increments

    +

    Token Position Increments

    By default, all tokens created by Analyzers and Tokenizers have a - {@link org.apache.lucene.analysis.Token#getPositionIncrement() position increment} of one. + {@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#getPositionIncrement() position increment} of one. This means that the position stored for that token in the index would be one more than that of the previous token. Recall that phrase and proximity searches rely on position info. @@ -227,26 +226,29 @@ the source code of any one of the many samples located in this package. If this behavior does not fit the application needs, a modified analyzer can be used, that would increment further the positions of tokens following a removed stop word, using - {@link org.apache.lucene.analysis.Token#setPositionIncrement(int)}. + {@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#setPositionIncrement(int)}. This can be done with something like:

           public TokenStream tokenStream(final String fieldName, Reader reader) {
             final TokenStream ts = someAnalyzer.tokenStream(fieldName, reader);
             TokenStream res = new TokenStream() {
    -          public Token next() throws IOException {
    +          TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    +          PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
    +        
    +          public boolean incrementToken() throws IOException {
                 int extraIncrement = 0;
                 while (true) {
    -              Token t = ts.next();
    -              if (t!=null) {
    -                if (stopWords.contains(t.termText())) {
    +              boolean hasNext = ts.incrementToken();
    +              if (hasNext) {
    +                if (stopWords.contains(termAtt.term())) {
                       extraIncrement++; // filter this word
                       continue;
                     } 
                     if (extraIncrement>0) {
    -                  t.setPositionIncrement(t.getPositionIncrement()+extraIncrement);
    +                  posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+extraIncrement);
                     }
                   }
    -              return t;
    +              return hasNext;
                 }
               }
             };
    @@ -268,5 +270,336 @@ the source code of any one of the many samples located in this package.
              same position as that token, and so would they be seen by phrase and proximity searches.
        
     
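The synonym injection described in the last point can be sketched with the new attribute-based
API (see the "New TokenStream API" section below). This is an illustrative, hypothetical filter,
not part of Lucene; it emits at most one synonym per token, taken from a caller-supplied Map of
term to synonym:

  public class SingleSynonymFilter extends TokenFilter {
    private final Map synonyms;              // e.g. maps "fast" to "quick"
    private String pendingSynonym;           // synonym waiting to be emitted
    private TermAttribute termAtt;
    private PositionIncrementAttribute posIncrAtt;

    public SingleSynonymFilter(TokenStream input, Map synonyms) {
      super(input);
      this.synonyms = synonyms;
      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
      posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
    }

    public boolean incrementToken() throws IOException {
      if (pendingSynonym != null) {
        // emit the buffered synonym at the same position as the original token
        char[] chars = pendingSynonym.toCharArray();
        termAtt.setTermBuffer(chars, 0, chars.length);
        posIncrAtt.setPositionIncrement(0);
        pendingSynonym = null;
        return true;
      }
      if (!input.incrementToken()) {
        return false;
      }
      pendingSynonym = (String) synonyms.get(termAtt.term());
      return true;
    }
  }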

    +

    New TokenStream API

    +

    + With Lucene 2.9 we introduce a new TokenStream API. The old API used to produce Tokens. A Token + has getter and setter methods for different properties like positionIncrement and termText. + While this approach was sufficient for the default indexing format, it is not versatile enough for + Flexible Indexing, a term which summarizes the effort of making the Lucene indexer pluggable and extensible for custom + index formats. +

    +

    +A fully customizable indexer means that users will be able to store custom data structures on disk. Therefore an API +is necessary that can transport custom types of data from the documents to the indexer. +

    +

    Attribute and AttributeSource

    +Lucene 2.9 therefore introduces a new pair of classes called {@link org.apache.lucene.util.Attribute} and +{@link org.apache.lucene.util.AttributeSource}. An Attribute serves as a +particular piece of information about a text token. For example, {@link org.apache.lucene.analysis.tokenattributes.TermAttribute} + contains the term text of a token, and {@link org.apache.lucene.analysis.tokenattributes.OffsetAttribute} contains the start and end character offsets of a token. +An AttributeSource is a collection of Attributes with a restriction: there may be only one instance of each attribute type. TokenStream now extends AttributeSource, which +means that one can add Attributes to a TokenStream. Since TokenFilter extends TokenStream, all filters are also +AttributeSources. +

    + Lucene now provides six Attributes out of the box, which replace the variables the Token class has: +

      +
    • {@link org.apache.lucene.analysis.tokenattributes.TermAttribute}

      The term text of a token.

    • +
    • {@link org.apache.lucene.analysis.tokenattributes.OffsetAttribute}

The start and end offset of a token in characters.

    • +
    • {@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute}

      See above for detailed information about position increment.

    • +
    • {@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute}

      The payload that a Token can optionally have.

    • +
    • {@link org.apache.lucene.analysis.tokenattributes.TypeAttribute}

      The type of the token. Default is 'word'.

    • +
    • {@link org.apache.lucene.analysis.tokenattributes.FlagsAttribute}

      Optional flags a token can have.

    • +
    +

    +
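As a rough illustration of how these six attributes replace the setters of the deprecated Token
class, code inside a tokenizer's incrementToken() that used to fill a reusable Token would now
update the corresponding attributes instead (a sketch only; termAtt, offsetAtt and posIncrAtt are
assumed instance variables obtained via addAttribute(), and buffer, start and length are assumed locals):

  // old API: fill the reusable Token
  //   reusableToken.setTermBuffer(buffer, 0, length);
  //   reusableToken.setStartOffset(start);
  //   reusableToken.setEndOffset(start + length);
  //   reusableToken.setPositionIncrement(1);

  // new API: write the same information into the shared attributes
  termAtt.setTermBuffer(buffer, 0, length);
  offsetAtt.setStartOffset(start);
  offsetAtt.setEndOffset(start + length);
  posIncrAtt.setPositionIncrement(1);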

    Using the new TokenStream API

    +There are a few important things to know in order to use the new API efficiently which are summarized here. You may want +to walk through the example below first and come back to this section afterwards. +
    1. +Please keep in mind that an AttributeSource can only have one instance of a particular Attribute. Furthermore, if +a chain of a TokenStream and multiple TokenFilters is used, then all TokenFilters in that chain share the Attributes +with the TokenStream. +
    2. +
      +
3. +Attribute instances are reused for all tokens of a document. Thus, a TokenStream/-Filter needs to update +the appropriate Attribute(s) in incrementToken(). The consumer, commonly the Lucene indexer, consumes the data in the +Attributes and then calls incrementToken() again until it returns false, which indicates that the end of the stream +was reached. This means that in each call of incrementToken() a TokenStream/-Filter can safely overwrite the data in +the Attribute instances.
    4. +
      +
    5. +For performance reasons a TokenStream/-Filter should add/get Attributes during instantiation; i.e., create an attribute in the +constructor and store references to it in an instance variable. Using an instance variable instead of calling addAttribute()/getAttribute() +in incrementToken() will avoid expensive casting and attribute lookups for every token in the document. +
    6. +
      +
    7. +All methods in AttributeSource are idempotent, which means calling them multiple times always yields the same +result. This is especially important to know for addAttribute(). The method takes the type (Class) +of an Attribute as an argument and returns an instance. If an Attribute of the same type was previously added, then +the already existing instance is returned, otherwise a new instance is created and returned. Therefore TokenStreams/-Filters +can safely call addAttribute() with the same Attribute type multiple times. +
    +
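The sharing and idempotency points above can be illustrated with a short sketch (assuming a Reader
named reader is in scope):

  TokenStream tokenizer = new WhitespaceTokenizer(reader);
  TokenStream filter = new LengthFilter(tokenizer, 3, Integer.MAX_VALUE);

  // Both calls return the very same TermAttribute instance: the filter shares
  // its AttributeSource with the tokenizer, and addAttribute() returns the
  // already existing instance instead of creating a second one.
  TermAttribute a = (TermAttribute) tokenizer.addAttribute(TermAttribute.class);
  TermAttribute b = (TermAttribute) filter.addAttribute(TermAttribute.class);
  assert a == b;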

    Example

+In this example we will create a WhitespaceTokenizer and use a LengthFilter to suppress all words that +have only two or fewer characters. The LengthFilter is part of the Lucene core and its implementation will be explained +here to illustrate the usage of the new TokenStream API.
    +Then we will develop a custom Attribute, a PartOfSpeechAttribute, and add another filter to the chain which +utilizes the new custom attribute, and call it PartOfSpeechTaggingFilter. +

    Whitespace tokenization

    +
    +public class MyAnalyzer extends Analyzer {
    +
    +  public TokenStream tokenStream(String fieldName, Reader reader) {
    +    TokenStream stream = new WhitespaceTokenizer(reader);
    +    return stream;
    +  }
    +  
    +  public static void main(String[] args) throws IOException {
    +    // text to tokenize
    +    final String text = "This is a demo of the new TokenStream API";
    +    
    +    MyAnalyzer analyzer = new MyAnalyzer();
    +    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
    +    
    +    // get the TermAttribute from the TokenStream
    +    TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
    +    
    +    // print all tokens until stream is exhausted
    +    while (stream.incrementToken()) {
    +      System.out.println(termAtt.term());
    +    }
    +  }
    +}
    +
+In this simple example, plain whitespace tokenization is performed. In main() a loop consumes the stream and +prints the term text of the tokens by accessing the TermAttribute that the WhitespaceTokenizer provides. +Here is the output:
    +This
    +is
    +a
    +demo
    +of
    +the
    +new
    +TokenStream
    +API
    +
    +

    Adding a LengthFilter

+We want to suppress all tokens that have 2 or fewer characters. We can do that easily by adding a LengthFilter +to the chain. Only the tokenStream() method in our analyzer needs to be changed:
    +  public TokenStream tokenStream(String fieldName, Reader reader) {
    +    TokenStream stream = new WhitespaceTokenizer(reader);
    +    stream = new LengthFilter(stream, 3, Integer.MAX_VALUE);
    +    return stream;
    +  }
    +
+Note how only words with 3 or more characters are now contained in the output:
    +This
    +demo
    +the
    +new
    +TokenStream
    +API
    +
+Now let's take a look at how the LengthFilter is implemented (it is part of Lucene's core):
    +public final class LengthFilter extends TokenFilter {
    +
    +  final int min;
    +  final int max;
    +  
    +  private TermAttribute termAtt;
    +
    +  /**
    +   * Build a filter that removes words that are too long or too
    +   * short from the text.
    +   */
    +  public LengthFilter(TokenStream in, int min, int max)
    +  {
    +    super(in);
    +    this.min = min;
    +    this.max = max;
    +    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    +  }
    +  
    +  /**
    +   * Returns the next input Token whose term() is the right len
    +   */
    +  public final boolean incrementToken() throws IOException
    +  {
    +    assert termAtt != null;
    +    // return the first non-stop word found
    +    while (input.incrementToken()) {
    +      int len = termAtt.termLength();
    +      if (len >= min && len <= max) {
    +          return true;
    +      }
    +      // note: else we ignore it but should we index each part of it?
    +    }
    +    // reached EOS -- return null
    +    return false;
    +  }
    +}
    +
+The TermAttribute is added in the constructor and stored in the instance variable termAtt. +Remember that there can only be a single instance of TermAttribute in the chain, so in our example the +addAttribute() call in LengthFilter returns the TermAttribute that the WhitespaceTokenizer already added. The tokens +are retrieved from the input stream in the incrementToken() method. By looking at the term text +in the TermAttribute the length of the term can be determined, and tokens that are too short or too long are skipped. +Note how incrementToken() can efficiently access the instance variable; no attribute lookup or downcasting +is necessary. The same is true for the consumer, which can simply use local references to the Attributes.

    Adding a custom Attribute

    +Now we're going to implement our own custom Attribute for part-of-speech tagging and call it consequently +PartOfSpeechAttribute: +
    +  public static enum PartOfSpeech {
    +    Noun, Verb, Adjective, Adverb, Pronoun, Preposition, Conjunction, Article, Unknown
    +  }
    +  
    +  public static final class PartOfSpeechAttribute extends Attribute {
    +    
    +    private PartOfSpeech pos = PartOfSpeech.Unknown;
    +    
    +    public void setPartOfSpeech(PartOfSpeech pos) {
    +      this.pos = pos;
    +    }
    +    
    +    public PartOfSpeech getPartOfSpeech() {
    +      return pos;
    +    }
    +
    +    public void clear() {
    +      pos = PartOfSpeech.Unknown;
    +    }
    +
    +    public void copyTo(Attribute target) {
    +      ((PartOfSpeechAttribute) target).pos = pos;
    +    }
    +
    +    public boolean equals(Object other) {
    +      if (other == this) {
    +        return true;
    +      }
    +      
    +      if (other instanceof PartOfSpeechAttribute) {
    +        return pos == ((PartOfSpeechAttribute) other).pos;
    +      }
    +   
    +      return false;
    +    }
    +
    +    public int hashCode() {
    +      return pos.ordinal();
    +    }
    +
    +    public String toString() {
    +      return "PartOfSpeech=" + pos;
    +    }
    +  }
    +
    +This is a simple Attribute that has only a single variable that stores the part-of-speech of a token. It extends the +new Attribute class and therefore implements its abstract methods clear(), copyTo(), equals(), hashCode(), toString(). +Now we need a TokenFilter that can set this new PartOfSpeechAttribute for each token. In this example we show a very naive filter +that tags every word with a leading upper-case letter as a 'Noun' and all other words as 'Unknown'. +
    +  public static class PartOfSpeechTaggingFilter extends TokenFilter {
    +    PartOfSpeechAttribute posAtt;
    +    TermAttribute termAtt;
    +    
    +    protected PartOfSpeechTaggingFilter(TokenStream input) {
    +      super(input);
    +      posAtt = (PartOfSpeechAttribute) addAttribute(PartOfSpeechAttribute.class);
    +      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    +    }
    +    
    +    public boolean incrementToken() throws IOException {
    +      if (!input.incrementToken()) {return false;}
    +      posAtt.setPartOfSpeech(determinePOS(termAtt.termBuffer(), 0, termAtt.termLength()));
    +      return true;
    +    }
    +    
    +    // determine the part of speech for the given term
    +    protected PartOfSpeech determinePOS(char[] term, int offset, int length) {
    +      // naive implementation that tags every uppercased word as noun
    +      if (length > 0 && Character.isUpperCase(term[0])) {
    +        return PartOfSpeech.Noun;
    +      }
    +      return PartOfSpeech.Unknown;
    +    }
    +  }
    +
    +Just like the LengthFilter, this new filter accesses the attributes it needs in the constructor and +stores references in instance variables. Now we need to add the filter to the chain: +
    +  public TokenStream tokenStream(String fieldName, Reader reader) {
    +    TokenStream stream = new WhitespaceTokenizer(reader);
    +    stream = new LengthFilter(stream, 3, Integer.MAX_VALUE);
    +    stream = new PartOfSpeechTaggingFilter(stream);
    +    return stream;
    +  }
    +
    +Now let's look at the output: +
    +This
    +demo
    +the
    +new
    +TokenStream
    +API
    +
+Apparently it hasn't changed, which shows that adding a custom attribute to a TokenStream/Filter chain does not +affect any existing consumers, simply because they don't know about the new Attribute. Now let's change the consumer +to make use of the new PartOfSpeechAttribute and print it out:
    +  public static void main(String[] args) throws IOException {
    +    // text to tokenize
    +    final String text = "This is a demo of the new TokenStream API";
    +    
    +    MyAnalyzer analyzer = new MyAnalyzer();
    +    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
    +    
    +    // get the TermAttribute from the TokenStream
    +    TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
    +    
    +    // get the PartOfSpeechAttribute from the TokenStream
    +    PartOfSpeechAttribute posAtt = (PartOfSpeechAttribute) stream.getAttribute(PartOfSpeechAttribute.class);
    +    
    +    // print all tokens until stream is exhausted
    +    while (stream.incrementToken()) {
    +      System.out.println(termAtt.term() + ": " + posAtt.getPartOfSpeech());
    +    }
    +  }
    +
    +The change that was made is to get the PartOfSpeechAttribute from the TokenStream and print out its contents in +the while loop that consumes the stream. Here is the new output: +
    +This: Noun
    +demo: Unknown
    +the: Unknown
    +new: Unknown
    +TokenStream: Noun
    +API: Noun
    +
+Each word is now followed by its assigned PartOfSpeech tag. Of course this is naive +part-of-speech tagging. The word 'This' should not even be tagged as a noun; it is only capitalized because it +is the first word of a sentence. Actually this is a good opportunity for an exercise. To practice the usage of the new +API the reader could now write an Attribute and TokenFilter that can specify for each word whether it was the first token +of a sentence or not. Then the PartOfSpeechTaggingFilter can make use of this knowledge and only tag capitalized words +as nouns if they are not the first word of a sentence (we know, this is still not correct behavior, but hey, it's a good exercise). +As a small hint, this is how the new Attribute class could begin:
    +  public class FirstTokenOfSentenceAttribute extends Attribute {
    +    
    +    private boolean firstToken;
    +    
    +    public void setFirstToken(boolean firstToken) {
    +      this.firstToken = firstToken;
    +    }
    +    
    +    public boolean getFirstToken() {
    +      return firstToken;
    +    }
    +
    +    public void clear() {
    +      firstToken = false;
    +    }
    +
    +  ...
    +
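Mirroring the PartOfSpeechAttribute shown earlier, the remaining methods of this hinted class could be
completed along the following lines (one possible solution sketch, not part of the original exercise text):

    public void copyTo(Attribute target) {
      ((FirstTokenOfSentenceAttribute) target).firstToken = firstToken;
    }

    public boolean equals(Object other) {
      if (other == this) {
        return true;
      }
      if (other instanceof FirstTokenOfSentenceAttribute) {
        return firstToken == ((FirstTokenOfSentenceAttribute) other).firstToken;
      }
      return false;
    }

    public int hashCode() {
      return firstToken ? 1 : 0;
    }

    public String toString() {
      return "firstToken=" + firstToken;
    }
  }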
    diff --git a/src/java/org/apache/lucene/analysis/standard/StandardFilter.java b/src/java/org/apache/lucene/analysis/standard/StandardFilter.java index c596984129a..72ff4bffc23 100644 --- a/src/java/org/apache/lucene/analysis/standard/StandardFilter.java +++ b/src/java/org/apache/lucene/analysis/standard/StandardFilter.java @@ -17,9 +17,11 @@ package org.apache.lucene.analysis.standard; * limitations under the License. */ -import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; /** Normalizes tokens extracted with {@link StandardTokenizer}. */ @@ -29,15 +31,54 @@ public final class StandardFilter extends TokenFilter { /** Construct filtering in. */ public StandardFilter(TokenStream in) { super(in); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); } private static final String APOSTROPHE_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.APOSTROPHE]; private static final String ACRONYM_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]; + // this filters uses attribute type + private TypeAttribute typeAtt; + private TermAttribute termAtt; + /** Returns the next token in the stream, or null at EOS. *

    Removes 's from the end of words. *

    Removes dots from acronyms. */ + public final boolean incrementToken() throws java.io.IOException { + if (!input.incrementToken()) { + return false; + } + + char[] buffer = termAtt.termBuffer(); + final int bufferLength = termAtt.termLength(); + final String type = typeAtt.type(); + + if (type == APOSTROPHE_TYPE && // remove 's + bufferLength >= 2 && + buffer[bufferLength-2] == '\'' && + (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) { + // Strip last 2 characters off + termAtt.setTermLength(bufferLength - 2); + } else if (type == ACRONYM_TYPE) { // remove dots + int upto = 0; + for(int i=0;iRemoves 's from the end of words. + *

    Removes dots from acronyms. + * @deprecated + */ public final Token next(final Token reusableToken) throws java.io.IOException { assert reusableToken != null; Token nextToken = input.next(reusableToken); diff --git a/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java b/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java index c982387a844..035697bc6ec 100644 --- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java +++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java @@ -22,6 +22,10 @@ import java.io.Reader; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; /** A grammar-based tokenizer constructed with JFlex * @@ -84,7 +88,7 @@ public class StandardTokenizer extends Tokenizer { * * @deprecated this should be removed in the next release (3.0). */ - private boolean replaceInvalidAcronym = false; + private boolean replaceInvalidAcronym; void setInput(Reader reader) { this.input = reader; @@ -103,14 +107,13 @@ public class StandardTokenizer extends Tokenizer { return maxTokenLength; } - /** - * Creates a new instance of the {@link StandardTokenizer}. Attaches the - * input to a newly created JFlex scanner. - */ - public StandardTokenizer(Reader input) { - this.input = input; - this.scanner = new StandardTokenizerImpl(input); - } + /** + * Creates a new instance of the {@link StandardTokenizer}. Attaches the + * input to a newly created JFlex scanner. + */ + public StandardTokenizer(Reader input) { + this(input, false); + } /** * Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}. Attaches @@ -125,13 +128,68 @@ public class StandardTokenizer extends Tokenizer { this.replaceInvalidAcronym = replaceInvalidAcronym; this.input = input; this.scanner = new StandardTokenizerImpl(input); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); } + // this tokenizer generates three attributes: + // offset, positionIncrement and type + private TermAttribute termAtt; + private OffsetAttribute offsetAtt; + private PositionIncrementAttribute posIncrAtt; + private TypeAttribute typeAtt; + /* * (non-Javadoc) * * @see org.apache.lucene.analysis.TokenStream#next() */ + public boolean incrementToken() throws IOException { + int posIncr = 1; + + while(true) { + int tokenType = scanner.getNextToken(); + + if (tokenType == StandardTokenizerImpl.YYEOF) { + return false; + } + + if (scanner.yylength() <= maxTokenLength) { + termAtt.clear(); + posIncrAtt.setPositionIncrement(posIncr); + scanner.getText(termAtt); + final int start = scanner.yychar(); + offsetAtt.setStartOffset(start); + offsetAtt.setEndOffset(start+termAtt.termLength()); + // This 'if' should be removed in the next release. For now, it converts + // invalid acronyms to HOST. When removed, only the 'else' part should + // remain. 
+ if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) { + if (replaceInvalidAcronym) { + typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]); + termAtt.setTermLength(termAtt.termLength() - 1); // remove extra '.' + } else { + typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]); + } + } else { + typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]); + } + return true; + } else + // When we skip a too-long term, we still increment the + // position increment + posIncr++; + } + } + + /* + * (non-Javadoc) + * + * @see org.apache.lucene.analysis.TokenStream#next() + */ + /** @deprecated */ public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; int posIncr = 1; diff --git a/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java b/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java index 9f3a86a0d9b..da9ab969774 100644 --- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java +++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java @@ -30,6 +30,7 @@ NOTE: if you change this file and need to regenerate the tokenizer, */ import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** @@ -368,6 +369,13 @@ final void getText(Token t) { t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); } +/** + * Fills TermAttribute with the current token text. + */ +final void getText(TermAttribute t) { + t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); +} + /** * Creates a new scanner diff --git a/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex b/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex index 0103c86b27a..939fd811657 100644 --- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex +++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex @@ -29,6 +29,7 @@ NOTE: if you change StandardTokenizerImpl.jflex and need to regenerate */ import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; %% @@ -69,6 +70,14 @@ public final int yychar() final void getText(Token t) { t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); } + +/** + * Fills TermAttribute with the current token text. + */ +final void getText(TermAttribute t) { + t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); +} + %} THAI = [\u0E00-\u0E59] diff --git a/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java b/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java new file mode 100644 index 00000000000..1ee1a617ad1 --- /dev/null +++ b/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java @@ -0,0 +1,86 @@ +package org.apache.lucene.analysis.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; + +import org.apache.lucene.util.Attribute; + +/** + * This attribute can be used to pass different flags down the tokenizer chain, + * e. g. from one TokenFilter to another one. + * + *
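
(Editorial illustration, not in the patch.) Because every filter in a chain sees the same FlagsAttribute instance, one filter can record a fact about the current token in the flags bitset and a later filter can test for it. The filter name and the flag constant below are invented; the sketch assumes a TokenFilter shares its attributes with the wrapped stream, as the filters changed elsewhere in this patch do:

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    /** Marks all-uppercase tokens so that a later filter in the chain can treat them specially. */
    class UpperCaseMarkerFilter extends TokenFilter {
      static final int ALL_CAPS_FLAG = 1;  // hypothetical flag bit

      private final TermAttribute termAtt;
      private final FlagsAttribute flagsAtt;

      UpperCaseMarkerFilter(TokenStream input) {
        super(input);
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
        flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
      }

      public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) return false;
        char[] buffer = termAtt.termBuffer();
        int length = termAtt.termLength();
        boolean allCaps = length > 0;
        for (int i = 0; i < length; i++) {
          if (!Character.isUpperCase(buffer[i])) { allCaps = false; break; }
        }
        if (allCaps) {
          // OR the bit in, preserving any flags an earlier filter may have set
          flagsAtt.setFlags(flagsAtt.getFlags() | ALL_CAPS_FLAG);
        }
        return true;
      }
    }
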

    + * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. + + */ +public class FlagsAttribute extends Attribute implements Cloneable, Serializable { + private int flags = 0; + + /** + * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long. + *

    + * + * Get the bitset for any bits that have been set. This is completely distinct from {@link TypeAttribute#type()}, although they do share similar purposes. + * The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s. + * + * + * @return The bits + */ + public int getFlags() { + return flags; + } + + /** + * @see #getFlags() + */ + public void setFlags(int flags) { + this.flags = flags; + } + + public void clear() { + flags = 0; + } + + public String toString() { + return "flags=" + flags; + } + + public boolean equals(Object other) { + if (this == other) { + return true; + } + + if (other instanceof FlagsAttribute) { + return ((FlagsAttribute) other).flags == flags; + } + + return false; + } + + public int hashCode() { + return flags; + } + + public void copyTo(Attribute target) { + FlagsAttribute t = (FlagsAttribute) target; + t.setFlags(flags); + } +} diff --git a/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java b/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java new file mode 100644 index 00000000000..d562257787d --- /dev/null +++ b/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java @@ -0,0 +1,98 @@ +package org.apache.lucene.analysis.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; + +import org.apache.lucene.util.Attribute; + +/** + * The start and end character offset of a Token. + * + *
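
(Editorial illustration, not in the patch.) Because endOffset() is exclusive, the two offsets can be used to slice the matching span back out of the original text. A short sketch, assuming WhitespaceTokenizer supports the new API through its tokenizer base class; the class name and sample string are invented:

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

    public class OffsetDemo {
      public static void main(String[] args) throws IOException {
        String text = "offsets point back into the original text";
        TokenStream stream = new WhitespaceTokenizer(new StringReader(text));
        OffsetAttribute offsets = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);
        while (stream.incrementToken()) {
          // endOffset() is exclusive, so substring() recovers the original slice
          System.out.println(text.substring(offsets.startOffset(), offsets.endOffset()));
        }
        stream.close();
      }
    }
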

    + * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. + */ +public class OffsetAttribute extends Attribute implements Cloneable, Serializable { + private int startOffset; + private int endOffset; + + /** Returns this Token's starting offset, the position of the first character + corresponding to this token in the source text. + + Note that the difference between endOffset() and startOffset() may not be + equal to termText.length(), as the term text may have been altered by a + stemmer or some other filter. */ + public int startOffset() { + return startOffset; + } + + /** Set the starting offset. + @see #startOffset() */ + public void setStartOffset(int offset) { + this.startOffset = offset; + } + + /** Returns this Token's ending offset, one greater than the position of the + last character corresponding to this token in the source text. The length + of the token in the source text is (endOffset - startOffset). */ + public int endOffset() { + return endOffset; + } + + /** Set the ending offset. + @see #endOffset() */ + public void setEndOffset(int offset) { + this.endOffset = offset; + } + + public void clear() { + startOffset = 0; + endOffset = 0; + } + + public String toString() { + return "start=" + startOffset + ",end=" + endOffset; + } + + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof OffsetAttribute) { + OffsetAttribute o = (OffsetAttribute) other; + return o.startOffset == startOffset && o.endOffset == endOffset; + } + + return false; + } + + public int hashCode() { + int code = startOffset; + code = code * 31 + endOffset; + return code; + } + + public void copyTo(Attribute target) { + OffsetAttribute t = (OffsetAttribute) target; + t.setStartOffset(startOffset); + t.setEndOffset(endOffset); + } +} diff --git a/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java b/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java new file mode 100644 index 00000000000..8f0a37e03cd --- /dev/null +++ b/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java @@ -0,0 +1,109 @@ +package org.apache.lucene.analysis.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; + +import org.apache.lucene.index.Payload; +import org.apache.lucene.util.Attribute; + +/** + * The payload of a Token. See also {@link Payload}. + * + *

    + * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. + */ +public class PayloadAttribute extends Attribute implements Cloneable, Serializable { + private Payload payload; + + /** + * Initialize this attribute with no payload. + */ + public PayloadAttribute() {} + + /** + * Initialize this attribute with the given payload. + */ + public PayloadAttribute(Payload payload) { + this.payload = payload; + } + + /** + * Returns this Token's payload. + */ + public Payload getPayload() { + return this.payload; + } + + /** + * Sets this Token's payload. + */ + public void setPayload(Payload payload) { + this.payload = payload; + } + + public void clear() { + payload = null; + } + + public String toString() { + if (payload == null) { + return "payload=null"; + } + + return "payload=" + payload.toString(); + } + + public Object clone() { + PayloadAttribute clone = (PayloadAttribute) super.clone(); + if (payload != null) { + clone.payload = (Payload) payload.clone(); + } + return clone; + } + + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof PayloadAttribute) { + PayloadAttribute o = (PayloadAttribute) other; + if (o.payload == null || payload == null) { + return o.payload == null && payload == null; + } + + return o.payload.equals(payload); + } + + return false; + } + + public int hashCode() { + return (payload == null) ? 0 : payload.hashCode(); + } + + public void copyTo(Attribute target) { + PayloadAttribute t = (PayloadAttribute) target; + t.setPayload((payload == null) ? null : (Payload) payload.clone()); + } + + +} diff --git a/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java b/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java new file mode 100644 index 00000000000..50400cc14e5 --- /dev/null +++ b/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java @@ -0,0 +1,106 @@ +package org.apache.lucene.analysis.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.util.Attribute; + +/** The positionIncrement determines the position of this token + * relative to the previous Token in a {@link TokenStream}, used in phrase + * searching. + * + *

    The default value is one. + * + *

    Some common uses for this are:

      + * + *
    • Set it to zero to put multiple terms in the same position. This is + * useful if, e.g., a word has multiple stems. Searches for phrases + * including either stem will match. In this case, all but the first stem's + * increment should be set to zero: the increment of the first instance + * should be one. Repeating a token with an increment of zero can also be + * used to boost the scores of matches on that token. + * + *
    • Set it to values greater than one to inhibit exact phrase matches. + * If, for example, one does not want phrases to match across removed stop + * words, then one could build a stop word filter that removes stop words and + * also sets the increment to the number of stop words removed before each + * non-stop word. Then exact phrase queries will only match when the terms + * occur with no intervening stop words. + * + *
    + * + *
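
(Editorial sketch, not part of the patch.) A consumer turns these per-token increments into absolute positions by accumulating them, much as the query-parsing code later in this patch does: start at -1, add each increment, and an increment of 0 leaves the token on the previous position. The class name below is invented:

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    class PositionPrinter {
      /** Prints each term with the absolute position derived from the increments. */
      static void print(TokenStream stream) throws IOException {
        TermAttribute term = (TermAttribute) stream.addAttribute(TermAttribute.class);
        PositionIncrementAttribute posIncr =
            (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);
        int position = -1;                              // a first increment of 1 yields position 0
        while (stream.incrementToken()) {
          position += posIncr.getPositionIncrement();   // an increment of 0 keeps the position
          System.out.println(position + ": " + term.term());
        }
      }
    }
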

    + * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. + * + * @see org.apache.lucene.index.TermPositions + */ +public class PositionIncrementAttribute extends Attribute implements Cloneable, Serializable { + private int positionIncrement = 1; + + /** Set the position increment. The default value is one. + * + * @param positionIncrement the distance from the prior term + */ + public void setPositionIncrement(int positionIncrement) { + if (positionIncrement < 0) + throw new IllegalArgumentException + ("Increment must be zero or greater: " + positionIncrement); + this.positionIncrement = positionIncrement; + } + + /** Returns the position increment of this Token. + * @see #setPositionIncrement + */ + public int getPositionIncrement() { + return positionIncrement; + } + + public void clear() { + this.positionIncrement = 1; + } + + public String toString() { + return "positionIncrement=" + positionIncrement; + } + + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof PositionIncrementAttribute) { + return positionIncrement == ((PositionIncrementAttribute) other).positionIncrement; + } + + return false; + } + + public int hashCode() { + return positionIncrement; + } + + public void copyTo(Attribute target) { + PositionIncrementAttribute t = (PositionIncrementAttribute) target; + t.setPositionIncrement(positionIncrement); + } + +} diff --git a/src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java b/src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java new file mode 100644 index 00000000000..8aa05071967 --- /dev/null +++ b/src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java @@ -0,0 +1,242 @@ +package org.apache.lucene.analysis.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Attribute; + +/** + * The term text of a Token. + * + *
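
(Editorial sketch, not part of the patch.) Filters that rewrite the term text can work directly on the shared char[] buffer instead of allocating a String per token. A trivial, invented lower-casing filter, assuming a TokenFilter shares its attributes with its input:

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    /** Lower-cases each token in place via the shared term buffer; no new String is created. */
    class SimpleLowerCaseFilter extends TokenFilter {
      private final TermAttribute termAtt;

      SimpleLowerCaseFilter(TokenStream input) {
        super(input);
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
      }

      public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) return false;
        char[] buffer = termAtt.termBuffer();
        int length = termAtt.termLength();
        for (int i = 0; i < length; i++) {
          buffer[i] = Character.toLowerCase(buffer[i]);
        }
        return true;
      }
    }
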

    + * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. + */ +public class TermAttribute extends Attribute implements Cloneable, Serializable { + private static int MIN_BUFFER_SIZE = 10; + + private char[] termBuffer; + private int termLength; + + /** Returns the Token's term text. + * + * This method has a performance penalty + * because the text is stored internally in a char[]. If + * possible, use {@link #termBuffer()} and {@link + * #termLength()} directly instead. If you really need a + * String, use this method, which is nothing more than + * a convenience call to new String(token.termBuffer(), 0, token.termLength()) + */ + public String term() { + initTermBuffer(); + return new String(termBuffer, 0, termLength); + } + + /** Copies the contents of buffer, starting at offset for + * length characters, into the termBuffer array. + * @param buffer the buffer to copy + * @param offset the index in the buffer of the first character to copy + * @param length the number of characters to copy + */ + public void setTermBuffer(char[] buffer, int offset, int length) { + char[] newCharBuffer = growTermBuffer(length); + if (newCharBuffer != null) { + termBuffer = newCharBuffer; + } + System.arraycopy(buffer, offset, termBuffer, 0, length); + termLength = length; + } + + /** Copies the contents of buffer into the termBuffer array. + * @param buffer the buffer to copy + */ + public void setTermBuffer(String buffer) { + int length = buffer.length(); + char[] newCharBuffer = growTermBuffer(length); + if (newCharBuffer != null) { + termBuffer = newCharBuffer; + } + buffer.getChars(0, length, termBuffer, 0); + termLength = length; + } + + /** Copies the contents of buffer, starting at offset and continuing + * for length characters, into the termBuffer array. + * @param buffer the buffer to copy + * @param offset the index in the buffer of the first character to copy + * @param length the number of characters to copy + */ + public void setTermBuffer(String buffer, int offset, int length) { + assert offset <= buffer.length(); + assert offset + length <= buffer.length(); + char[] newCharBuffer = growTermBuffer(length); + if (newCharBuffer != null) { + termBuffer = newCharBuffer; + } + buffer.getChars(offset, offset + length, termBuffer, 0); + termLength = length; + } + + /** Returns the internal termBuffer character array which + * you can then directly alter. If the array is too + * small for your token, use {@link + * #resizeTermBuffer(int)} to increase it. After + * altering the buffer be sure to call {@link + * #setTermLength} to record the number of valid + * characters that were placed into the termBuffer. */ + public char[] termBuffer() { + initTermBuffer(); + return termBuffer; + } + + /** Grows the termBuffer to at least size newSize, preserving the + * existing content. Note: If the next operation is to change + * the contents of the term buffer use + * {@link #setTermBuffer(char[], int, int)}, + * {@link #setTermBuffer(String)}, or + * {@link #setTermBuffer(String, int, int)} + * to optimally combine the resize with the setting of the termBuffer. 
+ * @param newSize minimum size of the new termBuffer + * @return newly created termBuffer with length >= newSize + */ + public char[] resizeTermBuffer(int newSize) { + char[] newCharBuffer = growTermBuffer(newSize); + if (termBuffer == null) { + // If there were termText, then preserve it. + // note that if termBuffer is null then newCharBuffer cannot be null + assert newCharBuffer != null; + termBuffer = newCharBuffer; + } else if (newCharBuffer != null) { + // Note: if newCharBuffer != null then termBuffer needs to grow. + // If there were a termBuffer, then preserve it + System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length); + termBuffer = newCharBuffer; + } + return termBuffer; + } + + /** Allocates a buffer char[] of at least newSize + * @param newSize minimum size of the buffer + * @return newly created buffer with length >= newSize or null if the current termBuffer is big enough + */ + private char[] growTermBuffer(int newSize) { + if (termBuffer != null) { + if (termBuffer.length >= newSize) + // Already big enough + return null; + else + // Not big enough; create a new array with slight + // over allocation: + return new char[ArrayUtil.getNextSize(newSize)]; + } else { + + // determine the best size + // The buffer is always at least MIN_BUFFER_SIZE + if (newSize < MIN_BUFFER_SIZE) { + newSize = MIN_BUFFER_SIZE; + } + + return new char[newSize]; + } + } + + // TODO: once we remove the deprecated termText() method + // and switch entirely to char[] termBuffer we don't need + // to use this method anymore + private void initTermBuffer() { + if (termBuffer == null) { + termBuffer = new char[MIN_BUFFER_SIZE]; + termLength = 0; + } + } + + /** Return number of valid characters (length of the term) + * in the termBuffer array. */ + public int termLength() { + initTermBuffer(); + return termLength; + } + + /** Set number of valid characters (length of the term) in + * the termBuffer array. Use this to truncate the termBuffer + * or to synchronize with external manipulation of the termBuffer. + * Note: to grow the size of the array, + * use {@link #resizeTermBuffer(int)} first. + * @param length the truncated length + */ + public void setTermLength(int length) { + initTermBuffer(); + if (length > termBuffer.length) + throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")"); + termLength = length; + } + + public int hashCode() { + initTermBuffer(); + int code = termLength; + code = code * 31 + ArrayUtil.hashCode(termBuffer, 0, termLength); + return code; + } + + public void clear() { + termLength = 0; + } + + public Object clone() { + TermAttribute t = (TermAttribute)super.clone(); + // Do a deep clone + if (termBuffer != null) { + t.termBuffer = (char[]) termBuffer.clone(); + } + return t; + } + + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof TermAttribute) { + initTermBuffer(); + TermAttribute o = ((TermAttribute) other); + o.initTermBuffer(); + + for(int i=0;i + * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. 
+ */ +public class TypeAttribute extends Attribute implements Cloneable, Serializable { + private String type; + public static final String DEFAULT_TYPE = "word"; + + public TypeAttribute() { + this(DEFAULT_TYPE); + } + + public TypeAttribute(String type) { + this.type = type; + } + + /** Returns this Token's lexical type. Defaults to "word". */ + public String type() { + return type; + } + + /** Set the lexical type. + @see #type() */ + public void setType(String type) { + this.type = type; + } + + public void clear() { + type = DEFAULT_TYPE; + } + + public String toString() { + return "type=" + type; + } + + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof TypeAttribute) { + return type.equals(((TypeAttribute) other).type); + } + + return false; + } + + public int hashCode() { + return type.hashCode(); + } + + public void copyTo(Attribute target) { + TypeAttribute t = (TypeAttribute) target; + t.setType(new String(type)); + } +} diff --git a/src/java/org/apache/lucene/index/DocInverter.java b/src/java/org/apache/lucene/index/DocInverter.java index f5e57e47123..85eed3fdf6d 100644 --- a/src/java/org/apache/lucene/index/DocInverter.java +++ b/src/java/org/apache/lucene/index/DocInverter.java @@ -17,12 +17,14 @@ package org.apache.lucene.index; * limitations under the License. */ -import java.util.Map; +import java.io.IOException; +import java.util.Collection; import java.util.HashMap; import java.util.HashSet; -import java.util.Collection; import java.util.Iterator; -import java.io.IOException; +import java.util.Map; + +import org.apache.lucene.util.AttributeSource; /** This is a DocFieldConsumer that inverts each field, * separately, from a Document, and accepts a diff --git a/src/java/org/apache/lucene/index/DocInverterPerField.java b/src/java/org/apache/lucene/index/DocInverterPerField.java index 20f358367d7..454090f97ea 100644 --- a/src/java/org/apache/lucene/index/DocInverterPerField.java +++ b/src/java/org/apache/lucene/index/DocInverterPerField.java @@ -22,6 +22,8 @@ import java.io.Reader; import org.apache.lucene.document.Fieldable; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; /** * Holds state for inverting all occurrences of a single @@ -79,10 +81,14 @@ final class DocInverterPerField extends DocFieldConsumerPerField { if (!field.isTokenized()) { // un-tokenized field String stringValue = field.stringValue(); final int valueLength = stringValue.length(); - Token token = perThread.localToken.reinit(stringValue, 0, valueLength); + perThread.singleTokenTokenStream.reinit(stringValue, 0, valueLength); + fieldState.attributeSource = perThread.singleTokenTokenStream; + perThread.localTokenStream.reset(); + consumer.start(field); + boolean success = false; try { - consumer.add(token); + consumer.add(); success = true; } finally { if (!success) @@ -122,7 +128,22 @@ final class DocInverterPerField extends DocFieldConsumerPerField { try { int offsetEnd = fieldState.offset-1; - final Token localToken = perThread.localToken; + + boolean useNewTokenStreamAPI = stream.useNewAPI(); + Token localToken = null; + + if (useNewTokenStreamAPI) { + fieldState.attributeSource = stream; + } else { + fieldState.attributeSource = perThread.localTokenStream; + localToken = perThread.localToken; + } + + consumer.start(field); + + OffsetAttribute offsetAttribute = 
(OffsetAttribute) fieldState.attributeSource.addAttribute(OffsetAttribute.class); + PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class); + for(;;) { // If we hit an exception in stream.next below @@ -131,10 +152,16 @@ final class DocInverterPerField extends DocFieldConsumerPerField { // non-aborting and (above) this one document // will be marked as deleted, but still // consume a docID - Token token = stream.next(localToken); - - if (token == null) break; - final int posIncr = token.getPositionIncrement(); + Token token = null; + if (useNewTokenStreamAPI) { + if (!stream.incrementToken()) break; + } else { + token = stream.next(localToken); + if (token == null) break; + perThread.localTokenStream.set(token); + } + + final int posIncr = posIncrAttribute.getPositionIncrement(); fieldState.position += posIncr - 1; if (posIncr == 0) fieldState.numOverlap++; @@ -147,14 +174,14 @@ final class DocInverterPerField extends DocFieldConsumerPerField { // internal state of the consumer is now // corrupt and should not be flushed to a // new segment: - consumer.add(token); + consumer.add(); success = true; } finally { if (!success) docState.docWriter.setAborting(); } fieldState.position++; - offsetEnd = fieldState.offset + token.endOffset(); + offsetEnd = fieldState.offset + offsetAttribute.endOffset(); if (++fieldState.length >= maxFieldLength) { if (docState.infoStream != null) docState.infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens"); diff --git a/src/java/org/apache/lucene/index/DocInverterPerThread.java b/src/java/org/apache/lucene/index/DocInverterPerThread.java index 1b80286c66c..19cf39311cc 100644 --- a/src/java/org/apache/lucene/index/DocInverterPerThread.java +++ b/src/java/org/apache/lucene/index/DocInverterPerThread.java @@ -20,6 +20,14 @@ package org.apache.lucene.index; import java.io.IOException; import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.util.Attribute; /** This is a DocFieldConsumer that inverts each field, * separately, from a Document, and accepts a @@ -30,6 +38,94 @@ final class DocInverterPerThread extends DocFieldConsumerPerThread { final InvertedDocConsumerPerThread consumer; final InvertedDocEndConsumerPerThread endConsumer; final Token localToken = new Token(); + //TODO: change to SingleTokenTokenStream after Token was removed + final SingleTokenTokenStream singleTokenTokenStream = new SingleTokenTokenStream(); + final BackwardsCompatibilityStream localTokenStream = new BackwardsCompatibilityStream(); + + static class SingleTokenTokenStream extends TokenStream { + TermAttribute termAttribute; + OffsetAttribute offsetAttribute; + + SingleTokenTokenStream() { + termAttribute = (TermAttribute) addAttribute(TermAttribute.class); + offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class); + } + + public void reinit(String stringValue, int startOffset, int endOffset) { + termAttribute.setTermBuffer(stringValue); + 
offsetAttribute.setStartOffset(startOffset); + offsetAttribute.setEndOffset(endOffset); + } + } + + /** This stream wrapper is only used to maintain backwards compatibility with the + * old TokenStream API and can be removed in Lucene 3.0 + * @deprecated + */ + static class BackwardsCompatibilityStream extends TokenStream { + private Token token; + + TermAttribute termAttribute = new TermAttribute() { + public String term() { + return token.term(); + } + + public char[] termBuffer() { + return token.termBuffer(); + } + + public int termLength() { + return token.termLength(); + } + }; + OffsetAttribute offsetAttribute = new OffsetAttribute() { + public int startOffset() { + return token.startOffset(); + } + + public int endOffset() { + return token.endOffset(); + } + }; + + PositionIncrementAttribute positionIncrementAttribute = new PositionIncrementAttribute() { + public int getPositionIncrement() { + return token.getPositionIncrement(); + } + }; + + FlagsAttribute flagsAttribute = new FlagsAttribute() { + public int getFlags() { + return token.getFlags(); + } + }; + + PayloadAttribute payloadAttribute = new PayloadAttribute() { + public Payload getPayload() { + return token.getPayload(); + } + }; + + TypeAttribute typeAttribute = new TypeAttribute() { + public String type() { + return token.type(); + } + }; + + BackwardsCompatibilityStream() { + attributes.put(TermAttribute.class, termAttribute); + attributes.put(OffsetAttribute.class, offsetAttribute); + attributes.put(PositionIncrementAttribute.class, positionIncrementAttribute); + attributes.put(FlagsAttribute.class, flagsAttribute); + attributes.put(PayloadAttribute.class, payloadAttribute); + attributes.put(TypeAttribute.class, typeAttribute); + } + + public void set(Token token) { + this.token = token; + } + }; + final DocumentsWriter.DocState docState; final FieldInvertState fieldState = new FieldInvertState(); diff --git a/src/java/org/apache/lucene/index/FieldInvertState.java b/src/java/org/apache/lucene/index/FieldInvertState.java index c10455d59d2..5929469c7bd 100644 --- a/src/java/org/apache/lucene/index/FieldInvertState.java +++ b/src/java/org/apache/lucene/index/FieldInvertState.java @@ -17,6 +17,7 @@ package org.apache.lucene.index; import org.apache.lucene.search.Similarity; +import org.apache.lucene.util.AttributeSource; /** * This class tracks the number and position / offset parameters of terms @@ -32,6 +33,7 @@ public final class FieldInvertState { int numOverlap; int offset; float boost; + AttributeSource attributeSource; public FieldInvertState() { } @@ -54,6 +56,7 @@ public final class FieldInvertState { numOverlap = 0; offset = 0; boost = docBoost; + attributeSource = null; } /** @@ -97,4 +100,8 @@ public final class FieldInvertState { public float getBoost() { return boost; } + + public AttributeSource getAttributeSource() { + return attributeSource; + } } diff --git a/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java b/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java index 151338b138e..11845f16175 100644 --- a/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java +++ b/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java @@ -19,7 +19,7 @@ package org.apache.lucene.index; import java.io.IOException; import org.apache.lucene.document.Fieldable; -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; // TODO: break into separate freq and prox writers as // codecs; make separate container (tii/tis/skip/*) 
that can @@ -32,6 +32,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem final DocumentsWriter.DocState docState; final FieldInvertState fieldState; boolean omitTf; + PayloadAttribute payloadAttribute; public FreqProxTermsWriterPerField(TermsHashPerField termsHashPerField, FreqProxTermsWriterPerThread perThread, FieldInfo fieldInfo) { this.termsHashPerField = termsHashPerField; @@ -53,7 +54,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem boolean hasPayloads; - void skippingLongTerm(Token t) throws IOException {} + void skippingLongTerm() throws IOException {} public int compareTo(Object other0) { FreqProxTermsWriterPerField other = (FreqProxTermsWriterPerField) other0; @@ -64,6 +65,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem // Record, up front, whether our in-RAM format will be // with or without term freqs: omitTf = fieldInfo.omitTf; + payloadAttribute = null; } boolean start(Fieldable[] fields, int count) { @@ -72,9 +74,23 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem return true; return false; } + + void start(Fieldable f) { + if (fieldState.attributeSource.hasAttribute(PayloadAttribute.class)) { + payloadAttribute = (PayloadAttribute) fieldState.attributeSource.getAttribute(PayloadAttribute.class); + } else { + payloadAttribute = null; + } + } - final void writeProx(Token t, FreqProxTermsWriter.PostingList p, int proxCode) { - final Payload payload = t.getPayload(); + final void writeProx(FreqProxTermsWriter.PostingList p, int proxCode) { + final Payload payload; + if (payloadAttribute == null) { + payload = null; + } else { + payload = payloadAttribute.getPayload(); + } + if (payload != null && payload.length > 0) { termsHashPerField.writeVInt(1, (proxCode<<1)|1); termsHashPerField.writeVInt(1, payload.length); @@ -85,7 +101,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem p.lastPosition = fieldState.position; } - final void newTerm(Token t, RawPostingList p0) { + final void newTerm(RawPostingList p0) { // First time we're seeing this term since the last // flush assert docState.testPoint("FreqProxTermsWriterPerField.newTerm start"); @@ -96,11 +112,11 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem } else { p.lastDocCode = docState.docID << 1; p.docFreq = 1; - writeProx(t, p, fieldState.position); + writeProx(p, fieldState.position); } } - final void addTerm(Token t, RawPostingList p0) { + final void addTerm(RawPostingList p0) { assert docState.testPoint("FreqProxTermsWriterPerField.addTerm start"); @@ -132,10 +148,10 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem p.docFreq = 1; p.lastDocCode = (docState.docID - p.lastDocID) << 1; p.lastDocID = docState.docID; - writeProx(t, p, fieldState.position); + writeProx(p, fieldState.position); } else { p.docFreq++; - writeProx(t, p, fieldState.position-p.lastPosition); + writeProx(p, fieldState.position-p.lastPosition); } } } diff --git a/src/java/org/apache/lucene/index/InvertedDocConsumerPerField.java b/src/java/org/apache/lucene/index/InvertedDocConsumerPerField.java index f195148f43a..cb7a333d878 100644 --- a/src/java/org/apache/lucene/index/InvertedDocConsumerPerField.java +++ b/src/java/org/apache/lucene/index/InvertedDocConsumerPerField.java @@ -17,10 +17,10 @@ package org.apache.lucene.index; * limitations under the License. 
*/ -import org.apache.lucene.document.Fieldable; -import org.apache.lucene.analysis.Token; import java.io.IOException; +import org.apache.lucene.document.Fieldable; + abstract class InvertedDocConsumerPerField { // Called once per field, and is given all Fieldable @@ -29,8 +29,11 @@ abstract class InvertedDocConsumerPerField { // fields: abstract boolean start(Fieldable[] fields, int count) throws IOException; + // Called before a field instance is being processed + abstract void start(Fieldable field); + // Called once per inverted token - abstract void add(Token token) throws IOException; + abstract void add() throws IOException; // Called once per field per document, after all Fieldable // occurrences are inverted diff --git a/src/java/org/apache/lucene/index/Payload.java b/src/java/org/apache/lucene/index/Payload.java index 8e6d4302911..e3585c0d612 100644 --- a/src/java/org/apache/lucene/index/Payload.java +++ b/src/java/org/apache/lucene/index/Payload.java @@ -19,7 +19,6 @@ package org.apache.lucene.index; import java.io.Serializable; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.util.ArrayUtil; @@ -29,7 +28,7 @@ import org.apache.lucene.util.ArrayUtil; * specific term. *

    * To store payloads in the index a {@link TokenStream} has to be used that - * produces {@link Token}s containing payload data. + * produces payload data. *

    * Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)} * to retrieve the payloads from the index.
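
(Editorial sketch, not part of the patch.) Under the new API, "producing payload data" means setting a PayloadAttribute before returning from incrementToken(). The filter below is invented and the byte encoding is only illustrative; it assumes the Payload(byte[]) constructor and that a TokenFilter shares attributes with its input:

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
    import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
    import org.apache.lucene.index.Payload;

    /** Stores each token's lexical type as that token's payload. */
    class TypePayloadFilter extends TokenFilter {
      private final TypeAttribute typeAtt;
      private final PayloadAttribute payloadAtt;

      TypePayloadFilter(TokenStream input) {
        super(input);
        typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
        payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
      }

      public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) return false;
        // platform default charset is good enough for a sketch; real code should pick one explicitly
        payloadAtt.setPayload(new Payload(typeAtt.type().getBytes()));
        return true;
      }
    }
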
    diff --git a/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java b/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java index 9b61f5ff6ad..c7bab18633e 100644 --- a/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java +++ b/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java @@ -18,10 +18,11 @@ package org.apache.lucene.index; */ import java.io.IOException; -import org.apache.lucene.util.UnicodeUtil; -import org.apache.lucene.analysis.Token; + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.document.Fieldable; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.UnicodeUtil; final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField { @@ -37,7 +38,8 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField { boolean doVectorOffsets; int maxNumPostings; - + OffsetAttribute offsetAttribute = null; + public TermVectorsTermsWriterPerField(TermsHashPerField termsHashPerField, TermVectorsTermsWriterPerThread perThread, FieldInfo fieldInfo) { this.termsHashPerField = termsHashPerField; this.perThread = perThread; @@ -191,8 +193,16 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField { termsHashPerField.shrinkHash(maxNumPostings); maxNumPostings = 0; } + + void start(Fieldable f) { + if (doVectorOffsets && fieldState.attributeSource.hasAttribute(OffsetAttribute.class)) { + offsetAttribute = (OffsetAttribute) fieldState.attributeSource.getAttribute(OffsetAttribute.class); + } else { + offsetAttribute = null; + } + } - void newTerm(Token t, RawPostingList p0) { + void newTerm(RawPostingList p0) { assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start"); @@ -201,8 +211,9 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField { p.freq = 1; if (doVectorOffsets) { - final int startOffset = fieldState.offset + t.startOffset(); - final int endOffset = fieldState.offset + t.endOffset(); + int startOffset = fieldState.offset + offsetAttribute.startOffset();; + int endOffset = fieldState.offset + offsetAttribute.endOffset(); + termsHashPerField.writeVInt(1, startOffset); termsHashPerField.writeVInt(1, endOffset - startOffset); p.lastOffset = endOffset; @@ -214,7 +225,7 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField { } } - void addTerm(Token t, RawPostingList p0) { + void addTerm(RawPostingList p0) { assert docState.testPoint("TermVectorsTermsWriterPerField.addTerm start"); @@ -222,8 +233,9 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField { p.freq++; if (doVectorOffsets) { - final int startOffset = fieldState.offset + t.startOffset(); - final int endOffset = fieldState.offset + t.endOffset(); + int startOffset = fieldState.offset + offsetAttribute.startOffset();; + int endOffset = fieldState.offset + offsetAttribute.endOffset(); + termsHashPerField.writeVInt(1, startOffset - p.lastOffset); termsHashPerField.writeVInt(1, endOffset - startOffset); p.lastOffset = endOffset; @@ -235,5 +247,5 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField { } } - void skippingLongTerm(Token t) {} + void skippingLongTerm() {} } diff --git a/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java b/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java index 2c716418f44..a7ad15c2ea1 100644 --- a/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java +++ 
b/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java @@ -23,14 +23,15 @@ package org.apache.lucene.index; * multiple streams for each unique Token. */ import java.io.IOException; + import org.apache.lucene.document.Fieldable; -import org.apache.lucene.analysis.Token; abstract class TermsHashConsumerPerField { abstract boolean start(Fieldable[] fields, int count) throws IOException; abstract void finish() throws IOException; - abstract void skippingLongTerm(Token t) throws IOException; - abstract void newTerm(Token t, RawPostingList p) throws IOException; - abstract void addTerm(Token t, RawPostingList p) throws IOException; + abstract void skippingLongTerm() throws IOException; + abstract void start(Fieldable field); + abstract void newTerm(RawPostingList p) throws IOException; + abstract void addTerm(RawPostingList p) throws IOException; abstract int getStreamCount(); } diff --git a/src/java/org/apache/lucene/index/TermsHashPerField.java b/src/java/org/apache/lucene/index/TermsHashPerField.java index bd87b05f354..27b550cd6c3 100644 --- a/src/java/org/apache/lucene/index/TermsHashPerField.java +++ b/src/java/org/apache/lucene/index/TermsHashPerField.java @@ -20,8 +20,8 @@ package org.apache.lucene.index; import java.io.IOException; import java.util.Arrays; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Fieldable; -import org.apache.lucene.analysis.Token; import org.apache.lucene.util.UnicodeUtil; final class TermsHashPerField extends InvertedDocConsumerPerField { @@ -31,7 +31,8 @@ final class TermsHashPerField extends InvertedDocConsumerPerField { final TermsHashPerThread perThread; final DocumentsWriter.DocState docState; final FieldInvertState fieldState; - + TermAttribute termAtt; + // Copied from our perThread final CharBlockPool charPool; final IntBlockPool intPool; @@ -49,7 +50,7 @@ final class TermsHashPerField extends InvertedDocConsumerPerField { private int postingsHashMask = postingsHashSize-1; private RawPostingList[] postingsHash = new RawPostingList[postingsHashSize]; private RawPostingList p; - + public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) { this.perThread = perThread; intPool = perThread.intPool; @@ -247,6 +248,14 @@ final class TermsHashPerField extends InvertedDocConsumerPerField { private boolean doCall; private boolean doNextCall; + void start(Fieldable f) { + termAtt = (TermAttribute) fieldState.attributeSource.getAttribute(TermAttribute.class); + consumer.start(f); + if (nextPerField != null) { + nextPerField.start(f); + } + } + boolean start(Fieldable[] fields, int count) throws IOException { doCall = consumer.start(fields, count); if (nextPerField != null) @@ -257,7 +266,7 @@ final class TermsHashPerField extends InvertedDocConsumerPerField { // Secondary entry point (for 2nd & subsequent TermsHash), // because token text has already been "interned" into // textStart, so we hash by textStart - public void add(Token token, int textStart) throws IOException { + public void add(int textStart) throws IOException { int code = textStart; @@ -320,17 +329,17 @@ final class TermsHashPerField extends InvertedDocConsumerPerField { } p.byteStart = intUptos[intUptoStart]; - consumer.newTerm(token, p); + consumer.newTerm(p); } else { intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT]; intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK; - consumer.addTerm(token, 
p); + consumer.addTerm(p); } } // Primary entry point (for first TermsHash) - void add(Token token) throws IOException { + void add() throws IOException { assert !postingsCompacted; @@ -338,8 +347,8 @@ final class TermsHashPerField extends InvertedDocConsumerPerField { // term text into textStart address // Get the text of this term. - final char[] tokenText = token.termBuffer(); - final int tokenTextLen = token.termLength(); + final char[] tokenText = termAtt.termBuffer();; + final int tokenTextLen = termAtt.termLength(); // Compute hashcode & replace any invalid UTF16 sequences int downto = tokenTextLen; @@ -403,7 +412,7 @@ final class TermsHashPerField extends InvertedDocConsumerPerField { if (docState.maxTermPrefix == null) docState.maxTermPrefix = new String(tokenText, 0, 30); - consumer.skippingLongTerm(token); + consumer.skippingLongTerm(); return; } charPool.nextBuffer(); @@ -450,16 +459,16 @@ final class TermsHashPerField extends InvertedDocConsumerPerField { } p.byteStart = intUptos[intUptoStart]; - consumer.newTerm(token, p); + consumer.newTerm(p); } else { intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT]; intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK; - consumer.addTerm(token, p); + consumer.addTerm(p); } if (doNextCall) - nextPerField.add(token, p.textStart); + nextPerField.add(p.textStart); } int[] intUptos; diff --git a/src/java/org/apache/lucene/queryParser/QueryParser.java b/src/java/org/apache/lucene/queryParser/QueryParser.java index e17c6b56f52..c16ae226378 100644 --- a/src/java/org/apache/lucene/queryParser/QueryParser.java +++ b/src/java/org/apache/lucene/queryParser/QueryParser.java @@ -3,8 +3,8 @@ package org.apache.lucene.queryParser; import java.io.IOException; import java.io.StringReader; -import java.text.DateFormat; import java.text.Collator; +import java.text.DateFormat; import java.util.ArrayList; import java.util.Calendar; import java.util.Date; @@ -15,7 +15,10 @@ import java.util.Map; import java.util.Vector; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.DateField; import org.apache.lucene.document.DateTools; import org.apache.lucene.index.Term; @@ -518,48 +521,126 @@ public class QueryParser implements QueryParserConstants { // PhraseQuery, or nothing based on the term count TokenStream source = analyzer.tokenStream(field, new StringReader(queryText)); - List list = new ArrayList(); - final org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token(); - org.apache.lucene.analysis.Token nextToken; + CachingTokenFilter buffer = new CachingTokenFilter(source); + TermAttribute termAtt = null; + PositionIncrementAttribute posIncrAtt = null; + int numTokens = 0; + + org.apache.lucene.analysis.Token reusableToken = null; + org.apache.lucene.analysis.Token nextToken = null; + + + boolean useNewAPI = TokenStream.useNewAPIDefault(); + + if (useNewAPI) { + boolean success = false; + try { + buffer.reset(); + success = true; + } catch (IOException e) { + // success==false if we hit an exception + } + if (success) { + if (buffer.hasAttribute(TermAttribute.class)) { + termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class); + } + if (buffer.hasAttribute(PositionIncrementAttribute.class)) { + posIncrAtt = 
(PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class); + } + } + } else { + reusableToken = new org.apache.lucene.analysis.Token(); + } + int positionCount = 0; boolean severalTokensAtSamePosition = false; - while (true) { - try { - nextToken = source.next(reusableToken); + if (useNewAPI) { + if (termAtt != null) { + try { + while (buffer.incrementToken()) { + numTokens++; + int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1; + if (positionIncrement != 0) { + positionCount += positionIncrement; + } else { + severalTokensAtSamePosition = true; + } + } + } catch (IOException e) { + // ignore + } } - catch (IOException e) { - nextToken = null; + } else { + while (true) { + try { + nextToken = buffer.next(reusableToken); + } + catch (IOException e) { + nextToken = null; + } + if (nextToken == null) + break; + numTokens++; + if (nextToken.getPositionIncrement() != 0) + positionCount += nextToken.getPositionIncrement(); + else + severalTokensAtSamePosition = true; } - if (nextToken == null) - break; - list.add(nextToken.clone()); - if (nextToken.getPositionIncrement() != 0) - positionCount += nextToken.getPositionIncrement(); - else - severalTokensAtSamePosition = true; } try { + // rewind the buffer stream + buffer.reset(); + + // close original stream - all tokens buffered source.close(); } catch (IOException e) { // ignore } - if (list.size() == 0) + if (numTokens == 0) return null; - else if (list.size() == 1) { - nextToken = (org.apache.lucene.analysis.Token) list.get(0); - return newTermQuery(new Term(field, nextToken.term())); + else if (numTokens == 1) { + String term = null; + try { + + if (useNewAPI) { + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + term = termAtt.term(); + } else { + nextToken = buffer.next(reusableToken); + assert nextToken != null; + term = nextToken.term(); + } + } catch (IOException e) { + // safe to ignore, because we know the number of tokens + } + return newTermQuery(new Term(field, term)); } else { if (severalTokensAtSamePosition) { if (positionCount == 1) { // no phrase query: BooleanQuery q = newBooleanQuery(true); - for (int i = 0; i < list.size(); i++) { - nextToken = (org.apache.lucene.analysis.Token) list.get(i); + for (int i = 0; i < numTokens; i++) { + String term = null; + try { + if (useNewAPI) { + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + term = termAtt.term(); + } else { + nextToken = buffer.next(reusableToken); + assert nextToken != null; + term = nextToken.term(); + } + } catch (IOException e) { + // safe to ignore, because we know the number of tokens + } + Query currentQuery = newTermQuery( - new Term(field, nextToken.term())); + new Term(field, term)); q.add(currentQuery, BooleanClause.Occur.SHOULD); } return q; @@ -570,9 +651,28 @@ public class QueryParser implements QueryParserConstants { mpq.setSlop(phraseSlop); List multiTerms = new ArrayList(); int position = -1; - for (int i = 0; i < list.size(); i++) { - nextToken = (org.apache.lucene.analysis.Token) list.get(i); - if (nextToken.getPositionIncrement() > 0 && multiTerms.size() > 0) { + for (int i = 0; i < numTokens; i++) { + String term = null; + int positionIncrement = 1; + try { + if (useNewAPI) { + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + term = termAtt.term(); + if (posIncrAtt != null) { + positionIncrement = posIncrAtt.getPositionIncrement(); + } + } else { + nextToken = buffer.next(reusableToken); + assert nextToken != null; + term = 
nextToken.term(); + positionIncrement = nextToken.getPositionIncrement(); + } + } catch (IOException e) { + // safe to ignore, because we know the number of tokens + } + + if (positionIncrement > 0 && multiTerms.size() > 0) { if (enablePositionIncrements) { mpq.add((Term[])multiTerms.toArray(new Term[0]),position); } else { @@ -580,8 +680,8 @@ public class QueryParser implements QueryParserConstants { } multiTerms.clear(); } - position += nextToken.getPositionIncrement(); - multiTerms.add(new Term(field, nextToken.term())); + position += positionIncrement; + multiTerms.add(new Term(field, term)); } if (enablePositionIncrements) { mpq.add((Term[])multiTerms.toArray(new Term[0]),position); @@ -595,13 +695,36 @@ public class QueryParser implements QueryParserConstants { PhraseQuery pq = newPhraseQuery(); pq.setSlop(phraseSlop); int position = -1; - for (int i = 0; i < list.size(); i++) { - nextToken = (org.apache.lucene.analysis.Token) list.get(i); + + + for (int i = 0; i < numTokens; i++) { + String term = null; + int positionIncrement = 1; + + try { + if (useNewAPI) { + + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + term = termAtt.term(); + if (posIncrAtt != null) { + positionIncrement = posIncrAtt.getPositionIncrement(); + } + } else { + nextToken = buffer.next(reusableToken); + assert nextToken != null; + term = nextToken.term(); + positionIncrement = nextToken.getPositionIncrement(); + } + } catch (IOException e) { + // safe to ignore, because we know the number of tokens + } + if (enablePositionIncrements) { - position += nextToken.getPositionIncrement(); - pq.add(new Term(field, nextToken.term()),position); + position += positionIncrement; + pq.add(new Term(field, term),position); } else { - pq.add(new Term(field, nextToken.term())); + pq.add(new Term(field, term)); } } return pq; @@ -610,6 +733,7 @@ public class QueryParser implements QueryParserConstants { } + /** * Base implementation delegates to {@link #getFieldQuery(String,String)}. * This method may be overridden, for example, to return @@ -1503,12 +1627,6 @@ public class QueryParser implements QueryParserConstants { finally { jj_save(0, xla); } } - private boolean jj_3R_3() { - if (jj_scan_token(STAR)) return true; - if (jj_scan_token(COLON)) return true; - return false; - } - private boolean jj_3R_2() { if (jj_scan_token(TERM)) return true; if (jj_scan_token(COLON)) return true; @@ -1525,6 +1643,12 @@ public class QueryParser implements QueryParserConstants { return false; } + private boolean jj_3R_3() { + if (jj_scan_token(STAR)) return true; + if (jj_scan_token(COLON)) return true; + return false; + } + /** Generated Token Manager. */ public QueryParserTokenManager token_source; /** Current token. 
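
(Editorial note, not part of the patch.) The pattern used above (wrap the stream in a CachingTokenFilter, count tokens and positions in a first pass, then reset() and replay the cached tokens to build the query) also works outside the query parser. A stripped-down sketch with an invented class name, assuming the new API is in effect and using the same hasAttribute()/getAttribute() guard as the parser:

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.lucene.analysis.CachingTokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    class TwoPassTerms {
      /** Returns the terms of the stream, consuming the cached tokens twice. */
      static List collect(TokenStream source) throws IOException {
        CachingTokenFilter buffer = new CachingTokenFilter(source);
        if (!buffer.hasAttribute(TermAttribute.class)) {
          return new ArrayList();                 // stream produces no term text
        }
        TermAttribute termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class);

        int numTokens = 0;
        while (buffer.incrementToken()) {         // first pass fills the cache
          numTokens++;
        }

        buffer.reset();                           // rewind to the start of the cached tokens
        List terms = new ArrayList();
        for (int i = 0; i < numTokens; i++) {
          buffer.incrementToken();
          terms.add(termAtt.term());
        }
        return terms;
      }
    }
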
*/ diff --git a/src/java/org/apache/lucene/queryParser/QueryParser.jj b/src/java/org/apache/lucene/queryParser/QueryParser.jj index 06cf094f7c0..059851c4e8b 100644 --- a/src/java/org/apache/lucene/queryParser/QueryParser.jj +++ b/src/java/org/apache/lucene/queryParser/QueryParser.jj @@ -27,8 +27,8 @@ package org.apache.lucene.queryParser; import java.io.IOException; import java.io.StringReader; -import java.text.DateFormat; import java.text.Collator; +import java.text.DateFormat; import java.util.ArrayList; import java.util.Calendar; import java.util.Date; @@ -39,7 +39,10 @@ import java.util.Map; import java.util.Vector; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.DateField; import org.apache.lucene.document.DateTools; import org.apache.lucene.index.Term; @@ -542,48 +545,126 @@ public class QueryParser { // PhraseQuery, or nothing based on the term count TokenStream source = analyzer.tokenStream(field, new StringReader(queryText)); - List list = new ArrayList(); - final org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token(); - org.apache.lucene.analysis.Token nextToken; + CachingTokenFilter buffer = new CachingTokenFilter(source); + TermAttribute termAtt = null; + PositionIncrementAttribute posIncrAtt = null; + int numTokens = 0; + + org.apache.lucene.analysis.Token reusableToken = null; + org.apache.lucene.analysis.Token nextToken = null; + + + boolean useNewAPI = TokenStream.useNewAPI(); + + if (useNewAPI) { + boolean success = false; + try { + buffer.start(); + success = true; + } catch (IOException e) { + // success==false if we hit an exception + } + if (success) { + if (buffer.hasAttribute(TermAttribute.class)) { + termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class); + } + if (buffer.hasAttribute(PositionIncrementAttribute.class)) { + posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class); + } + } + } else { + reusableToken = new org.apache.lucene.analysis.Token(); + } + int positionCount = 0; boolean severalTokensAtSamePosition = false; - while (true) { - try { - nextToken = source.next(reusableToken); + if (useNewAPI) { + if (termAtt != null) { + try { + while (buffer.incrementToken()) { + numTokens++; + int positionIncrement = (posIncrAtt != null) ? 
posIncrAtt.getPositionIncrement() : 1; + if (positionIncrement != 0) { + positionCount += positionIncrement; + } else { + severalTokensAtSamePosition = true; + } + } + } catch (IOException e) { + // ignore + } } - catch (IOException e) { - nextToken = null; - } - if (nextToken == null) - break; - list.add(nextToken.clone()); - if (nextToken.getPositionIncrement() != 0) - positionCount += nextToken.getPositionIncrement(); - else - severalTokensAtSamePosition = true; + } else { + while (true) { + try { + nextToken = buffer.next(reusableToken); + } + catch (IOException e) { + nextToken = null; + } + if (nextToken == null) + break; + numTokens++; + if (nextToken.getPositionIncrement() != 0) + positionCount += nextToken.getPositionIncrement(); + else + severalTokensAtSamePosition = true; + } } try { + // rewind the buffer stream + buffer.reset(); + + // close original stream - all tokens buffered source.close(); } catch (IOException e) { // ignore } - - if (list.size() == 0) + + if (numTokens == 0) return null; - else if (list.size() == 1) { - nextToken = (org.apache.lucene.analysis.Token) list.get(0); - return newTermQuery(new Term(field, nextToken.term())); + else if (numTokens == 1) { + String term = null; + try { + + if (useNewAPI) { + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + term = termAtt.term(); + } else { + nextToken = buffer.next(reusableToken); + assert nextToken != null; + term = nextToken.term(); + } + } catch (IOException e) { + // safe to ignore, because we know the number of tokens + } + return newTermQuery(new Term(field, term)); } else { if (severalTokensAtSamePosition) { if (positionCount == 1) { // no phrase query: BooleanQuery q = newBooleanQuery(true); - for (int i = 0; i < list.size(); i++) { - nextToken = (org.apache.lucene.analysis.Token) list.get(i); + for (int i = 0; i < numTokens; i++) { + String term = null; + try { + if (useNewAPI) { + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + term = termAtt.term(); + } else { + nextToken = buffer.next(reusableToken); + assert nextToken != null; + term = nextToken.term(); + } + } catch (IOException e) { + // safe to ignore, because we know the number of tokens + } + Query currentQuery = newTermQuery( - new Term(field, nextToken.term())); + new Term(field, term)); q.add(currentQuery, BooleanClause.Occur.SHOULD); } return q; @@ -594,9 +675,28 @@ public class QueryParser { mpq.setSlop(phraseSlop); List multiTerms = new ArrayList(); int position = -1; - for (int i = 0; i < list.size(); i++) { - nextToken = (org.apache.lucene.analysis.Token) list.get(i); - if (nextToken.getPositionIncrement() > 0 && multiTerms.size() > 0) { + for (int i = 0; i < numTokens; i++) { + String term = null; + int positionIncrement = 1; + try { + if (useNewAPI) { + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + term = termAtt.term(); + if (posIncrAtt != null) { + positionIncrement = posIncrAtt.getPositionIncrement(); + } + } else { + nextToken = buffer.next(reusableToken); + assert nextToken != null; + term = nextToken.term(); + positionIncrement = nextToken.getPositionIncrement(); + } + } catch (IOException e) { + // safe to ignore, because we know the number of tokens + } + + if (positionIncrement > 0 && multiTerms.size() > 0) { if (enablePositionIncrements) { mpq.add((Term[])multiTerms.toArray(new Term[0]),position); } else { @@ -604,8 +704,8 @@ public class QueryParser { } multiTerms.clear(); } - position += nextToken.getPositionIncrement(); - multiTerms.add(new Term(field, 
nextToken.term())); + position += positionIncrement; + multiTerms.add(new Term(field, term)); } if (enablePositionIncrements) { mpq.add((Term[])multiTerms.toArray(new Term[0]),position); @@ -619,13 +719,36 @@ public class QueryParser { PhraseQuery pq = newPhraseQuery(); pq.setSlop(phraseSlop); int position = -1; - for (int i = 0; i < list.size(); i++) { - nextToken = (org.apache.lucene.analysis.Token) list.get(i); + + + for (int i = 0; i < numTokens; i++) { + String term = null; + int positionIncrement = 1; + + try { + if (useNewAPI) { + + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + term = termAtt.term(); + if (posIncrAtt != null) { + positionIncrement = posIncrAtt.getPositionIncrement(); + } + } else { + nextToken = buffer.next(reusableToken); + assert nextToken != null; + term = nextToken.term(); + positionIncrement = nextToken.getPositionIncrement(); + } + } catch (IOException e) { + // safe to ignore, because we know the number of tokens + } + if (enablePositionIncrements) { - position += nextToken.getPositionIncrement(); - pq.add(new Term(field, nextToken.term()),position); + position += positionIncrement; + pq.add(new Term(field, term),position); } else { - pq.add(new Term(field, nextToken.term())); + pq.add(new Term(field, term)); } } return pq; @@ -634,6 +757,7 @@ public class QueryParser { } + /** * Base implementation delegates to {@link #getFieldQuery(String,String)}. * This method may be overridden, for example, to return diff --git a/src/java/org/apache/lucene/queryParser/QueryParserTokenManager.java b/src/java/org/apache/lucene/queryParser/QueryParserTokenManager.java index 41bb40755f9..dc8899a1772 100644 --- a/src/java/org/apache/lucene/queryParser/QueryParserTokenManager.java +++ b/src/java/org/apache/lucene/queryParser/QueryParserTokenManager.java @@ -2,8 +2,8 @@ package org.apache.lucene.queryParser; import java.io.IOException; import java.io.StringReader; -import java.text.DateFormat; import java.text.Collator; +import java.text.DateFormat; import java.util.ArrayList; import java.util.Calendar; import java.util.Date; @@ -13,7 +13,10 @@ import java.util.Locale; import java.util.Map; import java.util.Vector; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.DateField; import org.apache.lucene.document.DateTools; import org.apache.lucene.index.Term; diff --git a/src/java/org/apache/lucene/search/QueryTermVector.java b/src/java/org/apache/lucene/search/QueryTermVector.java index 8a1a8bd252b..3070896a248 100644 --- a/src/java/org/apache/lucene/search/QueryTermVector.java +++ b/src/java/org/apache/lucene/search/QueryTermVector.java @@ -29,6 +29,7 @@ import java.util.Map; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.index.TermFreqVector; /** @@ -58,9 +59,17 @@ public class QueryTermVector implements TermFreqVector { { List terms = new ArrayList(); try { - final Token reusableToken = new Token(); - for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { - terms.add(nextToken.term()); + if (stream.useNewAPI()) { + stream.reset(); + TermAttribute termAtt 
= (TermAttribute) stream.getAttribute(TermAttribute.class); + while (stream.incrementToken()) { + terms.add(termAtt.term()); + } + } else { + final Token reusableToken = new Token(); + for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { + terms.add(nextToken.term()); + } } processTerms((String[])terms.toArray(new String[terms.size()])); } catch (IOException e) { diff --git a/src/java/org/apache/lucene/util/Attribute.java b/src/java/org/apache/lucene/util/Attribute.java new file mode 100644 index 00000000000..ad5de1eb651 --- /dev/null +++ b/src/java/org/apache/lucene/util/Attribute.java @@ -0,0 +1,95 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; + +/** + * Base class for Attributes that can be added to a + * {@link org.apache.lucene.util.AttributeSource}. + *

    + * Attributes are used to add data in a dynamic, yet type-safe way to a source + * of usually streamed objects, e.g. a {@link org.apache.lucene.analysis.TokenStream}. + *
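    + * 
    + * As an editorial sketch (the class and field names below are hypothetical and
    + * not part of this patch), a custom Attribute only has to implement the abstract
    + * methods declared in this class:
    + * 
    + *   public class PartOfSpeechAttribute extends Attribute {
    + *     private String pos = "unknown";
    + *     public void setPartOfSpeech(String pos) { this.pos = pos; }
    + *     public String getPartOfSpeech() { return pos; }
    + *     public void clear() { pos = "unknown"; }
    + *     public String toString() { return "partOfSpeech=" + pos; }
    + *     public int hashCode() { return pos.hashCode(); }
    + *     public boolean equals(Object other) {
    + *       return other instanceof PartOfSpeechAttribute
    + *           && ((PartOfSpeechAttribute) other).pos.equals(pos);
    + *     }
    + *     public void copyTo(Attribute target) {
    + *       ((PartOfSpeechAttribute) target).setPartOfSpeech(pos);
    + *     }
    + *   }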

    + * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. + */ +public abstract class Attribute implements Cloneable, Serializable { + /** + * Clears the values in this Attribute and resets it to its + * default value. + */ + public abstract void clear(); + + /** + * Subclasses must implement this method and should follow a syntax + * similar to this one: + * + *

    +   *   public String toString() {
    +   *     return "start=" + startOffset + ",end=" + endOffset;
    +   *   }
    +   * 
    + */ + public abstract String toString(); + + /** + * Subclasses must implement this method and should compute + * a hashCode similar to this: + *
    +   *   public int hashCode() {
    +   *     int code = startOffset;
    +   *     code = code * 31 + endOffset;
    +   *     return code;
    +   *   }
    +   * 
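    +   * 
    +   * A matching equals (a sketch only, reusing the illustrative startOffset/endOffset
    +   * values from the example above, which are modeled on OffsetAttribute) would compare
    +   * exactly the values that went into the hashCode:
    +   * 
    +   *   public boolean equals(Object other) {
    +   *     if (other == this) return true;
    +   *     if (!(other instanceof OffsetAttribute)) return false;
    +   *     OffsetAttribute o = (OffsetAttribute) other;
    +   *     return o.startOffset == startOffset && o.endOffset == endOffset;
    +   *   }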
    + * + * see also {@link #equals(Object)} + */ + public abstract int hashCode(); + + /** + * All values used for computation of {@link #hashCode()} + * should be checked here for equality. + * + * see also {@link Object#equals(Object)} + */ + public abstract boolean equals(Object other); + + /** + * Copies the values from this Attribute into the passed-in + * target attribute. The type of the target must match the type + * of this attribute. + */ + public abstract void copyTo(Attribute target); + + /** + * Shallow clone. Subclasses must override this if they + * need to clone any members deeply, + */ + public Object clone() { + Object clone = null; + try { + clone = super.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); // shouldn't happen + } + return clone; + } +} diff --git a/src/java/org/apache/lucene/util/AttributeSource.java b/src/java/org/apache/lucene/util/AttributeSource.java new file mode 100644 index 00000000000..afecea2d26c --- /dev/null +++ b/src/java/org/apache/lucene/util/AttributeSource.java @@ -0,0 +1,274 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; + + +/** + * An AttributeSource contains a list of different {@link Attribute}s, + * and methods to add and get them. There can only be a single instance + * of an attribute in the same AttributeSource instance. This is ensured + * by passing in the actual type of the Attribute (Class<Attribute>) to + * the {@link #addAttribute(Class)}, which then checks if an instance of + * that type is already present. If yes, it returns the instance, otherwise + * it creates a new instance and returns it. + * + *
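    + * 
    + * As a usage sketch (variable names here are illustrative; TermAttribute is one of the
    + * attributes introduced in this patch), a consumer asks the source for an attribute
    + * instance once and then re-reads it after every successful call to incrementToken():
    + * 
    + *   TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
    + *   stream.reset();
    + *   TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
    + *   while (stream.incrementToken()) {
    + *     System.out.println(termAtt.term());
    + *   }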

    + * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. + */ +public class AttributeSource { + /** + * An AttributeAcceptor defines only a single method {@link #accept(Class)}. + * It can be used for e. g. buffering purposes to specify which attributes + * to buffer. + */ + public static abstract class AttributeAcceptor { + /** Return true, to accept this attribute; false otherwise */ + public abstract boolean accept(Class attClass); + } + + /** + * Default AttributeAcceptor that accepts all attributes. + */ + public static final AttributeAcceptor AllAcceptor = new AttributeAcceptor() { + public boolean accept(Class attClass) {return true;} + }; + + /** + * Holds the Class<Attribute> -> Attribute mapping + */ + protected Map attributes; + + public AttributeSource() { + this.attributes = new LinkedHashMap(); + } + + public AttributeSource(AttributeSource input) { + this.attributes = input.attributes; + } + + /** Returns an iterator that iterates the attributes + * in the same order they were added in. + */ + public Iterator getAttributesIterator() { + return attributes.values().iterator(); + } + + /** + * The caller must pass in a Class<? extends Attribute> value. + * This method first checks if an instance of that class is + * already in this AttributeSource and returns it. Otherwise a + * new instance is created, added to this AttributeSource and returned. + */ + public Attribute addAttribute(Class attClass) { + Attribute att = (Attribute) attributes.get(attClass); + if (att == null) { + try { + att = (Attribute) attClass.newInstance(); + } catch (InstantiationException e) { + throw new IllegalArgumentException("Could not instantiate class " + attClass); + } catch (IllegalAccessException e) { + throw new IllegalArgumentException("Could not instantiate class " + attClass); + } + + attributes.put(attClass, att); + } + return att; + } + + /** Returns true, iff this AttributeSource has any attributes */ + public boolean hasAttributes() { + return !this.attributes.isEmpty(); + } + + /** + * The caller must pass in a Class<? extends Attribute> value. + * Returns true, iff this AttributeSource contains the passed-in Attribute. + */ + public boolean hasAttribute(Class attClass) { + return this.attributes.containsKey(attClass); + } + + /** + * The caller must pass in a Class<? extends Attribute> value. + * Returns the instance of the passed in Attribute contained in this AttributeSource + * + * @throws IllegalArgumentException if this AttributeSource does not contain the + * Attribute + */ + public Attribute getAttribute(Class attClass) { + Attribute att = (Attribute) this.attributes.get(attClass); + if (att == null) { + throw new IllegalArgumentException("This token does not have the attribute '" + attClass + "'."); + } + + return att; + } + + /** + * Resets all Attributes in this AttributeSource by calling + * {@link Attribute#clear()} on each Attribute. + */ + public void clearAttributes() { + Iterator it = getAttributesIterator(); + while (it.hasNext()) { + ((Attribute) it.next()).clear(); + } + } + + /** + * Captures the current state of the passed in TokenStream. + *

    + * This state will contain all of the passed in TokenStream's + * {@link Attribute}s. If only a subset of the attributes is needed, + * please use {@link #captureState(AttributeAcceptor)}. + */ + public AttributeSource captureState() { + return captureState(AllAcceptor); + } + + /** + * Captures the current state of this AttributeSource. + *
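    +   * 
    +   * For example (a sketch; "source" stands for any AttributeSource), to snapshot only
    +   * the term text one could pass an acceptor that accepts nothing but TermAttribute:
    +   * 
    +   *   AttributeSource termOnly = source.captureState(new AttributeAcceptor() {
    +   *     public boolean accept(Class attClass) {
    +   *       return attClass == TermAttribute.class;
    +   *     }
    +   *   });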

    + * This state will contain all of this AttributeSource's + * {@link Attribute}s which the {@link AttributeAcceptor} accepts. + */ + public AttributeSource captureState(AttributeAcceptor acceptor) { + AttributeSource state = new AttributeSource(); + + Iterator it = getAttributesIterator(); + while(it.hasNext()) { + Attribute att = (Attribute) it.next(); + if (acceptor.accept(att.getClass())) { + Attribute clone = (Attribute) att.clone(); + state.attributes.put(att.getClass(), clone); + } + } + + return state; + } + + /** + * Restores this state by copying the values of all attributes + * that this state contains into the attributes of the targetStream. + * The targetStream must contain a corresponding instance for each Attribute + * contained in this state. + *
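    +   * 
    +   * A typical round trip (a sketch; "stream" stands for any TokenStream or other
    +   * AttributeSource) looks like this:
    +   * 
    +   *   AttributeSource state = stream.captureState(); // clones the current attribute values
    +   *   // ... the stream advances and its attribute values change ...
    +   *   state.restoreState(stream);                    // copies the saved values back
    +   * 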

    + * Note that this method does not affect attributes of the targetStream + * that are not contained in this state. In other words, if for example + * the targetStream contains an OffsetAttribute, but this state doesn't, then + * the value of the OffsetAttribute remains unchanged. It might be desirable to + * reset its value to the default, in which case the caller should first + * call {@link TokenStream#clearAttributes()} on the targetStream. + */ + public void restoreState(AttributeSource target) { + Iterator it = getAttributesIterator(); + while (it.hasNext()) { + Attribute att = (Attribute) it.next(); + Attribute targetAtt = target.getAttribute(att.getClass()); + att.copyTo(targetAtt); + } + } + + public int hashCode() { + int code = 0; + if (hasAttributes()) { + Iterator it = getAttributesIterator(); + while (it.hasNext()) { + code = code * 31 + it.next().hashCode(); + } + } + + return code; + } + + public boolean equals(Object obj) { + if (obj == this) { + return true; + } + + if (obj instanceof AttributeSource) { + AttributeSource other = (AttributeSource) obj; + + if (hasAttributes()) { + if (!other.hasAttributes()) { + return false; + } + + if (attributes.size() != other.attributes.size()) { + return false; + } + + Iterator it = getAttributesIterator(); + while (it.hasNext()) { + Class attName = it.next().getClass(); + + Attribute otherAtt = (Attribute) other.attributes.get(attName); + if (otherAtt == null || !otherAtt.equals(attributes.get(attName))) { + return false; + } + } + return true; + } else { + return !other.hasAttributes(); + } + } else + return false; + } + + +// TODO: Java 1.5 +// private Map, Attribute> attributes; +// public T addAttribute(Class attClass) { +// T att = (T) attributes.get(attClass); +// if (att == null) { +// try { +// att = attClass.newInstance(); +// } catch (InstantiationException e) { +// throw new IllegalArgumentException("Could not instantiate class " + attClass); +// } catch (IllegalAccessException e) { +// throw new IllegalArgumentException("Could not instantiate class " + attClass); +// } +// +// attributes.put(attClass, att); +// } +// return att; +// } +// +// public boolean hasAttribute(Class attClass) { +// return this.attributes.containsKey(attClass); +// } +// +// public T getAttribute(Class attClass) { +// Attribute att = this.attributes.get(attClass); +// if (att == null) { +// throw new IllegalArgumentException("This token does not have the attribute '" + attClass + "'."); +// } +// +// return (T) att; +// } +// + +} diff --git a/src/test/org/apache/lucene/AnalysisTest.java b/src/test/org/apache/lucene/AnalysisTest.java index a3b4dbd0cb2..be83163145b 100644 --- a/src/test/org/apache/lucene/AnalysisTest.java +++ b/src/test/org/apache/lucene/AnalysisTest.java @@ -17,19 +17,20 @@ package org.apache.lucene; * limitations under the License. 
*/ -import org.apache.lucene.analysis.SimpleAnalyzer; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Token; - -import java.io.Reader; -import java.io.StringReader; +import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; -import java.io.BufferedReader; import java.io.InputStreamReader; +import java.io.Reader; +import java.io.StringReader; import java.util.Date; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + class AnalysisTest { static File tmpFile; public static void main(String[] args) { @@ -70,12 +71,15 @@ class AnalysisTest { Date start = new Date(); int count = 0; - final Token reusableToken = new Token(); - for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { + + stream.reset(); + TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class); + OffsetAttribute offsetAtt = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class); + while (stream.incrementToken()) { if (verbose) { - System.out.println("Text=" + nextToken.term() - + " start=" + nextToken.startOffset() - + " end=" + nextToken.endOffset()); + System.out.println("Text=" + termAtt.term() + + " start=" + offsetAtt.startOffset() + + " end=" + offsetAtt.endOffset()); } count++; } diff --git a/src/test/org/apache/lucene/analysis/TeeSinkTokenTest.java b/src/test/org/apache/lucene/analysis/TeeSinkTokenTest.java index e668d5c9d26..e7d766e3ed1 100644 --- a/src/test/org/apache/lucene/analysis/TeeSinkTokenTest.java +++ b/src/test/org/apache/lucene/analysis/TeeSinkTokenTest.java @@ -18,6 +18,9 @@ package org.apache.lucene.analysis; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.English; import org.apache.lucene.util.LuceneTestCase; @@ -40,7 +43,8 @@ public class TeeSinkTokenTest extends LuceneTestCase { super(s); } - protected void setUp() { + protected void setUp() throws Exception { + super.setUp(); tokens1 = new String[]{"The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs"}; tokens2 = new String[]{"The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch"}; buffer1 = new StringBuffer(); @@ -62,24 +66,29 @@ public class TeeSinkTokenTest extends LuceneTestCase { public void test() throws IOException { SinkTokenizer sink1 = new SinkTokenizer(null) { - public void add(Token t) { - if (t != null && t.term().equalsIgnoreCase("The")) { - super.add(t); + public void add(AttributeSource a) throws IOException { + TermAttribute termAtt = null; + if (a.hasAttribute(TermAttribute.class)) { + termAtt = (TermAttribute) a.getAttribute(TermAttribute.class); + } + if (termAtt != null && termAtt.term().equalsIgnoreCase("The")) { + super.add(a); } } }; TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), sink1); int i = 0; - final Token reusableToken = new Token(); - for (Token nextToken = source.next(reusableToken); nextToken != null; nextToken = 
source.next(reusableToken)) { - assertTrue(nextToken.term() + " is not equal to " + tokens1[i], nextToken.term().equals(tokens1[i]) == true); + TermAttribute termAtt = (TermAttribute) source.getAttribute(TermAttribute.class); + while (source.incrementToken()) { + assertTrue(termAtt.term() + " is not equal to " + tokens1[i], termAtt.term().equals(tokens1[i]) == true); i++; } assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length); assertTrue("sink1 Size: " + sink1.getTokens().size() + " is not: " + 2, sink1.getTokens().size() == 2); i = 0; - for (Token token = sink1.next(reusableToken); token != null; token = sink1.next(reusableToken)) { - assertTrue(token.term() + " is not equal to " + "The", token.term().equalsIgnoreCase("The") == true); + termAtt = (TermAttribute) sink1.getAttribute(TermAttribute.class); + while (sink1.incrementToken()) { + assertTrue(termAtt.term() + " is not equal to " + "The", termAtt.term().equalsIgnoreCase("The") == true); i++; } assertTrue(i + " does not equal: " + sink1.getTokens().size(), i == sink1.getTokens().size()); @@ -87,55 +96,67 @@ public class TeeSinkTokenTest extends LuceneTestCase { public void testMultipleSources() throws Exception { SinkTokenizer theDetector = new SinkTokenizer(null) { - public void add(Token t) { - if (t != null && t.term().equalsIgnoreCase("The")) { - super.add(t); + public void add(AttributeSource a) throws IOException { + TermAttribute termAtt = null; + if (a.hasAttribute(TermAttribute.class)) { + termAtt = (TermAttribute) a.getAttribute(TermAttribute.class); + } + if (termAtt != null && termAtt.term().equalsIgnoreCase("The")) { + super.add(a); } } }; - SinkTokenizer dogDetector = new SinkTokenizer(null) { - public void add(Token t) { - if (t != null && t.term().equalsIgnoreCase("Dogs")) { - super.add(t); + SinkTokenizer dogDetector = new SinkTokenizer(null) { + public void add(AttributeSource a) throws IOException { + TermAttribute termAtt = null; + if (a.hasAttribute(TermAttribute.class)) { + termAtt = (TermAttribute) a.getAttribute(TermAttribute.class); + } + if (termAtt != null && termAtt.term().equalsIgnoreCase("Dogs")) { + super.add(a); } } }; TokenStream source1 = new CachingTokenFilter(new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), theDetector), dogDetector)); TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer2.toString())), theDetector), dogDetector); int i = 0; - final Token reusableToken = new Token(); - for (Token nextToken = source1.next(reusableToken); nextToken != null; nextToken = source1.next(reusableToken)) { - assertTrue(nextToken.term() + " is not equal to " + tokens1[i], nextToken.term().equals(tokens1[i]) == true); + TermAttribute termAtt = (TermAttribute) source1.getAttribute(TermAttribute.class); + while (source1.incrementToken()) { + assertTrue(termAtt.term() + " is not equal to " + tokens1[i], termAtt.term().equals(tokens1[i]) == true); i++; } assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length); assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 2, theDetector.getTokens().size() == 2); assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 1, dogDetector.getTokens().size() == 1); i = 0; - for (Token nextToken = source2.next(reusableToken); nextToken != null; nextToken = source2.next(reusableToken)) { - assertTrue(nextToken.term() + " is not equal to " + tokens2[i], nextToken.term().equals(tokens2[i]) 
== true); + termAtt = (TermAttribute) source2.getAttribute(TermAttribute.class); + while (source2.incrementToken()) { + assertTrue(termAtt.term() + " is not equal to " + tokens2[i], termAtt.term().equals(tokens2[i]) == true); i++; } assertTrue(i + " does not equal: " + tokens2.length, i == tokens2.length); assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 4, theDetector.getTokens().size() == 4); assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 2, dogDetector.getTokens().size() == 2); i = 0; - for (Token nextToken = theDetector.next(reusableToken); nextToken != null; nextToken = theDetector.next(reusableToken)) { - assertTrue(nextToken.term() + " is not equal to " + "The", nextToken.term().equalsIgnoreCase("The") == true); + termAtt = (TermAttribute) theDetector.getAttribute(TermAttribute.class); + while (theDetector.incrementToken()) { + assertTrue(termAtt.term() + " is not equal to " + "The", termAtt.term().equalsIgnoreCase("The") == true); i++; } assertTrue(i + " does not equal: " + theDetector.getTokens().size(), i == theDetector.getTokens().size()); i = 0; - for (Token nextToken = dogDetector.next(reusableToken); nextToken != null; nextToken = dogDetector.next(reusableToken)) { - assertTrue(nextToken.term() + " is not equal to " + "Dogs", nextToken.term().equalsIgnoreCase("Dogs") == true); + termAtt = (TermAttribute) dogDetector.getAttribute(TermAttribute.class); + while (dogDetector.incrementToken()) { + assertTrue(termAtt.term() + " is not equal to " + "Dogs", termAtt.term().equalsIgnoreCase("Dogs") == true); i++; } assertTrue(i + " does not equal: " + dogDetector.getTokens().size(), i == dogDetector.getTokens().size()); source1.reset(); TokenStream lowerCasing = new LowerCaseFilter(source1); i = 0; - for (Token nextToken = lowerCasing.next(reusableToken); nextToken != null; nextToken = lowerCasing.next(reusableToken)) { - assertTrue(nextToken.term() + " is not equal to " + tokens1[i].toLowerCase(), nextToken.term().equals(tokens1[i].toLowerCase()) == true); + termAtt = (TermAttribute) lowerCasing.getAttribute(TermAttribute.class); + while (lowerCasing.incrementToken()) { + assertTrue(termAtt.term() + " is not equal to " + tokens1[i].toLowerCase(), termAtt.term().equals(tokens1[i].toLowerCase()) == true); i++; } assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length); @@ -157,21 +178,20 @@ public class TeeSinkTokenTest extends LuceneTestCase { } //make sure we produce the same tokens ModuloSinkTokenizer sink = new ModuloSinkTokenizer(tokCount[k], 100); - final Token reusableToken = new Token(); TokenStream stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink); - while (stream.next(reusableToken) != null) { + while (stream.incrementToken()) { } stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), 100); List tmp = new ArrayList(); - for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { - tmp.add(nextToken.clone()); + while (stream.incrementToken()) { + tmp.add(stream.captureState()); } List sinkList = sink.getTokens(); assertTrue("tmp Size: " + tmp.size() + " is not: " + sinkList.size(), tmp.size() == sinkList.size()); for (int i = 0; i < tmp.size(); i++) { - Token tfTok = (Token) tmp.get(i); - Token sinkTok = (Token) sinkList.get(i); - assertTrue(tfTok.term() + " is not equal to " + sinkTok.term() + " at token: " + i, 
tfTok.term().equals(sinkTok.term()) == true); + AttributeSource tfTok = (AttributeSource) tmp.get(i); + AttributeSource sinkTok = (AttributeSource) sinkList.get(i); + assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.equals(sinkTok) == true); } //simulate two fields, each being analyzed once, for 20 documents @@ -180,12 +200,14 @@ public class TeeSinkTokenTest extends LuceneTestCase { long start = System.currentTimeMillis(); for (int i = 0; i < 20; i++) { stream = new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))); - for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { - tfPos += nextToken.getPositionIncrement(); + PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class); + while (stream.incrementToken()) { + tfPos += posIncrAtt.getPositionIncrement(); } stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), modCounts[j]); - for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { - tfPos += nextToken.getPositionIncrement(); + posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class); + while (stream.incrementToken()) { + tfPos += posIncrAtt.getPositionIncrement(); } } long finish = System.currentTimeMillis(); @@ -196,13 +218,15 @@ public class TeeSinkTokenTest extends LuceneTestCase { for (int i = 0; i < 20; i++) { sink = new ModuloSinkTokenizer(tokCount[k], modCounts[j]); stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink); - for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { - sinkPos += nextToken.getPositionIncrement(); + PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class); + while (stream.incrementToken()) { + sinkPos += posIncrAtt.getPositionIncrement(); } //System.out.println("Modulo--------"); stream = sink; - for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { - sinkPos += nextToken.getPositionIncrement(); + posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class); + while (stream.incrementToken()) { + sinkPos += posIncrAtt.getPositionIncrement(); } } finish = System.currentTimeMillis(); @@ -228,15 +252,15 @@ public class TeeSinkTokenTest extends LuceneTestCase { int count = 0; //return every 100 tokens - public Token next(final Token reusableToken) throws IOException { - Token nextToken = null; - for (nextToken = input.next(reusableToken); - nextToken != null && count % modCount != 0; - nextToken = input.next(reusableToken)) { + public boolean incrementToken() throws IOException { + boolean hasNext; + for (hasNext = input.incrementToken(); + hasNext && count % modCount != 0; + hasNext = input.incrementToken()) { count++; } count++; - return nextToken; + return hasNext; } } @@ -250,9 +274,9 @@ public class TeeSinkTokenTest extends LuceneTestCase { lst = new ArrayList(numToks % mc); } - public void add(Token t) { - if (t != null && count % modCount == 0) { - super.add(t); + public void add(AttributeSource a) throws IOException { + if (a != null && count % modCount == 0) { + super.add(a); } count++; } diff --git a/src/test/org/apache/lucene/analysis/TestAnalyzers.java 
b/src/test/org/apache/lucene/analysis/TestAnalyzers.java index 6af5cf2b957..ed1e7b31d3d 100644 --- a/src/test/org/apache/lucene/analysis/TestAnalyzers.java +++ b/src/test/org/apache/lucene/analysis/TestAnalyzers.java @@ -19,10 +19,10 @@ package org.apache.lucene.analysis; import java.io.IOException; import java.io.StringReader; -import java.util.LinkedList; -import java.util.List; import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.index.Payload; import org.apache.lucene.util.LuceneTestCase; @@ -36,13 +36,12 @@ public class TestAnalyzers extends LuceneTestCase { String input, String[] output) throws Exception { TokenStream ts = a.tokenStream("dummy", new StringReader(input)); - final Token reusableToken = new Token(); + TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); for (int i=0; i test with enable-increments-"+(enableIcrements?"enabled":"disabled")); stpf.setEnablePositionIncrements(enableIcrements); - final Token reusableToken = new Token(); + TermAttribute termAtt = (TermAttribute) stpf.getAttribute(TermAttribute.class); + PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stpf.getAttribute(PositionIncrementAttribute.class); for (int i=0; i<20; i+=3) { - Token nextToken = stpf.next(reusableToken); - log("Token "+i+": "+nextToken); + assertTrue(stpf.incrementToken()); + log("Token "+i+": "+stpf); String w = English.intToEnglish(i).trim(); - assertEquals("expecting token "+i+" to be "+w,w,nextToken.term()); - assertEquals("all but first token must have position increment of 3",enableIcrements?(i==0?1:3):1,nextToken.getPositionIncrement()); + assertEquals("expecting token "+i+" to be "+w,w,termAtt.term()); + assertEquals("all but first token must have position increment of 3",enableIcrements?(i==0?1:3):1,posIncrAtt.getPositionIncrement()); } - assertNull(stpf.next(reusableToken)); + assertFalse(stpf.incrementToken()); } // print debug info depending on VERBOSE diff --git a/src/test/org/apache/lucene/analysis/TestToken.java b/src/test/org/apache/lucene/analysis/TestToken.java index 14201bb2d3f..d133aa8b89c 100644 --- a/src/test/org/apache/lucene/analysis/TestToken.java +++ b/src/test/org/apache/lucene/analysis/TestToken.java @@ -19,6 +19,7 @@ package org.apache.lucene.analysis; import org.apache.lucene.util.LuceneTestCase; +/** @deprecated */ public class TestToken extends LuceneTestCase { public TestToken(String name) { diff --git a/src/test/org/apache/lucene/index/TestDocumentWriter.java b/src/test/org/apache/lucene/index/TestDocumentWriter.java index caddd7873e0..d82b3d8a69f 100644 --- a/src/test/org/apache/lucene/index/TestDocumentWriter.java +++ b/src/test/org/apache/lucene/index/TestDocumentWriter.java @@ -22,12 +22,14 @@ import java.io.Reader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.SimpleAnalyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.analysis.WhitespaceTokenizer; import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import 
org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Fieldable; @@ -35,6 +37,7 @@ import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field.TermVector; import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; @@ -138,33 +141,38 @@ public class TestDocumentWriter extends LuceneTestCase { public TokenStream tokenStream(String fieldName, Reader reader) { return new TokenFilter(new WhitespaceTokenizer(reader)) { boolean first=true; - Token buffered; + AttributeSource state; - public Token next(final Token reusableToken) throws IOException { - if (buffered != null) { - Token nextToken = buffered; - buffered=null; - return nextToken; + public boolean incrementToken() throws IOException { + if (state != null) { + state.restoreState(this); + payloadAtt.setPayload(null); + posIncrAtt.setPositionIncrement(0); + termAtt.setTermBuffer(new char[]{'b'}, 0, 1); + state = null; + return true; } - Token nextToken = input.next(reusableToken); - if (nextToken==null) return null; - if (Character.isDigit(nextToken.termBuffer()[0])) { - nextToken.setPositionIncrement(nextToken.termBuffer()[0] - '0'); + + boolean hasNext = input.incrementToken(); + if (!hasNext) return false; + if (Character.isDigit(termAtt.termBuffer()[0])) { + posIncrAtt.setPositionIncrement(termAtt.termBuffer()[0] - '0'); } if (first) { // set payload on first position only - nextToken.setPayload(new Payload(new byte[]{100})); + payloadAtt.setPayload(new Payload(new byte[]{100})); first = false; } // index a "synonym" for every token - buffered = (Token)nextToken.clone(); - buffered.setPayload(null); - buffered.setPositionIncrement(0); - buffered.setTermBuffer(new char[]{'b'}, 0, 1); + state = captureState(); + return true; - return nextToken; } + + TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class); + PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); + PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); }; } }; @@ -201,12 +209,14 @@ public class TestDocumentWriter extends LuceneTestCase { private String[] tokens = new String[] {"term1", "term2", "term3", "term2"}; private int index = 0; - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; + private TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class); + + public boolean incrementToken() throws IOException { if (index == tokens.length) { - return null; + return false; } else { - return reusableToken.reinit(tokens[index++], 0, 0); + termAtt.setTermBuffer(tokens[index++]); + return true; } } diff --git a/src/test/org/apache/lucene/index/TestIndexWriter.java b/src/test/org/apache/lucene/index/TestIndexWriter.java index 73d8c7fba41..32af7c2c617 100644 --- a/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -17,48 +17,48 @@ package org.apache.lucene.index; * limitations under the License. 
*/ -import java.io.IOException; -import java.io.Reader; -import java.io.File; import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; import java.io.PrintStream; -import java.util.Arrays; +import java.io.Reader; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Random; -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.UnicodeUtil; - -import org.apache.lucene.analysis.WhitespaceAnalyzer; -import org.apache.lucene.analysis.WhitespaceTokenizer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.SinkTokenizer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.analysis.WhitespaceTokenizer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardTokenizer; -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.Query; import org.apache.lucene.search.spans.SpanTermQuery; -import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.store.AlreadyClosedException; -import org.apache.lucene.util._TestUtil; - -import org.apache.lucene.store.MockRAMDirectory; -import org.apache.lucene.store.LockFactory; import org.apache.lucene.store.Lock; +import org.apache.lucene.store.LockFactory; +import org.apache.lucene.store.MockRAMDirectory; +import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.store.SingleInstanceLockFactory; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util._TestUtil; /** * @@ -1793,11 +1793,11 @@ public class TestIndexWriter extends LuceneTestCase return new TokenFilter(new StandardTokenizer(reader)) { private int count = 0; - public Token next(final Token reusableToken) throws IOException { + public boolean incrementToken() throws IOException { if (count++ == 5) { throw new IOException(); } - return input.next(reusableToken); + return input.incrementToken(); } }; } @@ -1916,10 +1916,10 @@ public class TestIndexWriter extends LuceneTestCase this.fieldName = fieldName; } - public Token next(final Token reusableToken) throws IOException { + public boolean incrementToken() throws IOException { if (this.fieldName.equals("crash") && count++ >= 4) throw new IOException("I'm experiencing problems"); - return input.next(reusableToken); + return input.incrementToken(); } public void reset() throws IOException { @@ -3577,21 +3577,47 @@ public class TestIndexWriter extends LuceneTestCase } } + private static class MyAnalyzer extends Analyzer { + + public TokenStream tokenStream(String fieldName, Reader reader) { + 
TokenStream s = new WhitespaceTokenizer(reader); + s.addAttribute(PositionIncrementAttribute.class); + return s; + } + + } + // LUCENE-1255 public void testNegativePositions() throws Throwable { SinkTokenizer tokens = new SinkTokenizer(); - Token t = new Token(); - t.setTermBuffer("a"); - t.setPositionIncrement(0); - tokens.add(t); - t.setTermBuffer("b"); - t.setPositionIncrement(1); - tokens.add(t); - t.setTermBuffer("c"); - tokens.add(t); + tokens.addAttribute(TermAttribute.class); + tokens.addAttribute(PositionIncrementAttribute.class); + + AttributeSource state = new AttributeSource(); + TermAttribute termAtt = (TermAttribute) state.addAttribute(TermAttribute.class); + PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class); + termAtt.setTermBuffer("a"); + posIncrAtt.setPositionIncrement(0); + tokens.add(state); + + state = new AttributeSource(); + termAtt = (TermAttribute) state.addAttribute(TermAttribute.class); + posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class); + + termAtt.setTermBuffer("b"); + posIncrAtt.setPositionIncrement(1); + tokens.add(state); + + state = new AttributeSource(); + termAtt = (TermAttribute) state.addAttribute(TermAttribute.class); + posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class); + + termAtt.setTermBuffer("c"); + posIncrAtt.setPositionIncrement(1); + tokens.add(state); MockRAMDirectory dir = new MockRAMDirectory(); - IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED); + IndexWriter w = new IndexWriter(dir, new MyAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED); Document doc = new Document(); doc.add(new Field("field", tokens)); w.addDocument(doc); diff --git a/src/test/org/apache/lucene/index/TestMultiLevelSkipList.java b/src/test/org/apache/lucene/index/TestMultiLevelSkipList.java index bf24e005e92..1de3c36c01d 100644 --- a/src/test/org/apache/lucene/index/TestMultiLevelSkipList.java +++ b/src/test/org/apache/lucene/index/TestMultiLevelSkipList.java @@ -20,19 +20,18 @@ package org.apache.lucene.index; import java.io.IOException; import java.io.Reader; -import org.apache.lucene.util.LuceneTestCase; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.LowerCaseTokenizer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; /** * This testcase tests whether multi-level skipping is being used @@ -99,17 +98,19 @@ public class TestMultiLevelSkipList extends LuceneTestCase { private static class PayloadFilter extends TokenFilter { static int count = 0; + PayloadAttribute payloadAtt; + protected PayloadFilter(TokenStream input) { super(input); + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); } - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken != null) { - nextToken.setPayload(new Payload(new byte[] { (byte) count++ })); - } - 
return nextToken; + public boolean incrementToken() throws IOException { + boolean hasNext = input.incrementToken(); + if (hasNext) { + payloadAtt.setPayload(new Payload(new byte[] { (byte) count++ })); + } + return hasNext; } } diff --git a/src/test/org/apache/lucene/index/TestPayloads.java b/src/test/org/apache/lucene/index/TestPayloads.java index b5cb4e3bcdb..f915e6d8aa2 100644 --- a/src/test/org/apache/lucene/index/TestPayloads.java +++ b/src/test/org/apache/lucene/index/TestPayloads.java @@ -27,20 +27,20 @@ import java.util.List; import java.util.Map; import java.util.Random; -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.UnicodeUtil; - import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.UnicodeUtil; public class TestPayloads extends LuceneTestCase { @@ -442,32 +442,33 @@ public class TestPayloads extends LuceneTestCase { private int length; private int offset; Payload payload = new Payload(); + PayloadAttribute payloadAtt; public PayloadFilter(TokenStream in, byte[] data, int offset, int length) { super(in); this.data = data; this.length = length; this.offset = offset; + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); } - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken != null) { + public boolean incrementToken() throws IOException { + boolean hasNext = input.incrementToken(); + if (hasNext) { if (offset + length <= data.length) { Payload p = null; if (p == null) { p = new Payload(); - nextToken.setPayload(p); + payloadAtt.setPayload(p); } p.setData(data, offset, length); offset += length; } else { - nextToken.setPayload(null); + payloadAtt.setPayload(null); } } - return nextToken; + return hasNext; } } @@ -529,19 +530,25 @@ public class TestPayloads extends LuceneTestCase { private boolean first; private ByteArrayPool pool; private String term; + + TermAttribute termAtt; + PayloadAttribute payloadAtt; + PoolingPayloadTokenStream(ByteArrayPool pool) { this.pool = pool; payload = pool.get(); generateRandomData(payload); term = pool.bytesToString(payload); first = true; + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } - public Token next(final Token reusableToken) throws IOException { - if (!first) return null; - reusableToken.reinit(term, 0, 0); - reusableToken.setPayload(new Payload(payload)); - return reusableToken; + public boolean incrementToken() throws IOException { + if (!first) return false; + termAtt.setTermBuffer(term); + payloadAtt.setPayload(new Payload(payload)); + return true; } public void close() throws IOException { diff --git a/src/test/org/apache/lucene/index/TestTermVectorsReader.java b/src/test/org/apache/lucene/index/TestTermVectorsReader.java index 
c66fa52efb4..a7b07838fd1 100644 --- a/src/test/org/apache/lucene/index/TestTermVectorsReader.java +++ b/src/test/org/apache/lucene/index/TestTermVectorsReader.java @@ -17,14 +17,6 @@ package org.apache.lucene.index; * limitations under the License. */ -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.store.MockRAMDirectory; -import org.apache.lucene.util.LuceneTestCase; - import java.io.IOException; import java.io.Reader; import java.util.Arrays; @@ -32,6 +24,16 @@ import java.util.Iterator; import java.util.Map; import java.util.SortedSet; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.store.MockRAMDirectory; +import org.apache.lucene.util.LuceneTestCase; + public class TestTermVectorsReader extends LuceneTestCase { //Must be lexicographically sorted, will do in setup, versus trying to maintain here private String[] testFields = {"f1", "f2", "f3", "f4"}; @@ -118,17 +120,31 @@ public class TestTermVectorsReader extends LuceneTestCase { private class MyTokenStream extends TokenStream { int tokenUpto; - public Token next(final Token reusableToken) { + + TermAttribute termAtt; + PositionIncrementAttribute posIncrAtt; + OffsetAttribute offsetAtt; + + public MyTokenStream() { + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + } + + public boolean incrementToken() { if (tokenUpto >= tokens.length) - return null; + return false; else { final TestToken testToken = tokens[tokenUpto++]; - reusableToken.reinit(testToken.text, testToken.startOffset, testToken.endOffset); - if (tokenUpto > 1) - reusableToken.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos); - else - reusableToken.setPositionIncrement(testToken.pos+1); - return reusableToken; + termAtt.setTermBuffer(testToken.text); + offsetAtt.setStartOffset(testToken.startOffset); + offsetAtt.setEndOffset(testToken.endOffset); + if (tokenUpto > 1) { + posIncrAtt.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos); + } else { + posIncrAtt.setPositionIncrement(testToken.pos+1); + } + return true; } } } diff --git a/src/test/org/apache/lucene/index/TestTermdocPerf.java b/src/test/org/apache/lucene/index/TestTermdocPerf.java index f6ee71d5955..04b4d20b17a 100644 --- a/src/test/org/apache/lucene/index/TestTermdocPerf.java +++ b/src/test/org/apache/lucene/index/TestTermdocPerf.java @@ -17,18 +17,18 @@ package org.apache.lucene.index; */ -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.RAMDirectory; +import java.io.IOException; +import java.io.Reader; +import java.util.Random; + import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import 
org.apache.lucene.document.Field; - -import java.io.Reader; -import java.io.IOException; -import java.util.Random; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; /** * @version $Id$ @@ -36,15 +36,21 @@ import java.util.Random; class RepeatingTokenStream extends TokenStream { public int num; - Token t; + TermAttribute termAtt; + String value; public RepeatingTokenStream(String val) { - t = new Token(0,val.length()); - t.setTermBuffer(val); + this.value = val; + this.termAtt = (TermAttribute) addAttribute(TermAttribute.class); } - public Token next(final Token reusableToken) throws IOException { - return --num<0 ? null : (Token) t.clone(); + public boolean incrementToken() throws IOException { + num--; + if (num >= 0) { + termAtt.setTermBuffer(value); + return true; + } + return false; } } diff --git a/src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java b/src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java index 05684c65cd3..2f0253eaba8 100644 --- a/src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java +++ b/src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java @@ -17,17 +17,20 @@ package org.apache.lucene.queryParser; * limitations under the License. */ +import java.io.IOException; import java.io.Reader; -import org.apache.lucene.util.LuceneTestCase; - -import org.apache.lucene.search.Query; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.LowerCaseFilter; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.search.Query; +import org.apache.lucene.util.LuceneTestCase; /** * Test QueryParser's ability to deal with Analyzers that return more @@ -140,34 +143,49 @@ public class TestMultiAnalyzer extends LuceneTestCase { private final class TestFilter extends TokenFilter { - private Token prevToken; + private String prevType; + private int prevStartOffset; + private int prevEndOffset; + + TermAttribute termAtt; + PositionIncrementAttribute posIncrAtt; + OffsetAttribute offsetAtt; + TypeAttribute typeAtt; public TestFilter(TokenStream in) { super(in); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); } - public final Token next(final Token reusableToken) throws java.io.IOException { + public final boolean incrementToken() throws java.io.IOException { if (multiToken > 0) { - reusableToken.reinit("multi"+(multiToken+1), prevToken.startOffset(), prevToken.endOffset(), prevToken.type()); - reusableToken.setPositionIncrement(0); + termAtt.setTermBuffer("multi"+(multiToken+1)); + offsetAtt.setStartOffset(prevStartOffset); + offsetAtt.setEndOffset(prevEndOffset); + typeAtt.setType(prevType); + posIncrAtt.setPositionIncrement(0); multiToken--; - return reusableToken; + return true; } else { - Token nextToken = input.next(reusableToken); - if (nextToken == null) { - prevToken = 
null; - return null; + boolean next = input.incrementToken(); + if (next == false) { + return false; } - prevToken = (Token) nextToken.clone(); - String text = nextToken.term(); + prevType = typeAtt.type(); + prevStartOffset = offsetAtt.startOffset(); + prevEndOffset = offsetAtt.endOffset(); + String text = termAtt.term(); if (text.equals("triplemulti")) { multiToken = 2; - return nextToken; + return true; } else if (text.equals("multi")) { multiToken = 1; - return nextToken; + return true; } else { - return nextToken; + return true; } } } @@ -192,23 +210,28 @@ public class TestMultiAnalyzer extends LuceneTestCase { private final class TestPosIncrementFilter extends TokenFilter { + TermAttribute termAtt; + PositionIncrementAttribute posIncrAtt; + public TestPosIncrementFilter(TokenStream in) { super(in); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); } - public final Token next(final Token reusableToken) throws java.io.IOException { - for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) { - if (nextToken.term().equals("the")) { + public final boolean incrementToken () throws java.io.IOException { + while(input.incrementToken()) { + if (termAtt.term().equals("the")) { // stopword, do nothing - } else if (nextToken.term().equals("quick")) { - nextToken.setPositionIncrement(2); - return nextToken; + } else if (termAtt.term().equals("quick")) { + posIncrAtt.setPositionIncrement(2); + return true; } else { - nextToken.setPositionIncrement(1); - return nextToken; + posIncrAtt.setPositionIncrement(1); + return true; } } - return null; + return false; } } diff --git a/src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java b/src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java index 2cfa2ed8cc8..c8e334a7445 100644 --- a/src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java +++ b/src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java @@ -22,7 +22,6 @@ import java.util.HashMap; import java.util.Map; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; diff --git a/src/test/org/apache/lucene/queryParser/TestQueryParser.java b/src/test/org/apache/lucene/queryParser/TestQueryParser.java index f8ad3f0cb45..a3114740a9e 100644 --- a/src/test/org/apache/lucene/queryParser/TestQueryParser.java +++ b/src/test/org/apache/lucene/queryParser/TestQueryParser.java @@ -19,8 +19,8 @@ package org.apache.lucene.queryParser; import java.io.IOException; import java.io.Reader; -import java.text.DateFormat; import java.text.Collator; +import java.text.DateFormat; import java.util.Calendar; import java.util.Date; import java.util.Locale; @@ -31,11 +31,12 @@ import org.apache.lucene.analysis.LowerCaseTokenizer; import org.apache.lucene.analysis.SimpleAnalyzer; import org.apache.lucene.analysis.StopAnalyzer; import org.apache.lucene.analysis.StopFilter; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import 
org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.DateField; import org.apache.lucene.document.DateTools; import org.apache.lucene.document.Document; @@ -64,36 +65,47 @@ public class TestQueryParser extends LuceneTestCase { public static Analyzer qpAnalyzer = new QPTestAnalyzer(); public static class QPTestFilter extends TokenFilter { + TermAttribute termAtt; + OffsetAttribute offsetAtt; + /** * Filter which discards the token 'stop' and which expands the * token 'phrase' into 'phrase1 phrase2' */ public QPTestFilter(TokenStream in) { super(in); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); } boolean inPhrase = false; int savedStart = 0, savedEnd = 0; - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; + public boolean incrementToken() throws IOException { if (inPhrase) { inPhrase = false; - return reusableToken.reinit("phrase2", savedStart, savedEnd); + termAtt.setTermBuffer("phrase2"); + offsetAtt.setStartOffset(savedStart); + offsetAtt.setEndOffset(savedEnd); + return true; } else - for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) { - if (nextToken.term().equals("phrase")) { + while (input.incrementToken()) { + if (termAtt.term().equals("phrase")) { inPhrase = true; - savedStart = nextToken.startOffset(); - savedEnd = nextToken.endOffset(); - return nextToken.reinit("phrase1", savedStart, savedEnd); - } else if (!nextToken.term().equals("stop")) - return nextToken; + savedStart = offsetAtt.startOffset(); + savedEnd = offsetAtt.endOffset(); + termAtt.setTermBuffer("phrase1"); + offsetAtt.setStartOffset(savedStart); + offsetAtt.setEndOffset(savedEnd); + return true; + } else if (!termAtt.term().equals("stop")) + return true; } - return null; + return false; } } + public static class QPTestAnalyzer extends Analyzer { /** Filters LowerCaseTokenizer with StopFilter. */ diff --git a/src/test/org/apache/lucene/search/TestPositionIncrement.java b/src/test/org/apache/lucene/search/TestPositionIncrement.java index 205b3f4f245..9c303cf1a3b 100644 --- a/src/test/org/apache/lucene/search/TestPositionIncrement.java +++ b/src/test/org/apache/lucene/search/TestPositionIncrement.java @@ -17,14 +17,16 @@ package org.apache.lucene.search; * limitations under the License. 
*/ +import java.io.IOException; import java.io.Reader; -import java.io.StringReader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.StopFilter; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; @@ -49,14 +51,19 @@ public class TestPositionIncrement extends LuceneTestCase { private final int[] INCREMENTS = {1, 2, 1, 0, 1}; private int i = 0; - public Token next(final Token reusableToken) { - assert reusableToken != null; + PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class); + OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); + + public boolean incrementToken() { if (i == TOKENS.length) - return null; - reusableToken.reinit(TOKENS[i], i, i); - reusableToken.setPositionIncrement(INCREMENTS[i]); + return false; + termAtt.setTermBuffer(TOKENS[i]); + offsetAtt.setStartOffset(i); + offsetAtt.setEndOffset(i); + posIncrAtt.setPositionIncrement(INCREMENTS[i]); i++; - return reusableToken; + return true; } }; } @@ -196,18 +203,4 @@ public class TestPositionIncrement extends LuceneTestCase { StopFilter.setEnablePositionIncrementsDefault(dflt); } } - - /** - * Basic analyzer behavior should be to keep sequential terms in one - * increment from one another. 
- */ - public void testIncrementingPositions() throws Exception { - Analyzer analyzer = new WhitespaceAnalyzer(); - TokenStream ts = analyzer.tokenStream("field", - new StringReader("one two three four five")); - final Token reusableToken = new Token(); - for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) { - assertEquals(nextToken.term(), 1, nextToken.getPositionIncrement()); - } - } } diff --git a/src/test/org/apache/lucene/search/TestRangeQuery.java b/src/test/org/apache/lucene/search/TestRangeQuery.java index df304a5ee54..4a5e5908945 100644 --- a/src/test/org/apache/lucene/search/TestRangeQuery.java +++ b/src/test/org/apache/lucene/search/TestRangeQuery.java @@ -26,7 +26,7 @@ import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.LuceneTestCase; import java.io.IOException; @@ -236,23 +236,25 @@ public class TestRangeQuery extends LuceneTestCase { private static class SingleCharTokenizer extends Tokenizer { char[] buffer = new char[1]; boolean done; - + TermAttribute termAtt; + public SingleCharTokenizer(Reader r) { super(r); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } - public final Token next(final Token reusableToken) throws IOException { + public boolean incrementToken() throws IOException { int count = input.read(buffer); if (done) - return null; + return false; else { done = true; if (count == 1) { - reusableToken.termBuffer()[0] = buffer[0]; - reusableToken.setTermLength(1); + termAtt.termBuffer()[0] = buffer[0]; + termAtt.setTermLength(1); } else - reusableToken.setTermLength(0); - return reusableToken; + termAtt.setTermLength(0); + return true; } } diff --git a/src/test/org/apache/lucene/search/payloads/PayloadHelper.java b/src/test/org/apache/lucene/search/payloads/PayloadHelper.java index 4a998134e77..7565c3753f2 100644 --- a/src/test/org/apache/lucene/search/payloads/PayloadHelper.java +++ b/src/test/org/apache/lucene/search/payloads/PayloadHelper.java @@ -2,6 +2,7 @@ package org.apache.lucene.search.payloads; import org.apache.lucene.analysis.*; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.index.Payload; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.store.RAMDirectory; @@ -41,34 +42,36 @@ public class PayloadHelper { public class PayloadFilter extends TokenFilter { String fieldName; int numSeen = 0; - + PayloadAttribute payloadAtt; + public PayloadFilter(TokenStream input, String fieldName) { super(input); this.fieldName = fieldName; + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); } - public Token next() throws IOException { - Token result = input.next(); - if (result != null) { + public boolean incrementToken() throws IOException { + + if (input.incrementToken()) { if (fieldName.equals(FIELD)) { - result.setPayload(new Payload(payloadField)); + payloadAtt.setPayload(new Payload(payloadField)); } else if (fieldName.equals(MULTI_FIELD)) { if (numSeen % 2 == 0) { - result.setPayload(new Payload(payloadMultiField1)); + payloadAtt.setPayload(new Payload(payloadMultiField1)); } else { - result.setPayload(new Payload(payloadMultiField2)); + payloadAtt.setPayload(new Payload(payloadMultiField2)); } numSeen++; } - + return true; } - return result; + return 
false; } } diff --git a/src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java b/src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java index 736bbcac51d..2bc3a831e9e 100644 --- a/src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java +++ b/src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java @@ -21,9 +21,9 @@ import java.io.Reader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.LowerCaseTokenizer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; @@ -66,29 +66,32 @@ public class TestBoostingTermQuery extends LuceneTestCase { private class PayloadFilter extends TokenFilter { String fieldName; int numSeen = 0; - + + PayloadAttribute payloadAtt; + public PayloadFilter(TokenStream input, String fieldName) { super(input); this.fieldName = fieldName; + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); } - - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken != null) { + + public boolean incrementToken() throws IOException { + boolean hasNext = input.incrementToken(); + if (hasNext) { if (fieldName.equals("field")) { - nextToken.setPayload(new Payload(payloadField)); + payloadAtt.setPayload(new Payload(payloadField)); } else if (fieldName.equals("multiField")) { if (numSeen % 2 == 0) { - nextToken.setPayload(new Payload(payloadMultiField1)); + payloadAtt.setPayload(new Payload(payloadMultiField1)); } else { - nextToken.setPayload(new Payload(payloadMultiField2)); + payloadAtt.setPayload(new Payload(payloadMultiField2)); } numSeen++; } - + return true; + } else { + return false; } - return nextToken; } } diff --git a/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java b/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java index e0b5719fefb..91796a29c8f 100644 --- a/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java +++ b/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java @@ -27,9 +27,11 @@ import junit.framework.TestCase; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.LowerCaseTokenizer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; @@ -43,8 +45,9 @@ import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.payloads.PayloadHelper; import org.apache.lucene.search.payloads.PayloadSpanUtil; import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; -public class TestPayloadSpans extends TestCase { +public class TestPayloadSpans extends LuceneTestCase { private final static boolean DEBUG = false; private IndexSearcher searcher; private Similarity similarity = new DefaultSimilarity(); @@ -54,7 +57,8 @@ public class TestPayloadSpans extends TestCase { super(s); 
} - protected void setUp() throws IOException { + protected void setUp() throws Exception { + super.setUp(); PayloadHelper helper = new PayloadHelper(); searcher = helper.setUp(similarity, 1000); indexReader = searcher.getIndexReader(); @@ -345,6 +349,9 @@ public class TestPayloadSpans extends TestCase { Set entities = new HashSet(); Set nopayload = new HashSet(); int pos; + PayloadAttribute payloadAtt; + TermAttribute termAtt; + PositionIncrementAttribute posIncrAtt; public PayloadFilter(TokenStream input, String fieldName) { super(input); @@ -354,24 +361,26 @@ public class TestPayloadSpans extends TestCase { entities.add("one"); nopayload.add("nopayload"); nopayload.add("np"); - + termAtt = (TermAttribute) addAttribute(TermAttribute.class); + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); } - public Token next() throws IOException { - Token result = input.next(); - if (result != null) { - String token = new String(result.termBuffer(), 0, result.termLength()); + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String token = new String(termAtt.termBuffer(), 0, termAtt.termLength()); if (!nopayload.contains(token)) { if (entities.contains(token)) { - result.setPayload(new Payload((token + ":Entity:"+ pos ).getBytes())); + payloadAtt.setPayload(new Payload((token + ":Entity:"+ pos ).getBytes())); } else { - result.setPayload(new Payload((token + ":Noise:" + pos ).getBytes())); + payloadAtt.setPayload(new Payload((token + ":Noise:" + pos ).getBytes())); } } - pos += result.getPositionIncrement(); + pos += posIncrAtt.getPositionIncrement(); + return true; } - return result; + return false; } } } \ No newline at end of file diff --git a/src/test/org/apache/lucene/util/LuceneTestCase.java b/src/test/org/apache/lucene/util/LuceneTestCase.java index 8420853d5f2..2d3c46734d6 100644 --- a/src/test/org/apache/lucene/util/LuceneTestCase.java +++ b/src/test/org/apache/lucene/util/LuceneTestCase.java @@ -17,6 +17,7 @@ package org.apache.lucene.util; * limitations under the License. */ +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.index.ConcurrentMergeScheduler; import junit.framework.TestCase; @@ -42,6 +43,7 @@ public abstract class LuceneTestCase extends TestCase { protected void setUp() throws Exception { ConcurrentMergeScheduler.setTestMode(); + TokenStream.setUseNewAPIDefault(true); } protected void tearDown() throws Exception {
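
A minimal consumer-side sketch of the pattern the test changes above all share, assuming a build with this patch applied: the attributes are registered once via addAttribute(), then the stream is driven with incrementToken() and the attribute instances are read (or written) in place rather than passing Token objects around. The class and method names below are illustrative only and are not part of the patch.

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class NewApiConsumerSketch {
      /** Prints each term and its position increment from the given stream. */
      public static void dump(TokenStream stream) throws IOException {
        // Register the attributes up front; incrementToken() updates them
        // in place instead of returning a (possibly reused) Token instance.
        TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
        PositionIncrementAttribute posIncrAtt =
            (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);

        while (stream.incrementToken()) {
          System.out.println(termAtt.term() + " +" + posIncrAtt.getPositionIncrement());
        }
      }
    }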