From 31a5f0edcc474f739e8392eac7261b9483fc60bf Mon Sep 17 00:00:00 2001 From: Michael Busch Date: Fri, 24 Jul 2009 21:45:48 +0000 Subject: [PATCH] LUCENE-1693: Various improvements to the new TokenStream API. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@797665 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 22 +- .../lucene/analysis/ASCIIFoldingFilter.java | 26 +- .../lucene/analysis/CachingTokenFilter.java | 58 +-- .../apache/lucene/analysis/CharTokenizer.java | 49 +-- .../analysis/ISOLatin1AccentFilter.java | 30 +- .../lucene/analysis/KeywordTokenizer.java | 34 +- .../apache/lucene/analysis/LengthFilter.java | 20 - .../lucene/analysis/LowerCaseFilter.java | 16 - .../lucene/analysis/NumericTokenStream.java | 34 -- .../lucene/analysis/PorterStemFilter.java | 12 - .../apache/lucene/analysis/SinkTokenizer.java | 30 +- .../apache/lucene/analysis/StopFilter.java | 21 - .../lucene/analysis/TeeSinkTokenFilter.java | 206 ++++++++++ .../lucene/analysis/TeeTokenFilter.java | 36 +- .../org/apache/lucene/analysis/Token.java | 75 ++-- .../apache/lucene/analysis/TokenFilter.java | 16 +- .../apache/lucene/analysis/TokenStream.java | 363 ++++++++++------- .../apache/lucene/analysis/TokenWrapper.java | 163 ++++++++ .../org/apache/lucene/analysis/Tokenizer.java | 9 +- .../org/apache/lucene/analysis/package.html | 113 +++--- .../analysis/standard/StandardFilter.java | 35 -- .../analysis/standard/StandardTokenizer.java | 79 ++-- .../tokenattributes/FlagsAttribute.java | 43 +- .../tokenattributes/FlagsAttributeImpl.java | 82 ++++ .../tokenattributes/OffsetAttribute.java | 54 +-- .../tokenattributes/OffsetAttributeImpl.java | 91 +++++ .../tokenattributes/PayloadAttribute.java | 74 +--- .../tokenattributes/PayloadAttributeImpl.java | 101 +++++ .../PositionIncrementAttribute.java | 50 +-- .../PositionIncrementAttributeImpl.java | 102 +++++ .../tokenattributes/TermAttribute.java | 169 +------- .../tokenattributes/TermAttributeImpl.java | 241 ++++++++++++ .../tokenattributes/TypeAttribute.java | 52 +-- .../tokenattributes/TypeAttributeImpl.java | 79 ++++ .../lucene/index/DocInverterPerField.java | 30 +- .../lucene/index/DocInverterPerThread.java | 81 +--- .../lucene/queryParser/QueryParser.java | 149 +++---- .../apache/lucene/queryParser/QueryParser.jj | 153 +++----- .../apache/lucene/search/QueryTermVector.java | 21 +- .../org/apache/lucene/util/Attribute.java | 73 +--- .../org/apache/lucene/util/AttributeImpl.java | 123 ++++++ .../apache/lucene/util/AttributeSource.java | 370 ++++++++++++------ .../analysis/TestASCIIFoldingFilter.java | 168 ++++---- .../analysis/TestNumericTokenStream.java | 39 +- .../analysis/TestTeeSinkTokenFilter.java | 267 +++++++++++++ .../lucene/analysis/TestTeeTokenFilter.java | 128 +++--- .../analysis/TestTokenStreamBWComp.java | 311 +++++++++++++++ .../lucene/index/TestDocumentWriter.java | 4 +- .../apache/lucene/index/TestIndexWriter.java | 47 +-- .../TestMultiFieldQueryParser.java | 5 +- .../apache/lucene/util/LuceneTestCase.java | 1 - .../lucene/util/TestAttributeSource.java | 122 ++++++ 52 files changed, 2923 insertions(+), 1754 deletions(-) create mode 100644 src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java create mode 100644 src/java/org/apache/lucene/analysis/TokenWrapper.java create mode 100644 src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java create mode 100644 src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java create mode 100644 
src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttributeImpl.java
create mode 100644 src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
create mode 100644 src/java/org/apache/lucene/analysis/tokenattributes/TermAttributeImpl.java
create mode 100644 src/java/org/apache/lucene/analysis/tokenattributes/TypeAttributeImpl.java
create mode 100644 src/java/org/apache/lucene/util/AttributeImpl.java
create mode 100644 src/test/org/apache/lucene/analysis/TestTeeSinkTokenFilter.java
create mode 100644 src/test/org/apache/lucene/analysis/TestTokenStreamBWComp.java
create mode 100644 src/test/org/apache/lucene/util/TestAttributeSource.java

diff --git a/CHANGES.txt b/CHANGES.txt
index 06ccb105fa3..c6bed5eafab 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -64,6 +64,22 @@ Changes in backwards compatibility policy
    process. It is not recommended to implement it, but rather extend
    Searcher. (Shai Erera via Mike McCandless)
 
+ 4. LUCENE-1422, LUCENE-1693: The new TokenStream API (see below), which uses
+    Attributes, introduces backwards breaks in rare cases. We did our best to
+    make the transition as easy as possible: you should not have problems if
+    your tokenizers still implement next(Token) or next(); such calls are
+    automatically wrapped. The indexer and query parser consume tokens through
+    the new API via incrementToken() calls, and all core TokenStreams are
+    implemented with the new API. You can mix old- and new-style
+    TokenFilters/TokenStreams. Problems only occur if you have overridden
+    next(Token) or next() in one of the non-abstract core
+    TokenStreams/-Filters. These classes should normally be final, but some
+    of them are not. In that case, next(Token)/next() would never be called.
+    To fail early with a hard compile/runtime error, the next(Token)/next()
+    methods in these TokenStreams/-Filters were made final.
+    (Michael Busch, Uwe Schindler)
+
 Changes in runtime behavior
 
  1. LUCENE-1424: QueryParser now by default uses constant score query
@@ -156,14 +172,16 @@ API Changes
    and deprecate FSDirectory.getDirectory(). FSDirectory instances
    are not required to be singletons per path. (yonik)
 
-4. LUCENE-1422: New TokenStream API that uses a new class called
+4. LUCENE-1422, LUCENE-1693: New TokenStream API that uses a new class called
    AttributeSource instead of the now deprecated Token class. All attributes
    that the Token class had have been moved into separate classes:
    TermAttribute, OffsetAttribute, PositionIncrementAttribute,
    PayloadAttribute, TypeAttribute and FlagsAttribute. The new API is much
    more flexible; it allows combining the Attributes arbitrarily and also
    defining custom Attributes. The new API has the same performance
-   as the old next(Token) approach. (Michael Busch)
+   as the old next(Token) approach.
+   For conformance with this new API, Tee-/SinkTokenizer were deprecated
+   and replaced by the new TeeSinkTokenFilter. (Michael Busch, Uwe Schindler)
 
 5. LUCENE-1467: Add nextDoc() and next(int) methods to OpenBitSetIterator.
    These methods can be used to avoid additional calls to doc().
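To illustrate the migration path described in the CHANGES entry above, here is a minimal sketch (not part of this patch) of a TokenFilter written against the new attribute-based API. The class name and its lower-casing logic are invented for illustration; the TokenFilter/TokenStream/TermAttribute calls are the ones introduced by LUCENE-1422:

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    // Hypothetical filter, for illustration only: lower-cases each term
    // via incrementToken() instead of overriding next(Token)/next().
    public final class ExampleLowerCaseFilter extends TokenFilter {
      private final TermAttribute termAtt;

      public ExampleLowerCaseFilter(TokenStream input) {
        super(input);
        // retrieve the attribute reference once, at construction time
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
      }

      public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
          return false; // end of stream; the old API returned null here
        }
        final char[] buffer = termAtt.termBuffer();
        final int length = termAtt.termLength();
        for (int i = 0; i < length; i++) {
          buffer[i] = Character.toLowerCase(buffer[i]);
        }
        return true;
      }
    }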
diff --git a/src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java b/src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java index 1a54293674d..65e1d652fda 100644 --- a/src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java +++ b/src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java @@ -1,5 +1,8 @@ package org.apache.lucene.analysis; +import java.io.IOException; + +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.ArrayUtil; /** @@ -53,24 +56,21 @@ import org.apache.lucene.util.ArrayUtil; * accents from Latin1 characters. For example, 'à' will be replaced by * 'a'. */ -public class ASCIIFoldingFilter extends TokenFilter { +public final class ASCIIFoldingFilter extends TokenFilter { public ASCIIFoldingFilter(TokenStream input) { super(input); + termAtt = (TermAttribute) addAttribute(TermAttribute.class); } private char[] output = new char[512]; private int outputPos; + private TermAttribute termAtt; - public Token next(Token result) - throws java.io.IOException - { - result = input.next(result); - - if (result != null) - { - final char[] buffer = result.termBuffer(); - final int length = result.termLength(); + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + final char[] buffer = termAtt.termBuffer(); + final int length = termAtt.termLength(); // If no characters actually require rewriting then we // just return token as-is: @@ -79,13 +79,13 @@ public class ASCIIFoldingFilter extends TokenFilter { if (c >= '\u0080') { foldToASCII(buffer, length); - result.setTermBuffer(output, 0, outputPos); + termAtt.setTermBuffer(output, 0, outputPos); break; } } - return result; + return true; } else { - return null; + return false; } } diff --git a/src/java/org/apache/lucene/analysis/CachingTokenFilter.java b/src/java/org/apache/lucene/analysis/CachingTokenFilter.java index 3a4ab989fa5..c45b257b280 100644 --- a/src/java/org/apache/lucene/analysis/CachingTokenFilter.java +++ b/src/java/org/apache/lucene/analysis/CachingTokenFilter.java @@ -25,24 +25,35 @@ import java.util.List; import org.apache.lucene.util.AttributeSource; /** - * This class can be used if the Tokens of a TokenStream + * This class can be used if the token attributes of a TokenStream * are intended to be consumed more than once. It caches - * all Tokens locally in a List. + * all token attribute states locally in a List. * - * CachingTokenFilter implements the optional method + *

CachingTokenFilter implements the optional method * {@link TokenStream#reset()}, which repositions the * stream to the first Token. - * */ public class CachingTokenFilter extends TokenFilter { - private List cache; - private Iterator iterator; + private List cache = null; + private Iterator iterator = null; public CachingTokenFilter(TokenStream input) { super(input); } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws IOException { + return super.next(reusableToken); + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next() throws IOException { + return super.next(); + } - public boolean incrementToken() throws IOException { + public final boolean incrementToken() throws IOException { if (cache == null) { // fill cache lazily cache = new LinkedList(); @@ -51,34 +62,14 @@ public class CachingTokenFilter extends TokenFilter { } if (!iterator.hasNext()) { - // the cache is exhausted, return null + // the cache is exhausted, return false return false; } // Since the TokenFilter can be reset, the tokens need to be preserved as immutable. - AttributeSource state = (AttributeSource) iterator.next(); - state.restoreState(this); + restoreState((AttributeSource.State) iterator.next()); return true; } - - /** @deprecated */ - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - if (cache == null) { - // fill cache lazily - cache = new LinkedList(); - fillCache(reusableToken); - iterator = cache.iterator(); - } - - if (!iterator.hasNext()) { - // the cache is exhausted, return null - return null; - } - // Since the TokenFilter can be reset, the tokens need to be preserved as immutable. - Token nextToken = (Token) iterator.next(); - return (Token) nextToken.clone(); - } - + public void reset() throws IOException { if(cache != null) { iterator = cache.iterator(); @@ -90,12 +81,5 @@ public class CachingTokenFilter extends TokenFilter { cache.add(captureState()); } } - - /** @deprecated */ - private void fillCache(final Token reusableToken) throws IOException { - for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) { - cache.add(nextToken.clone()); - } - } } diff --git a/src/java/org/apache/lucene/analysis/CharTokenizer.java b/src/java/org/apache/lucene/analysis/CharTokenizer.java index fc934a0ee24..de270a88da1 100644 --- a/src/java/org/apache/lucene/analysis/CharTokenizer.java +++ b/src/java/org/apache/lucene/analysis/CharTokenizer.java @@ -94,49 +94,16 @@ public abstract class CharTokenizer extends Tokenizer { return true; } - /** @deprecated */ + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. 
*/ public final Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - reusableToken.clear(); - int length = 0; - int start = bufferIndex; - char[] buffer = reusableToken.termBuffer(); - while (true) { + return super.next(reusableToken); + } - if (bufferIndex >= dataLen) { - offset += dataLen; - dataLen = input.read(ioBuffer); - if (dataLen == -1) { - if (length > 0) - break; - else - return null; - } - bufferIndex = 0; - } - - final char c = ioBuffer[bufferIndex++]; - - if (isTokenChar(c)) { // if it's a token char - - if (length == 0) // start of token - start = offset + bufferIndex - 1; - else if (length == buffer.length) - buffer = reusableToken.resizeTermBuffer(1+length); - - buffer[length++] = normalize(c); // buffer it, normalized - - if (length == MAX_WORD_LEN) // buffer overflow! - break; - - } else if (length > 0) // at non-Letter w/ chars - break; // return 'em - } - - reusableToken.setTermLength(length); - reusableToken.setStartOffset(input.correctOffset(start)); - reusableToken.setEndOffset(input.correctOffset(start+length)); - return reusableToken; + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next() throws IOException { + return super.next(); } public void reset(Reader input) throws IOException { diff --git a/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java b/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java index 3dbc70dc284..119f54fdf1e 100644 --- a/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java +++ b/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java @@ -57,27 +57,17 @@ public class ISOLatin1AccentFilter extends TokenFilter { } else return false; } - - /** @deprecated */ + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ public final Token next(final Token reusableToken) throws java.io.IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken != null) { - final char[] buffer = nextToken.termBuffer(); - final int length = nextToken.termLength(); - // If no characters actually require rewriting then we - // just return token as-is: - for(int i=0;i= '\u00c0' && c <= '\uFB06') { - removeAccents(buffer, length); - nextToken.setTermBuffer(output, 0, outputPos); - break; - } - } - return nextToken; - } else - return null; + return super.next(reusableToken); + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. 
*/ + public final Token next() throws java.io.IOException { + return super.next(); } /** diff --git a/src/java/org/apache/lucene/analysis/KeywordTokenizer.java b/src/java/org/apache/lucene/analysis/KeywordTokenizer.java index 5f1cac85207..62443fbd4c3 100644 --- a/src/java/org/apache/lucene/analysis/KeywordTokenizer.java +++ b/src/java/org/apache/lucene/analysis/KeywordTokenizer.java @@ -45,7 +45,7 @@ public class KeywordTokenizer extends Tokenizer { offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); } - public boolean incrementToken() throws IOException { + public final boolean incrementToken() throws IOException { if (!done) { done = true; int upto = 0; @@ -65,28 +65,16 @@ public class KeywordTokenizer extends Tokenizer { return false; } - /** @deprecated */ - public Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - if (!done) { - done = true; - int upto = 0; - reusableToken.clear(); - char[] buffer = reusableToken.termBuffer(); - while (true) { - final int length = input.read(buffer, upto, buffer.length-upto); - if (length == -1) break; - upto += length; - if (upto == buffer.length) - buffer = reusableToken.resizeTermBuffer(1+buffer.length); - } - reusableToken.setTermLength(upto); - reusableToken.setStartOffset(input.correctOffset(0)); - reusableToken.setEndOffset(input.correctOffset(upto)); - - return reusableToken; - } - return null; + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next(final Token reusableToken) throws IOException { + return super.next(reusableToken); + } + + /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should + * not be overridden. Delegates to the backwards compatibility layer. */ + public final Token next() throws IOException { + return super.next(); } public void reset(Reader input) throws IOException { diff --git a/src/java/org/apache/lucene/analysis/LengthFilter.java b/src/java/org/apache/lucene/analysis/LengthFilter.java index b090cd23d9c..9894ec20bb1 100644 --- a/src/java/org/apache/lucene/analysis/LengthFilter.java +++ b/src/java/org/apache/lucene/analysis/LengthFilter.java @@ -61,24 +61,4 @@ public final class LengthFilter extends TokenFilter { // reached EOS -- return null return false; } - - /** - * Returns the next input Token whose term() is the right len - * @deprecated - */ - public final Token next(final Token reusableToken) throws IOException - { - assert reusableToken != null; - // return the first non-stop word found - for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) - { - int len = nextToken.termLength(); - if (len >= min && len <= max) { - return nextToken; - } - // note: else we ignore it but should we index each part of it? 
- } - // reached EOS -- return null - return null; - } } diff --git a/src/java/org/apache/lucene/analysis/LowerCaseFilter.java b/src/java/org/apache/lucene/analysis/LowerCaseFilter.java index 0c146e2a64d..9774726b158 100644 --- a/src/java/org/apache/lucene/analysis/LowerCaseFilter.java +++ b/src/java/org/apache/lucene/analysis/LowerCaseFilter.java @@ -46,20 +46,4 @@ public final class LowerCaseFilter extends TokenFilter { } else return false; } - - /** @deprecated */ - public final Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken != null) { - - final char[] buffer = nextToken.termBuffer(); - final int length = nextToken.termLength(); - for(int i=0;i= valSize) - return null; - - reusableToken.clear(); - - final char[] buffer; - switch (valSize) { - case 64: - buffer = reusableToken.resizeTermBuffer(NumericUtils.BUF_SIZE_LONG); - reusableToken.setTermLength(NumericUtils.longToPrefixCoded(value, shift, buffer)); - break; - - case 32: - buffer = reusableToken.resizeTermBuffer(NumericUtils.BUF_SIZE_INT); - reusableToken.setTermLength(NumericUtils.intToPrefixCoded((int) value, shift, buffer)); - break; - - default: - // should not happen - throw new IllegalArgumentException("valSize must be 32 or 64"); - } - - reusableToken.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC : TOKEN_TYPE_LOWER_PREC); - reusableToken.setPositionIncrement((shift == 0) ? 1 : 0); - shift += precisionStep; - return reusableToken; - } // @Override public String toString() { diff --git a/src/java/org/apache/lucene/analysis/PorterStemFilter.java b/src/java/org/apache/lucene/analysis/PorterStemFilter.java index 0b3d94e972c..59f638f628b 100644 --- a/src/java/org/apache/lucene/analysis/PorterStemFilter.java +++ b/src/java/org/apache/lucene/analysis/PorterStemFilter.java @@ -57,16 +57,4 @@ public final class PorterStemFilter extends TokenFilter { termAtt.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength()); return true; } - - /** @deprecated */ - public final Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - if (nextToken == null) - return null; - - if (stemmer.stem(nextToken.termBuffer(), 0, nextToken.termLength())) - nextToken.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength()); - return nextToken; - } } diff --git a/src/java/org/apache/lucene/analysis/SinkTokenizer.java b/src/java/org/apache/lucene/analysis/SinkTokenizer.java index 3de26d9d4e5..45dc5173383 100644 --- a/src/java/org/apache/lucene/analysis/SinkTokenizer.java +++ b/src/java/org/apache/lucene/analysis/SinkTokenizer.java @@ -22,19 +22,21 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.List; -import org.apache.lucene.util.AttributeSource; - /** * A SinkTokenizer can be used to cache Tokens for use in an Analyzer - * + *

+ * WARNING: {@link TeeTokenFilter} and {@link SinkTokenizer} only work with the old TokenStream API. + * If you switch to the new API, you need to use {@link TeeSinkTokenFilter} instead, which offers + * the same functionality. * @see TeeTokenFilter + * @deprecated Use {@link TeeSinkTokenFilter} instead * **/ public class SinkTokenizer extends Tokenizer { protected List/**/ lst = new ArrayList/**/(); protected Iterator/**/ iter; - + public SinkTokenizer(List/**/ input) { this.lst = input; if (this.lst == null) this.lst = new ArrayList/**/(); @@ -63,30 +65,10 @@ public class SinkTokenizer extends Tokenizer { return lst; } - /** - * Increments this stream to the next token out of the list of cached tokens - * @throws IOException - */ - public boolean incrementToken() throws IOException { - if (iter == null) iter = lst.iterator(); - // Since this TokenStream can be reset we have to maintain the tokens as immutable - if (iter.hasNext()) { - AttributeSource state = (AttributeSource) iter.next(); - state.restoreState(this); - return true; - } - return false; - } - - public void add(AttributeSource source) throws IOException { - lst.add(source); - } - /** * Returns the next token out of the list of cached tokens * @return The next {@link org.apache.lucene.analysis.Token} in the Sink. * @throws IOException - * @deprecated */ public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; diff --git a/src/java/org/apache/lucene/analysis/StopFilter.java b/src/java/org/apache/lucene/analysis/StopFilter.java index befd56a9d4c..5e98f2afaf0 100644 --- a/src/java/org/apache/lucene/analysis/StopFilter.java +++ b/src/java/org/apache/lucene/analysis/StopFilter.java @@ -234,27 +234,6 @@ public final class StopFilter extends TokenFilter { return false; } - /** - * Returns the next input Token whose term() is not a stop word. - * @deprecated - */ - public final Token next(final Token reusableToken) throws IOException { - assert reusableToken != null; - // return the first non-stop word found - int skippedPositions = 0; - for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) { - if (!stopWords.contains(nextToken.termBuffer(), 0, nextToken.termLength())) { - if (enablePositionIncrements) { - nextToken.setPositionIncrement(nextToken.getPositionIncrement() + skippedPositions); - } - return nextToken; - } - skippedPositions += nextToken.getPositionIncrement(); - } - // reached EOS -- return null - return null; - } - /** * @see #setEnablePositionIncrementsDefault(boolean). * @deprecated Please specify this when you create the StopFilter diff --git a/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java b/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java new file mode 100644 index 00000000000..2609b323555 --- /dev/null +++ b/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java @@ -0,0 +1,206 @@ +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.lang.ref.WeakReference; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Collections; + +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.AttributeSource; + +/** + * This TokenFilter provides the ability to set aside attribute states + * that have already been analyzed. This is useful in situations where multiple fields share + * many common analysis steps and then go their separate ways. + *

+ * It is also useful for doing things like entity extraction or proper noun analysis as + * part of the analysis workflow and saving off those tokens for use in another field. + * + *

+TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader1));
+TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream();
+TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream();
+
+TeeSinkTokenFilter source2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader2));
+source2.addSinkTokenStream(sink1);
+source2.addSinkTokenStream(sink2);
+
+TokenStream final1 = new LowerCaseFilter(source1);
+TokenStream final2 = source2;
+TokenStream final3 = new EntityDetect(sink1);
+TokenStream final4 = new URLDetect(sink2);
+
+d.add(new Field("f1", final1));
+d.add(new Field("f2", final2));
+d.add(new Field("f3", final3));
+d.add(new Field("f4", final4));
+ * 
+ * In this example, sink1 and sink2 will both get tokens from both
+ * reader1 and reader2 after the whitespace tokenizer,
+ * and we can further wrap any of these in extra analysis; more "sources" can be inserted if desired.
+ * It is important that the tees are consumed before the sinks (in the above example, the tees'
+ * field names must sort before the sinks' field names). If you are not sure which stream is
+ * consumed first, you can simply add another sink and then pass all tokens to the sinks at once
+ * using {@link #consumeAllTokens}. This TokenFilter is exhausted after that.
+ * To do so, change the example above to:
+ *
+...
+TokenStream final1 = new LowerCaseFilter(source1.newSinkTokenStream());
+TokenStream final2 = source2.newSinkTokenStream();
+sink1.consumeAllTokens();
+sink2.consumeAllTokens();
+...
+ * 
+ * In this case, the fields can be added in any order, because the sources are no longer used and all sinks are already filled.
+ *
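As a further illustration, the newSinkTokenStream(SinkFilter) variant defined below lets a sink store only a subset of the tee's tokens. The following sketch is not part of this patch; the variable names and the surrounding reader are assumptions, and the filter keeps only tokens whose TypeAttribute is "word":

    // Illustrative sketch: a sink that only stores states whose type is "word".
    TeeSinkTokenFilter.SinkFilter wordsOnly = new TeeSinkTokenFilter.SinkFilter() {
      public boolean accept(AttributeSource source) {
        TypeAttribute typeAtt = (TypeAttribute) source.addAttribute(TypeAttribute.class);
        return "word".equals(typeAtt.type());
      }
    };
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader));
    TeeSinkTokenFilter.SinkTokenStream words = tee.newSinkTokenStream(wordsOnly);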

Note, the EntityDetect and URLDetect TokenStreams are for the example only and do not currently exist in Lucene.
+ */
+public final class TeeSinkTokenFilter extends TokenFilter {
+  private final List sinks = new LinkedList();
+
+  /**
+   * Instantiates a new TeeSinkTokenFilter.
+   */
+  public TeeSinkTokenFilter(TokenStream input) {
+    super(input);
+  }
+
+  /**
+   * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream.
+   */
+  public SinkTokenStream newSinkTokenStream() {
+    return newSinkTokenStream(ACCEPT_ALL_FILTER);
+  }
+
+  /**
+   * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream
+   * that pass the supplied filter.
+   * @see SinkFilter
+   */
+  public SinkTokenStream newSinkTokenStream(SinkFilter filter) {
+    SinkTokenStream sink = new SinkTokenStream(this.cloneAttributes(), filter);
+    this.sinks.add(new WeakReference(sink));
+    return sink;
+  }
+
+  /**
+   * Adds a {@link SinkTokenStream} created by another TeeSinkTokenFilter
+   * to this one. The supplied stream will also receive all consumed tokens.
+   * This method can be used to pass tokens from two different tees to one sink.
+   */
+  public void addSinkTokenStream(final SinkTokenStream sink) {
+    // check that the sink has the correct factory
+    if (!this.getAttributeFactory().equals(sink.getAttributeFactory())) {
+      throw new IllegalArgumentException("The supplied sink is not compatible with this tee");
+    }
+    // add any attribute impls that the existing sink is still missing
+    for (Iterator it = this.cloneAttributes().getAttributeImplsIterator(); it.hasNext(); ) {
+      sink.addAttributeImpl((AttributeImpl) it.next());
+    }
+    this.sinks.add(new WeakReference(sink));
+  }
+
+  /**
+   * TeeSinkTokenFilter passes all tokens to the added sinks
+   * when itself is consumed. To be sure that all tokens from the input
+   * stream are passed to the sinks, you can call this method.
+   * This instance is exhausted after that, but all sinks are instantly available.
+ */ + public void consumeAllTokens() throws IOException { + while (incrementToken()); + } + + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + // capture state lazily - maybe no SinkFilter accepts this state + AttributeSource.State state = null; + for (Iterator it = sinks.iterator(); it.hasNext(); ) { + final SinkTokenStream sink = (SinkTokenStream) ((WeakReference) it.next()).get(); + if (sink != null) { + if (sink.accept(this)) { + if (state == null) { + state = this.captureState(); + } + sink.addState(state); + } + } + } + return true; + } + + return false; + } + + /** + * TODO: Missing Docs + */ + public static interface SinkFilter { + boolean accept(AttributeSource source); + } + + public static final class SinkTokenStream extends TokenStream { + private final List cachedStates = new LinkedList(); + private Iterator it = null; + private SinkFilter filter; + + private SinkTokenStream(AttributeSource source, SinkFilter filter) { + super(source); + this.filter = filter; + } + + private boolean accept(AttributeSource source) { + return filter.accept(source); + } + + private void addState(AttributeSource.State state) { + if (it != null) { + throw new IllegalStateException("The tee must be consumed before sinks are consumed."); + } + cachedStates.add(state); + } + + public final boolean incrementToken() throws IOException { + // lazy init the iterator + if (it == null) { + it = cachedStates.iterator(); + } + + if (!it.hasNext()) { + return false; + } + + AttributeSource.State state = (State) it.next(); + restoreState(state); + return true; + } + + public final void reset() { + it = cachedStates.iterator(); + } + } + + private static final SinkFilter ACCEPT_ALL_FILTER = new SinkFilter() { + public boolean accept(AttributeSource source) { + return true; + } + }; + +} diff --git a/src/java/org/apache/lucene/analysis/TeeTokenFilter.java b/src/java/org/apache/lucene/analysis/TeeTokenFilter.java index ec2606c1a00..3fbba94a486 100644 --- a/src/java/org/apache/lucene/analysis/TeeTokenFilter.java +++ b/src/java/org/apache/lucene/analysis/TeeTokenFilter.java @@ -18,7 +18,6 @@ package org.apache.lucene.analysis; import java.io.IOException; -import java.util.Iterator; /** @@ -30,8 +29,8 @@ import java.util.Iterator; * part of the analysis workflow and saving off those tokens for use in another field. * *

-SinkTokenizer sink1 = new SinkTokenizer(null);
-SinkTokenizer sink2 = new SinkTokenizer(null);
+SinkTokenizer sink1 = new SinkTokenizer();
+SinkTokenizer sink2 = new SinkTokenizer();
 
 TokenStream source1 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(reader1), sink1), sink2);
 TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(reader2), sink1), sink2);
@@ -46,14 +45,22 @@ d.add(new Field("f2", final2));
 d.add(new Field("f3", final3));
 d.add(new Field("f4", final4));
  * 
- * In this example, sink1 and sink2 will both get tokens from both reader1 and reader2 after whitespace tokenizer
- and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
- Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene
+ * In this example, sink1 and sink2 will both get tokens from both
+ * reader1 and reader2 after the whitespace tokenizer,
+ * and we can further wrap any of these in extra analysis; more "sources" can be inserted if desired.
+ * It is important that the tees are consumed before the sinks (in the above example, the tees'
+ * field names must sort before the sinks' field names).
+ * Note, the EntityDetect and URLDetect TokenStreams are for the example only and do not currently exist in Lucene

* - * See http://issues.apache.org/jira/browse/LUCENE-1058 + * See LUCENE-1058. + *

+ * WARNING: {@link TeeTokenFilter} and {@link SinkTokenizer} only work with the old TokenStream API. + * If you switch to the new API, you need to use {@link TeeSinkTokenFilter} instead, which offers + * the same functionality. + * @see SinkTokenizer - * + * @deprecated Use {@link TeeSinkTokenFilter} instead **/ public class TeeTokenFilter extends TokenFilter { SinkTokenizer sink; @@ -61,21 +68,8 @@ public class TeeTokenFilter extends TokenFilter { public TeeTokenFilter(TokenStream input, SinkTokenizer sink) { super(input); this.sink = sink; - Iterator it = getAttributesIterator(); - while (it.hasNext()) { - sink.addAttribute(it.next().getClass()); - } } - public boolean incrementToken() throws IOException { - if (input.incrementToken()) { - sink.add(captureState()); - return true; - } - return false; - } - - /** @deprecated */ public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; Token nextToken = input.next(reusableToken); diff --git a/src/java/org/apache/lucene/analysis/Token.java b/src/java/org/apache/lucene/analysis/Token.java index f5d48516898..d5a98834a7b 100644 --- a/src/java/org/apache/lucene/analysis/Token.java +++ b/src/java/org/apache/lucene/analysis/Token.java @@ -17,14 +17,19 @@ package org.apache.lucene.analysis; * limitations under the License. */ +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.index.Payload; import org.apache.lucene.index.TermPositions; // for javadoc import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeImpl; /** - This class is now deprecated and a new TokenStream API was introduced with Lucene 2.9. - See Javadocs in {@link TokenStream} for further details. -

A Token is an occurrence of a term from the text of a field. It consists of a term's text, the start and end offset of the term in the text of the field, and a type string. @@ -44,11 +49,13 @@ import org.apache.lucene.util.ArrayUtil; {@link TermPositions#getPayload(byte[], int)} to retrieve the payloads from the index.

-

- WARNING: The status of the Payloads feature is experimental. - The APIs introduced here might change in the future and will not be - supported anymore in such a case. - + +

NOTE: As of 2.9, Token implements all {@link Attribute} interfaces
+  that are part of core Lucene and can be found in the {@code tokenattributes} subpackage.
+  Even though it is no longer necessary to use Token, with the new TokenStream API it can
+  be used as a convenience class that implements all {@link Attribute}s, which is especially
+  useful for easily switching from the old to the new TokenStream API.
+

NOTE: As of 2.3, Token stores the term text @@ -118,10 +125,10 @@ import org.apache.lucene.util.ArrayUtil;

@see org.apache.lucene.index.Payload
-  @deprecated A new TokenStream API was introduced with Lucene 2.9.
-  See javadocs in {@link TokenStream} for further details.
 */
-public class Token implements Cloneable {
+public class Token extends AttributeImpl
+                   implements Cloneable, TermAttribute, TypeAttribute, PositionIncrementAttribute,
+                              FlagsAttribute, OffsetAttribute, PayloadAttribute {
 
   public static final String DEFAULT_TYPE = "word";
 
@@ -134,7 +141,7 @@ public class Token implements Cloneable {
   /**
    * Characters for the term text.
    * @deprecated This will be made private. Instead, use:
-   * {@link termBuffer()},
+   * {@link #termBuffer()},
    * {@link #setTermBuffer(char[], int, int)},
    * {@link #setTermBuffer(String)}, or
    * {@link #setTermBuffer(String, int, int)}
@@ -144,28 +151,28 @@ public class Token implements Cloneable {
   /**
    * Length of term text in the buffer.
    * @deprecated This will be made private. Instead, use:
-   * {@link termLength()}, or @{link setTermLength(int)}.
+   * {@link #termLength()}, or {@link #setTermLength(int)}.
    */
   int termLength;
 
   /**
    * Start in source text.
    * @deprecated This will be made private. Instead, use:
-   * {@link startOffset()}, or @{link setStartOffset(int)}.
+   * {@link #startOffset()}, or {@link #setStartOffset(int)}.
   */
   int startOffset;
 
   /**
    * End in source text.
    * @deprecated This will be made private. Instead, use:
-   * {@link endOffset()}, or @{link setEndOffset(int)}.
+   * {@link #endOffset()}, or {@link #setEndOffset(int)}.
   */
   int endOffset;
 
   /**
    * The lexical type of the token.
    * @deprecated This will be made private. Instead, use:
-   * {@link type()}, or @{link setType(String)}.
+   * {@link #type()}, or {@link #setType(String)}.
   */
   String type = DEFAULT_TYPE;
 
@@ -173,13 +180,13 @@
 
   /**
    * @deprecated This will be made private. Instead, use:
-   * {@link getPayload()}, or @{link setPayload(Payload)}.
+   * {@link #getPayload()}, or {@link #setPayload(Payload)}.
   */
   Payload payload;
 
   /**
    * @deprecated This will be made private. Instead, use:
-   * {@link getPositionIncrement()}, or @{link setPositionIncrement(String)}.
+   * {@link #getPositionIncrement()}, or {@link #setPositionIncrement(int)}.
   */
   int positionIncrement = 1;
 
@@ -561,6 +568,13 @@ public class Token implements Cloneable {
   public void setEndOffset(int offset) {
     this.endOffset = offset;
   }
+
+  /** Set the starting and ending offset.
+    @see #startOffset()
+    @see #endOffset() */
+  public void setOffset(int startOffset, int endOffset) {
+    this.startOffset = startOffset;
+    this.endOffset = endOffset;
+  }
 
   /** Returns this Token's lexical type.  Defaults to "word".
*/ public final String type() { @@ -640,19 +654,15 @@ public class Token implements Cloneable { } public Object clone() { - try { - Token t = (Token)super.clone(); - // Do a deep clone - if (termBuffer != null) { - t.termBuffer = (char[]) termBuffer.clone(); - } - if (payload != null) { - t.setPayload((Payload) payload.clone()); - } - return t; - } catch (CloneNotSupportedException e) { - throw new RuntimeException(e); // shouldn't happen + Token t = (Token)super.clone(); + // Do a deep clone + if (termBuffer != null) { + t.termBuffer = (char[]) termBuffer.clone(); } + if (payload != null) { + t.setPayload((Payload) payload.clone()); + } + return t; } /** Makes a clone, but replaces the term buffer & @@ -862,4 +872,9 @@ public class Token implements Cloneable { type = prototype.type; payload = prototype.payload; } + + public void copyTo(AttributeImpl target) { + Token to = (Token) target; + to.reinit(this); + } } diff --git a/src/java/org/apache/lucene/analysis/TokenFilter.java b/src/java/org/apache/lucene/analysis/TokenFilter.java index 2165d3edcec..24b22f1adc0 100644 --- a/src/java/org/apache/lucene/analysis/TokenFilter.java +++ b/src/java/org/apache/lucene/analysis/TokenFilter.java @@ -42,7 +42,7 @@ public abstract class TokenFilter extends TokenStream { super(input); this.input = input; } - + /** Close the input TokenStream. */ public void close() throws IOException { input.close(); @@ -50,20 +50,6 @@ public abstract class TokenFilter extends TokenStream { /** Reset the filter as well as the input TokenStream. */ public void reset() throws IOException { - super.reset(); input.reset(); } - - public boolean useNewAPI() { - return input.useNewAPI(); - } - - /** - * Sets whether or not to use the new TokenStream API. Settings this - * will apply to this Filter and all TokenStream/Filters upstream. - */ - public void setUseNewAPI(boolean use) { - input.setUseNewAPI(use); - } - } diff --git a/src/java/org/apache/lucene/analysis/TokenStream.java b/src/java/org/apache/lucene/analysis/TokenStream.java index 8fff742500f..e51e42477b7 100644 --- a/src/java/org/apache/lucene/analysis/TokenStream.java +++ b/src/java/org/apache/lucene/analysis/TokenStream.java @@ -18,10 +18,15 @@ package org.apache.lucene.analysis; */ import java.io.IOException; -import java.util.Iterator; -import org.apache.lucene.index.Payload; +import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeImpl; import org.apache.lucene.util.AttributeSource; /** A TokenStream enumerates the sequence of tokens, either from @@ -36,13 +41,13 @@ import org.apache.lucene.util.AttributeSource; A new TokenStream API is introduced with Lucene 2.9. Since 2.9 Token is deprecated and the preferred way to store - the information of a token is to use {@link Attribute}s. + the information of a token is to use {@link AttributeImpl}s.

For that reason TokenStream extends {@link AttributeSource} - now. Note that only one instance per {@link Attribute} is + now. Note that only one instance per {@link AttributeImpl} is created and reused for every token. This approach reduces object creations and allows local caching of references to - the {@link Attribute}s. See {@link #incrementToken()} for further details. + the {@link AttributeImpl}s. See {@link #incrementToken()} for further details.

The workflow of the new TokenStream API is as follows:

    @@ -60,19 +65,8 @@ import org.apache.lucene.util.AttributeSource;

    Sometimes it is desirable to capture a current state of a TokenStream, e. g. for buffering purposes (see {@link CachingTokenFilter}, - {@link TeeTokenFilter}/{@link SinkTokenizer}). For this usecase - {@link AttributeSource#captureState()} and {@link AttributeSource#restoreState(AttributeSource)} can be used. -

    - NOTE: In order to enable the new API the method - {@link #useNewAPI()} has to be called with useNewAPI=true. - Otherwise the deprecated method {@link #next(Token)} will - be used by Lucene consumers (indexer and queryparser) to - consume the tokens. {@link #next(Token)} will be removed - in Lucene 3.0. -

- NOTE: To use the old API subclasses must override {@link #next(Token)}.
- It's also OK to instead override {@link #next()} but that
- method is slower compared to {@link #next(Token)}.
+ {@link TeeSinkTokenFilter}). For this use case
+ {@link AttributeSource#captureState} and {@link AttributeSource#restoreState} can be used.
 *
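To make the captureState/restoreState idiom above concrete, here is a small sketch (not part of this patch; the filter and its token-duplicating behavior are invented for the example) of a filter that emits every input token twice:

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.util.AttributeSource;

    // Hypothetical filter: emits each token twice by capturing the current
    // attribute state and replaying it on the following call.
    public final class RepeatTokenFilter extends TokenFilter {
      private final PositionIncrementAttribute posIncrAtt;
      private AttributeSource.State pending = null;

      public RepeatTokenFilter(TokenStream input) {
        super(input);
        posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
      }

      public boolean incrementToken() throws IOException {
        if (pending != null) {
          restoreState(pending);              // replay the captured state
          posIncrAtt.setPositionIncrement(0); // the copy occupies the same position
          pending = null;
          return true;
        }
        if (!input.incrementToken()) {
          return false;
        }
        pending = captureState(); // remember this token for the next call
        return true;
      }
    }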

    * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. * The APIs introduced in these classes with Lucene 2.9 might change in the future. @@ -80,110 +74,203 @@ import org.apache.lucene.util.AttributeSource; */ public abstract class TokenStream extends AttributeSource { - private static boolean useNewAPIDefault = false; - private boolean useNewAPI = useNewAPIDefault; + + /** @deprecated Remove this when old API is removed! */ + private static final AttributeFactory DEFAULT_TOKEN_WRAPPER_ATTRIBUTE_FACTORY + = new TokenWrapperAttributeFactory(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); - protected TokenStream() { - super(); + /** @deprecated Remove this when old API is removed! */ + private static final Class[] METHOD_NO_PARAMS = new Class[0]; + + /** @deprecated Remove this when old API is removed! */ + private static final Class[] METHOD_TOKEN_PARAM = new Class[]{Token.class}; + + /** @deprecated Remove this when old API is removed! */ + private final TokenWrapper tokenWrapper; + + /** @deprecated Remove this when old API is removed! */ + private static boolean onlyUseNewAPI = false; + + /** @deprecated Remove this when old API is removed! */ + private final boolean + hasIncrementToken = isMethodOverridden("incrementToken", METHOD_NO_PARAMS), + hasReusableNext = onlyUseNewAPI ? false : isMethodOverridden("next", METHOD_TOKEN_PARAM), + hasNext = onlyUseNewAPI ? false : isMethodOverridden("next", METHOD_NO_PARAMS); + + /** @deprecated Remove this when old API is removed! */ + private boolean isMethodOverridden(String name, Class[] params) { + try { + return this.getClass().getMethod(name, params).getDeclaringClass() != TokenStream.class; + } catch (NoSuchMethodException e) { + // should not happen + throw new RuntimeException(e); + } } + /** @deprecated Remove this when old API is removed! */ + private static final class TokenWrapperAttributeFactory extends AttributeFactory { + private final AttributeFactory delegate; + + private TokenWrapperAttributeFactory(AttributeFactory delegate) { + this.delegate = delegate; + } + + public AttributeImpl createAttributeInstance(Class attClass) { + return attClass.isAssignableFrom(TokenWrapper.class) + ? new TokenWrapper() + : delegate.createAttributeInstance(attClass); + } + + // this is needed for TeeSinkTokenStream's check for compatibility of AttributeSource, + // so two TokenStreams using old API have the same AttributeFactory wrapped by this one. + public boolean equals(Object other) { + if (this == other) return true; + if (other instanceof TokenWrapperAttributeFactory) { + final TokenWrapperAttributeFactory af = (TokenWrapperAttributeFactory) other; + return this.delegate.equals(af.delegate); + } + return false; + } + } + + /** + * A TokenStream using the default attribute factory. + */ + protected TokenStream() { + super(onlyUseNewAPI + ? AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY + : TokenStream.DEFAULT_TOKEN_WRAPPER_ATTRIBUTE_FACTORY + ); + tokenWrapper = initTokenWrapper(null); + check(); + } + + /** + * A TokenStream that uses the same attributes as the supplied one. + */ protected TokenStream(AttributeSource input) { super(input); - } - - /** - * Returns whether or not the new TokenStream APIs are used - * by default. - * (see {@link #incrementToken()}, {@link AttributeSource}). - */ - public static boolean useNewAPIDefault() { - return useNewAPIDefault; - } - - /** - * Use this API to enable or disable the new TokenStream API. - * by default. 
Can be overridden by calling {@link #setUseNewAPI(boolean)}. - * (see {@link #incrementToken()}, {@link AttributeSource}). - *

    - * If set to true, the indexer will call {@link #incrementToken()} - * to consume Tokens from this stream. - *

    - * If set to false, the indexer will call {@link #next(Token)} - * instead. - */ - public static void setUseNewAPIDefault(boolean use) { - useNewAPIDefault = use; + tokenWrapper = initTokenWrapper(input); + check(); } /** - * Returns whether or not the new TokenStream APIs are used - * for this stream. - * (see {@link #incrementToken()}, {@link AttributeSource}). + * A TokenStream using the supplied AttributeFactory for creating new {@link Attribute} instances. */ - public boolean useNewAPI() { - return useNewAPI; + protected TokenStream(AttributeFactory factory) { + super(onlyUseNewAPI + ? factory + : new TokenWrapperAttributeFactory(factory) + ); + tokenWrapper = initTokenWrapper(null); + check(); } - /** - * Use this API to enable or disable the new TokenStream API - * for this stream. Overrides {@link #setUseNewAPIDefault(boolean)}. - * (see {@link #incrementToken()}, {@link AttributeSource}). - *

    - * If set to true, the indexer will call {@link #incrementToken()} - * to consume Tokens from this stream. - *

    - * If set to false, the indexer will call {@link #next(Token)} - * instead. - *

    - * NOTE: All streams and filters in one chain must use the - * same API. - */ - public void setUseNewAPI(boolean use) { - useNewAPI = use; - } - - /** - * Consumers (e. g. the indexer) use this method to advance the stream - * to the next token. Implementing classes must implement this method - * and update the appropriate {@link Attribute}s with content of the - * next token. - *

    - * This method is called for every token of a document, so an efficient - * implementation is crucial for good performance. To avoid calls to - * {@link #addAttribute(Class)} and {@link #getAttribute(Class)} and - * downcasts, references to all {@link Attribute}s that this stream uses - * should be retrieved during instantiation. - *

    - * To make sure that filters and consumers know which attributes are available - * the attributes must be added during instantiation. Filters and - * consumers are not required to check for availability of attributes in {@link #incrementToken()}. - * - * @return false for end of stream; true otherwise - * - *

    - * Note that this method will be defined abstract in Lucene 3.0. - */ - public boolean incrementToken() throws IOException { - // subclasses must implement this method; will be made abstract in Lucene 3.0 - return false; - } - - /** Returns the next token in the stream, or null at EOS. - * @deprecated The returned Token is a "full private copy" (not - * re-used across calls to next()) but will be slower - * than calling {@link #next(Token)} instead.. */ - public Token next() throws IOException { - final Token reusableToken = new Token(); - Token nextToken = next(reusableToken); - - if (nextToken != null) { - Payload p = nextToken.getPayload(); - if (p != null) { - nextToken.setPayload((Payload) p.clone()); + /** @deprecated Remove this when old API is removed! */ + private TokenWrapper initTokenWrapper(AttributeSource input) { + if (onlyUseNewAPI) { + // no wrapper needed + return null; + } else { + // if possible get the wrapper from the filter's input stream + if (input instanceof TokenStream && ((TokenStream) input).tokenWrapper != null) { + return ((TokenStream) input).tokenWrapper; + } + // check that all attributes are implemented by the same TokenWrapper instance + final AttributeImpl att = addAttribute(TermAttribute.class); + if (att instanceof TokenWrapper && + addAttribute(TypeAttribute.class) == att && + addAttribute(PositionIncrementAttribute.class) == att && + addAttribute(FlagsAttribute.class) == att && + addAttribute(OffsetAttribute.class) == att && + addAttribute(PayloadAttribute.class) == att + ) { + return (TokenWrapper) att; + } else { + throw new UnsupportedOperationException( + "If onlyUseNewAPI is disabled, all basic Attributes must be implemented by the internal class "+ + "TokenWrapper. Please make sure, that all TokenStreams/TokenFilters in this chain have been "+ + "instantiated with this flag disabled and do not add any custom instances for the basic Attributes!" + ); } } + } - return nextToken; + /** @deprecated Remove this when old API is removed! */ + private void check() { + if (onlyUseNewAPI && !hasIncrementToken) { + throw new UnsupportedOperationException(getClass().getName()+" does not implement incrementToken() which is needed for onlyUseNewAPI."); + } + + // a TokenStream subclass must at least implement one of the methods! + if (!(hasIncrementToken || hasNext || hasReusableNext)) { + throw new UnsupportedOperationException(getClass().getName()+" does not implement any of incrementToken(), next(Token), next()."); + } + } + + /** + * For extra performance you can globally enable the new {@link #incrementToken} + * API using {@link Attribute}s. There will be a small, but in most cases neglectible performance + * increase by enabling this, but it only works if all TokenStreams and -Filters + * use the new API and implement {@link #incrementToken}. This setting can only be enabled + * globally. + *

This setting only affects TokenStreams instantiated after this call. All TokenStreams
+ * already created keep the setting that was in effect when they were created.
+ *

All core analyzers are compatible with this setting; if your own
+ * TokenStreams/-Filters are also compatible, you can enable this.
+ *

When enabled, tokenization may throw {@link UnsupportedOperationException}s
+ * if the whole tokenizer chain is not compatible.
+ *

The default is false, so the fallback to the old API remains available.
+ * @deprecated This setting will be true by default in Lucene 3.0,
+ * when {@link #incrementToken} is abstract and must always be implemented.
+ */
+ public static void setOnlyUseNewAPI(boolean onlyUseNewAPI) {
+   TokenStream.onlyUseNewAPI = onlyUseNewAPI;
+ }
+
+ /** Returns true, if only the new API is used.
+ * @see #setOnlyUseNewAPI
+ * @deprecated This setting will be true by default in Lucene 3.0,
+ * when {@link #incrementToken} is abstract and must always be implemented.
+ */
+ public static boolean getOnlyUseNewAPI() {
+   return onlyUseNewAPI;
+ }
+
+ /**
+ * Consumers (e.g. the indexer) use this method to advance the stream
+ * to the next token. Implementing classes must implement this method
+ * and update the appropriate {@link AttributeImpl}s with the content of the
+ * next token.
+ *

    + * This method is called for every token of a document, so an efficient + * implementation is crucial for good performance. To avoid calls to + * {@link #addAttribute(Class)} and {@link #getAttribute(Class)} and + * downcasts, references to all {@link AttributeImpl}s that this stream uses + * should be retrieved during instantiation. + *

+ * To make sure that filters and consumers know which attributes are available,
+ * the attributes must be added during instantiation. Filters and
+ * consumers are not required to check for availability of attributes in {@link #incrementToken()}.
+ *
+ * @return false for end of stream; true otherwise
+ *
+ *

    + * Note that this method will be defined abstract in Lucene 3.0. + */ + public boolean incrementToken() throws IOException { + assert !onlyUseNewAPI && tokenWrapper != null; + + final Token token; + if (hasReusableNext) { + token = next(tokenWrapper.delegate); + } else { + assert hasNext; + token = next(); + } + if (token == null) return false; + tokenWrapper.delegate = token; + return true; } /** Returns the next token in the stream, or null at EOS. @@ -215,12 +302,46 @@ public abstract class TokenStream extends AttributeSource { * good idea to assert that it is not null.) * @return next token in the stream or null if end-of-stream was hit * @deprecated The new {@link #incrementToken()} and {@link AttributeSource} - * APIs should be used instead. See also {@link #useNewAPI()}. + * APIs should be used instead. */ public Token next(final Token reusableToken) throws IOException { - // We don't actually use inputToken, but still add this assert assert reusableToken != null; - return next(); + + if (onlyUseNewAPI) + throw new UnsupportedOperationException("This TokenStream only supports the new Attributes API."); + + if (hasIncrementToken) { + tokenWrapper.delegate = reusableToken; + return incrementToken() ? tokenWrapper.delegate : null; + } else { + assert hasNext; + final Token token = next(); + if (token == null) return null; + tokenWrapper.delegate = token; + return token; + } + } + + /** Returns the next token in the stream, or null at EOS. + * @deprecated The returned Token is a "full private copy" (not + * re-used across calls to next()) but will be slower + * than calling {@link #next(Token)} or using the new + * {@link #incrementToken()} method with the new + * {@link AttributeSource} API. + */ + public Token next() throws IOException { + if (onlyUseNewAPI) + throw new UnsupportedOperationException("This TokenStream only supports the new Attributes API."); + + if (hasIncrementToken) { + return incrementToken() ? ((Token) tokenWrapper.delegate.clone()) : null; + } else { + assert hasReusableNext; + final Token token = next(tokenWrapper.delegate); + if (token == null) return null; + tokenWrapper.delegate = token; + return (Token) token.clone(); + } } /** Resets this stream to the beginning. This is an @@ -240,24 +361,4 @@ public abstract class TokenStream extends AttributeSource { /** Releases resources associated with this stream. */ public void close() throws IOException {} - public String toString() { - StringBuffer sb = new StringBuffer(); - sb.append('('); - - if (hasAttributes()) { - // TODO Java 1.5 - //Iterator it = attributes.values().iterator(); - Iterator it = getAttributesIterator(); - if (it.hasNext()) { - sb.append(it.next().toString()); - } - while (it.hasNext()) { - sb.append(','); - sb.append(it.next().toString()); - } - } - sb.append(')'); - return sb.toString(); - } - } diff --git a/src/java/org/apache/lucene/analysis/TokenWrapper.java b/src/java/org/apache/lucene/analysis/TokenWrapper.java new file mode 100644 index 00000000000..e770a581394 --- /dev/null +++ b/src/java/org/apache/lucene/analysis/TokenWrapper.java @@ -0,0 +1,163 @@ +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.index.Payload; +import org.apache.lucene.util.AttributeImpl; + +/** + * This class wraps a Token and supplies a single attribute instance + * where the delegate token can be replaced. + * @deprecated Will be removed, when old TokenStream API is removed. + */ +final class TokenWrapper extends AttributeImpl + implements Cloneable, TermAttribute, TypeAttribute, PositionIncrementAttribute, + FlagsAttribute, OffsetAttribute, PayloadAttribute { + + Token delegate; + + TokenWrapper() { + this(new Token()); + } + + TokenWrapper(Token delegate) { + this.delegate = delegate; + } + + // TermAttribute: + + public String term() { + return delegate.term(); + } + + public void setTermBuffer(char[] buffer, int offset, int length) { + delegate.setTermBuffer(buffer, offset, length); + } + + public void setTermBuffer(String buffer) { + delegate.setTermBuffer(buffer); + } + + public void setTermBuffer(String buffer, int offset, int length) { + delegate.setTermBuffer(buffer, offset, length); + } + + public char[] termBuffer() { + return delegate.termBuffer(); + } + + public char[] resizeTermBuffer(int newSize) { + return delegate.resizeTermBuffer(newSize); + } + + public int termLength() { + return delegate.termLength(); + } + + public void setTermLength(int length) { + delegate.setTermLength(length); + } + + // TypeAttribute: + + public String type() { + return delegate.type(); + } + + public void setType(String type) { + delegate.setType(type); + } + + public void setPositionIncrement(int positionIncrement) { + delegate.setPositionIncrement(positionIncrement); + } + + public int getPositionIncrement() { + return delegate.getPositionIncrement(); + } + + // FlagsAttribute + + public int getFlags() { + return delegate.getFlags(); + } + + public void setFlags(int flags) { + delegate.setFlags(flags); + } + + // OffsetAttribute + + public int startOffset() { + return delegate.startOffset(); + } + + public void setOffset(int startOffset, int endOffset) { + delegate.setOffset(startOffset, endOffset); + } + + public int endOffset() { + return delegate.endOffset(); + } + + // PayloadAttribute + public Payload getPayload() { + return delegate.getPayload(); + } + + public void setPayload(Payload payload) { + delegate.setPayload(payload); + } + + // TokenAttribute + + public void clear() { + delegate.clear(); + } + + // AttributeImpl + + public String toString() { + return delegate.toString(); + } + + public int hashCode() { + return delegate.hashCode(); + } + + public boolean equals(Object other) { + if (other instanceof TokenWrapper) { + return ((TokenWrapper) other).delegate.equals(this.delegate); + } + return false; + } + + public Object clone() { + return new 
TokenWrapper((Token) delegate.clone()); + } + + public void copyTo(AttributeImpl target) { + ((TokenWrapper) target).delegate = (Token) this.delegate.clone(); + } +} diff --git a/src/java/org/apache/lucene/analysis/Tokenizer.java b/src/java/org/apache/lucene/analysis/Tokenizer.java index e2525cf75ec..c77aca62707 100644 --- a/src/java/org/apache/lucene/analysis/Tokenizer.java +++ b/src/java/org/apache/lucene/analysis/Tokenizer.java @@ -24,17 +24,10 @@ import java.io.IOException;

    This is an abstract class.

    - NOTE: In order to enable the new API the method - {@link #useNewAPI()} has to be called with useNewAPI=true. - Otherwise the deprecated method {@link #next(Token)} will - be used by Lucene consumers (indexer and queryparser) to - consume the tokens. {@link #next(Token)} will be removed - in Lucene 3.0. -

NOTE: To use the old API, subclasses must override {@link #next(Token)}. It's also OK to instead override {@link #next()}, but that method is slower compared to {@link #next(Token)}. -

    +

    NOTE: subclasses overriding {@link #next(Token)} must call {@link Token#clear()}. *
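A minimal sketch of the old-API pattern these NOTEs describe (not part of this patch; the SingleWordTokenizer class and its one-token behavior are made up for illustration):

  public final class SingleWordTokenizer extends Tokenizer {
    private final String word;
    private boolean done = false;

    public SingleWordTokenizer(String word) {
      this.word = word;
    }

    public Token next(final Token reusableToken) throws IOException {
      assert reusableToken != null;
      if (done) return null;
      done = true;
      reusableToken.clear();             // mandatory, see the NOTE above
      reusableToken.setTermBuffer(word); // the slower next() is inherited
      return reusableToken;
    }

    public void reset() throws IOException {
      done = false;
    }
  }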

diff --git a/src/java/org/apache/lucene/analysis/package.html b/src/java/org/apache/lucene/analysis/package.html index 7e683e8c2c9..641a47f6f01 100644 --- a/src/java/org/apache/lucene/analysis/package.html +++ b/src/java/org/apache/lucene/analysis/package.html @@ -442,57 +442,73 @@ are retrieved from the input stream in the incrementToken() method. in the TermAttribute the length of the term can be determined and too short or too long tokens are skipped. Note how incrementToken() can efficiently access the instance variable; no attribute lookup or downcasting is necessary. The same is true for the consumer, which can simply use local references to the Attributes. +
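As a concrete illustration of the pattern just described, a filter written in this style could look as follows. This is a sketch, not the patched LengthFilter source; the class name MyLengthFilter is made up:

  public final class MyLengthFilter extends TokenFilter {
    private final TermAttribute termAtt;  // looked up once, at instantiation
    private final int min, max;

    public MyLengthFilter(TokenStream input, int min, int max) {
      super(input);
      this.min = min;
      this.max = max;
      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    }

    public boolean incrementToken() throws IOException {
      // consume tokens until one with an acceptable length is found
      while (input.incrementToken()) {
        final int len = termAtt.termLength();
        if (len >= min && len <= max) {
          return true;
        }
      }
      return false; // end of stream
    }
  }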

    Adding a custom Attribute

Now we're going to implement our own custom Attribute for part-of-speech tagging and consequently call it -PartOfSpeechAttribute: +PartOfSpeechAttribute. First we need to define the interface of the new Attribute:
    -  public static enum PartOfSpeech {
    -    Noun, Verb, Adjective, Adverb, Pronoun, Preposition, Conjunction, Article, Unknown
    -  }
    +  public interface PartOfSpeechAttribute extends Attribute {
    +    public static enum PartOfSpeech {
    +      Noun, Verb, Adjective, Adverb, Pronoun, Preposition, Conjunction, Article, Unknown
    +    }
       
    -  public static final class PartOfSpeechAttribute extends Attribute {
    -    
    -    private PartOfSpeech pos = PartOfSpeech.Unknown;
    -    
    -    public void setPartOfSpeech(PartOfSpeech pos) {
    -      this.pos = pos;
    -    }
    -    
    -    public PartOfSpeech getPartOfSpeech() {
    -      return pos;
    -    }
    -
    -    public void clear() {
    -      pos = PartOfSpeech.Unknown;
    -    }
    -
    -    public void copyTo(Attribute target) {
    -      ((PartOfSpeechAttribute) target).pos = pos;
    -    }
    -
    -    public boolean equals(Object other) {
    -      if (other == this) {
    -        return true;
    -      }
    -      
    -      if (other instanceof PartOfSpeechAttribute) {
    -        return pos == ((PartOfSpeechAttribute) other).pos;
    -      }
    -   
    -      return false;
    -    }
    -
    -    public int hashCode() {
    -      return pos.ordinal();
    -    }
    -
    -    public String toString() {
    -      return "PartOfSpeech=" + pos;
    -    }
    +    public void setPartOfSpeech(PartOfSpeech pos);
    +  
    +    public PartOfSpeech getPartOfSpeech();
       }
     
-This is a simple Attribute that has only a single variable that stores the part-of-speech of a token. It extends the -new Attribute class and therefore implements its abstract methods clear(), copyTo(), equals(), hashCode(), toString(). + +Now we also need to write the implementing class. The name of that class is important here: By default, Lucene +checks for a class whose name is the name of the Attribute interface with the suffix 'Impl' appended. In this example, we would +consequently call the implementing class PartOfSpeechAttributeImpl.
+This is the default behavior. However, there is also an expert API that allows changing this naming convention: +{@link org.apache.lucene.util.AttributeSource.AttributeFactory}. The factory accepts an Attribute interface as argument +and returns an actual instance. You can implement your own factory if you need to change the default behavior.
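A sketch of such a custom factory follows. It assumes the factory contract is a single createAttributeInstance(Class) method returning an AttributeImpl, and that a DEFAULT_ATTRIBUTE_FACTORY constant exists for the fallback; both assumptions refer to the AttributeSource introduced by this patch:

  AttributeSource.AttributeFactory myFactory = new AttributeSource.AttributeFactory() {
    public AttributeImpl createAttributeInstance(Class attClass) {
      if (attClass == PartOfSpeechAttribute.class) {
        // map this interface to a hand-picked implementation (defined below)
        return new PartOfSpeechAttributeImpl();
      }
      // everything else keeps the default 'Impl' naming convention
      return AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY.createAttributeInstance(attClass);
    }
  };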

+ +Now here is the actual class that implements our new Attribute. Notice that the class has to extend +{@link org.apache.lucene.util.AttributeImpl}: + +
    +public final class PartOfSpeechAttributeImpl extends AttributeImpl 
+                            implements PartOfSpeechAttribute {
    +  
    +  private PartOfSpeech pos = PartOfSpeech.Unknown;
    +  
    +  public void setPartOfSpeech(PartOfSpeech pos) {
    +    this.pos = pos;
    +  }
    +  
    +  public PartOfSpeech getPartOfSpeech() {
    +    return pos;
    +  }
    +
    +  public void clear() {
    +    pos = PartOfSpeech.Unknown;
    +  }
    +
    +  public void copyTo(AttributeImpl target) {
    +    ((PartOfSpeechAttributeImpl) target).pos = pos;
    +  }
    +
    +  public boolean equals(Object other) {
    +    if (other == this) {
    +      return true;
    +    }
    +    
    +    if (other instanceof PartOfSpeechAttributeImpl) {
    +      return pos == ((PartOfSpeechAttributeImpl) other).pos;
    +    }
    + 
    +    return false;
    +  }
    +
    +  public int hashCode() {
    +    return pos.ordinal();
    +  }
    +}
    +
+This is a simple Attribute implementation that has only a single variable that stores the part-of-speech of a token. It extends the +new AttributeImpl class and therefore implements its abstract methods clear(), copyTo(), equals(), hashCode(). Now we need a TokenFilter that can set this new PartOfSpeechAttribute for each token. In this example we show a very naive filter that tags every word with a leading upper-case letter as a 'Noun' and all other words as 'Unknown'.
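The filter body itself is unchanged context that the hunk below elides. A sketch consistent with the surrounding description could look like this; determinePOS is the only made-up helper:

  public static class PartOfSpeechTaggingFilter extends TokenFilter {
    private final PartOfSpeechAttribute posAtt;
    private final TermAttribute termAtt;

    protected PartOfSpeechTaggingFilter(TokenStream input) {
      super(input);
      posAtt = (PartOfSpeechAttribute) addAttribute(PartOfSpeechAttribute.class);
      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    }

    public boolean incrementToken() throws IOException {
      if (!input.incrementToken()) {
        return false;
      }
      posAtt.setPartOfSpeech(determinePOS(termAtt.termBuffer(), termAtt.termLength()));
      return true;
    }

    // naive rule: a leading upper-case letter means 'Noun', everything else 'Unknown'
    private PartOfSpeechAttribute.PartOfSpeech determinePOS(char[] term, int length) {
      if (length > 0 && Character.isUpperCase(term[0])) {
        return PartOfSpeechAttribute.PartOfSpeech.Noun;
      }
      return PartOfSpeechAttribute.PartOfSpeech.Unknown;
    }
  }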
    @@ -523,7 +539,9 @@ that tags every word with a leading upper-case letter as a 'Noun' and all other
       }
     
Just like the LengthFilter, this new filter accesses the attributes it needs in the constructor and -stores references in instance variables. Notice how you only need to pass in the interface of the new +Attribute; instantiating the correct implementing class is taken care of automatically. +Now we need to add the filter to the chain:
       public TokenStream tokenStream(String fieldName, Reader reader) {
         TokenStream stream = new WhitespaceTokenizer(reader);
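The rest of this method is unchanged context that the diff omits. For completeness, consuming such a chain with the new API could look like the following sketch, where analyzer and text are assumed local variables:

  TokenStream stream = analyzer.tokenStream("myfield", new StringReader(text));
  TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
  PartOfSpeechAttribute posAtt = (PartOfSpeechAttribute) stream.addAttribute(PartOfSpeechAttribute.class);
  while (stream.incrementToken()) {
    System.out.println(termAtt.term() + ": " + posAtt.getPartOfSpeech());
  }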
    @@ -582,7 +600,8 @@ of a sentence or not. Then the PartOfSpeechTaggingFilter can make use of this kn
as nouns if not the first word of a sentence (we know, this is still not correct behavior, but hey, it's a good exercise).
As a small hint, this is how the implementation class of the new Attribute could begin:
     
    -  public class FirstTokenOfSentenceAttribute extends Attribute {
+  public class FirstTokenOfSentenceAttributeImpl extends AttributeImpl
    +                   implements FirstTokenOfSentenceAttribute {
         
         private boolean firstToken;
         
    diff --git a/src/java/org/apache/lucene/analysis/standard/StandardFilter.java b/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
    index 72ff4bffc23..5c5b596a390 100644
    --- a/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
    +++ b/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
    @@ -73,39 +73,4 @@ public final class StandardFilter extends TokenFilter {
     
         return true;
       }
    -  
    -  /** Returns the next token in the stream, or null at EOS.
    -   * 

    Removes 's from the end of words. - *

    Removes dots from acronyms. - * @deprecated - */ - public final Token next(final Token reusableToken) throws java.io.IOException { - assert reusableToken != null; - Token nextToken = input.next(reusableToken); - - if (nextToken == null) - return null; - - char[] buffer = nextToken.termBuffer(); - final int bufferLength = nextToken.termLength(); - final String type = nextToken.type(); - - if (type == APOSTROPHE_TYPE && // remove 's - bufferLength >= 2 && - buffer[bufferLength-2] == '\'' && - (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) { - // Strip last 2 characters off - nextToken.setTermLength(bufferLength - 2); - } else if (type == ACRONYM_TYPE) { // remove dots - int upto = 0; - for(int i=0;i */ -public class FlagsAttribute extends Attribute implements Cloneable, Serializable { - private int flags = 0; - +public interface FlagsAttribute extends Attribute { /** * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long. *

    @@ -44,43 +40,10 @@ public class FlagsAttribute extends Attribute implements Cloneable, Serializable * * @return The bits */ - public int getFlags() { - return flags; - } + public int getFlags(); /** * @see #getFlags() */ - public void setFlags(int flags) { - this.flags = flags; - } - - public void clear() { - flags = 0; - } - - public String toString() { - return "flags=" + flags; - } - - public boolean equals(Object other) { - if (this == other) { - return true; - } - - if (other instanceof FlagsAttribute) { - return ((FlagsAttribute) other).flags == flags; - } - - return false; - } - - public int hashCode() { - return flags; - } - - public void copyTo(Attribute target) { - FlagsAttribute t = (FlagsAttribute) target; - t.setFlags(flags); - } + public void setFlags(int flags); } diff --git a/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java b/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java new file mode 100644 index 00000000000..a22b82a7c66 --- /dev/null +++ b/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java @@ -0,0 +1,82 @@ +package org.apache.lucene.analysis.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; + +import org.apache.lucene.util.AttributeImpl; + +/** + * This attribute can be used to pass different flags down the tokenizer chain, + * e. g. from one TokenFilter to another one. + * + *

    + * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. + + */ +public class FlagsAttributeImpl extends AttributeImpl implements FlagsAttribute, Cloneable, Serializable { + private int flags = 0; + + /** + * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long. + *

    + * + * Get the bitset for any bits that have been set. This is completely distinct from {@link TypeAttribute#type()}, although they do share similar purposes. + * The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s. + * + * + * @return The bits + */ + public int getFlags() { + return flags; + } + + /** + * @see #getFlags() + */ + public void setFlags(int flags) { + this.flags = flags; + } + + public void clear() { + flags = 0; + } + + public boolean equals(Object other) { + if (this == other) { + return true; + } + + if (other instanceof FlagsAttributeImpl) { + return ((FlagsAttributeImpl) other).flags == flags; + } + + return false; + } + + public int hashCode() { + return flags; + } + + public void copyTo(AttributeImpl target) { + FlagsAttribute t = (FlagsAttribute) target; + t.setFlags(flags); + } +} diff --git a/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java b/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java index 93585e1353c..b85bab42b76 100644 --- a/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java +++ b/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java @@ -17,8 +17,6 @@ package org.apache.lucene.analysis.tokenattributes; * limitations under the License. */ -import java.io.Serializable; - import org.apache.lucene.util.Attribute; /** @@ -29,67 +27,23 @@ import org.apache.lucene.util.Attribute; * The APIs introduced in these classes with Lucene 2.9 might change in the future. * We will make our best efforts to keep the APIs backwards-compatible. */ -public class OffsetAttribute extends Attribute implements Cloneable, Serializable { - private int startOffset; - private int endOffset; - +public interface OffsetAttribute extends Attribute { /** Returns this Token's starting offset, the position of the first character corresponding to this token in the source text. Note that the difference between endOffset() and startOffset() may not be equal to termText.length(), as the term text may have been altered by a stemmer or some other filter. */ - public int startOffset() { - return startOffset; - } + public int startOffset(); /** Set the starting and ending offset. @see #startOffset() and #endOffset()*/ - public void setOffset(int startOffset, int endOffset) { - this.startOffset = startOffset; - this.endOffset = endOffset; - } + public void setOffset(int startOffset, int endOffset); /** Returns this Token's ending offset, one greater than the position of the last character corresponding to this token in the source text. The length of the token in the source text is (endOffset - startOffset). 
*/ - public int endOffset() { - return endOffset; - } - - - public void clear() { - startOffset = 0; - endOffset = 0; - } - - public String toString() { - return "start=" + startOffset + ",end=" + endOffset; - } - - public boolean equals(Object other) { - if (other == this) { - return true; - } - - if (other instanceof OffsetAttribute) { - OffsetAttribute o = (OffsetAttribute) other; - return o.startOffset == startOffset && o.endOffset == endOffset; - } - - return false; - } - - public int hashCode() { - int code = startOffset; - code = code * 31 + endOffset; - return code; - } - - public void copyTo(Attribute target) { - OffsetAttribute t = (OffsetAttribute) target; - t.setOffset(startOffset, endOffset); - } + public int endOffset(); } diff --git a/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java b/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java new file mode 100644 index 00000000000..cb4eb5e4c2b --- /dev/null +++ b/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java @@ -0,0 +1,91 @@ +package org.apache.lucene.analysis.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; + +import org.apache.lucene.util.AttributeImpl; + +/** + * The start and end character offset of a Token. + * + *

    + * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. + */ +public class OffsetAttributeImpl extends AttributeImpl implements OffsetAttribute, Cloneable, Serializable { + private int startOffset; + private int endOffset; + + /** Returns this Token's starting offset, the position of the first character + corresponding to this token in the source text. + + Note that the difference between endOffset() and startOffset() may not be + equal to termText.length(), as the term text may have been altered by a + stemmer or some other filter. */ + public int startOffset() { + return startOffset; + } + + + /** Set the starting and ending offset. + @see #startOffset() and #endOffset()*/ + public void setOffset(int startOffset, int endOffset) { + this.startOffset = startOffset; + this.endOffset = endOffset; + } + + + /** Returns this Token's ending offset, one greater than the position of the + last character corresponding to this token in the source text. The length + of the token in the source text is (endOffset - startOffset). */ + public int endOffset() { + return endOffset; + } + + + public void clear() { + startOffset = 0; + endOffset = 0; + } + + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof OffsetAttributeImpl) { + OffsetAttributeImpl o = (OffsetAttributeImpl) other; + return o.startOffset == startOffset && o.endOffset == endOffset; + } + + return false; + } + + public int hashCode() { + int code = startOffset; + code = code * 31 + endOffset; + return code; + } + + public void copyTo(AttributeImpl target) { + OffsetAttribute t = (OffsetAttribute) target; + t.setOffset(startOffset, endOffset); + } +} diff --git a/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java b/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java index 8f0a37e03cd..2bffe62a233 100644 --- a/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java +++ b/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java @@ -17,8 +17,6 @@ package org.apache.lucene.analysis.tokenattributes; * limitations under the License. */ -import java.io.Serializable; - import org.apache.lucene.index.Payload; import org.apache.lucene.util.Attribute; @@ -30,80 +28,14 @@ import org.apache.lucene.util.Attribute; * The APIs introduced in these classes with Lucene 2.9 might change in the future. * We will make our best efforts to keep the APIs backwards-compatible. */ -public class PayloadAttribute extends Attribute implements Cloneable, Serializable { - private Payload payload; - - /** - * Initialize this attribute with no payload. - */ - public PayloadAttribute() {} - - /** - * Initialize this attribute with the given payload. - */ - public PayloadAttribute(Payload payload) { - this.payload = payload; - } - +public interface PayloadAttribute extends Attribute { /** * Returns this Token's payload. */ - public Payload getPayload() { - return this.payload; - } + public Payload getPayload(); /** * Sets this Token's payload. 
*/ - public void setPayload(Payload payload) { - this.payload = payload; - } - - public void clear() { - payload = null; - } - - public String toString() { - if (payload == null) { - return "payload=null"; - } - - return "payload=" + payload.toString(); - } - - public Object clone() { - PayloadAttribute clone = (PayloadAttribute) super.clone(); - if (payload != null) { - clone.payload = (Payload) payload.clone(); - } - return clone; - } - - public boolean equals(Object other) { - if (other == this) { - return true; - } - - if (other instanceof PayloadAttribute) { - PayloadAttribute o = (PayloadAttribute) other; - if (o.payload == null || payload == null) { - return o.payload == null && payload == null; - } - - return o.payload.equals(payload); - } - - return false; - } - - public int hashCode() { - return (payload == null) ? 0 : payload.hashCode(); - } - - public void copyTo(Attribute target) { - PayloadAttribute t = (PayloadAttribute) target; - t.setPayload((payload == null) ? null : (Payload) payload.clone()); - } - - + public void setPayload(Payload payload); } diff --git a/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttributeImpl.java b/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttributeImpl.java new file mode 100644 index 00000000000..67923010812 --- /dev/null +++ b/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttributeImpl.java @@ -0,0 +1,101 @@ +package org.apache.lucene.analysis.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; + +import org.apache.lucene.index.Payload; +import org.apache.lucene.util.AttributeImpl; + +/** + * The payload of a Token. See also {@link Payload}. + * + *

+ * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. + */ +public class PayloadAttributeImpl extends AttributeImpl implements PayloadAttribute, Cloneable, Serializable { + private Payload payload; + + /** + * Initialize this attribute with no payload. + */ + public PayloadAttributeImpl() {} + + /** + * Initialize this attribute with the given payload. + */ + public PayloadAttributeImpl(Payload payload) { + this.payload = payload; + } + + /** + * Returns this Token's payload. + */ + public Payload getPayload() { + return this.payload; + } + + /** + * Sets this Token's payload. + */ + public void setPayload(Payload payload) { + this.payload = payload; + } + + public void clear() { + payload = null; + } + + public Object clone() { + PayloadAttributeImpl clone = (PayloadAttributeImpl) super.clone(); + if (payload != null) { + clone.payload = (Payload) payload.clone(); + } + return clone; + } + + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof PayloadAttributeImpl) { + PayloadAttributeImpl o = (PayloadAttributeImpl) other; + if (o.payload == null || payload == null) { + return o.payload == null && payload == null; + } + + return o.payload.equals(payload); + } + + return false; + } + + public int hashCode() { + return (payload == null) ? 0 : payload.hashCode(); + } + + public void copyTo(AttributeImpl target) { + PayloadAttribute t = (PayloadAttribute) target; + t.setPayload((payload == null) ? null : (Payload) payload.clone()); + } + + +} diff --git a/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java b/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java index 50400cc14e5..947758c6469 100644 --- a/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java +++ b/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java @@ -17,13 +17,10 @@ package org.apache.lucene.analysis.tokenattributes; * limitations under the License. */ -import java.io.Serializable; - -import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.util.Attribute; /** The positionIncrement determines the position of this token - * relative to the previous Token in a {@link TokenStream}, used in phrase + * relative to the previous Token in a TokenStream, used in phrase * searching. * *

    The default value is one. @@ -53,54 +50,15 @@ import org.apache.lucene.util.Attribute; * * @see org.apache.lucene.index.TermPositions */ -public class PositionIncrementAttribute extends Attribute implements Cloneable, Serializable { - private int positionIncrement = 1; - +public interface PositionIncrementAttribute extends Attribute { /** Set the position increment. The default value is one. * * @param positionIncrement the distance from the prior term */ - public void setPositionIncrement(int positionIncrement) { - if (positionIncrement < 0) - throw new IllegalArgumentException - ("Increment must be zero or greater: " + positionIncrement); - this.positionIncrement = positionIncrement; - } + public void setPositionIncrement(int positionIncrement); /** Returns the position increment of this Token. * @see #setPositionIncrement */ - public int getPositionIncrement() { - return positionIncrement; - } - - public void clear() { - this.positionIncrement = 1; - } - - public String toString() { - return "positionIncrement=" + positionIncrement; - } - - public boolean equals(Object other) { - if (other == this) { - return true; - } - - if (other instanceof PositionIncrementAttribute) { - return positionIncrement == ((PositionIncrementAttribute) other).positionIncrement; - } - - return false; - } - - public int hashCode() { - return positionIncrement; - } - - public void copyTo(Attribute target) { - PositionIncrementAttribute t = (PositionIncrementAttribute) target; - t.setPositionIncrement(positionIncrement); - } - + public int getPositionIncrement(); } diff --git a/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java b/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java new file mode 100644 index 00000000000..84fcf58680b --- /dev/null +++ b/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java @@ -0,0 +1,102 @@ +package org.apache.lucene.analysis.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.util.AttributeImpl; + +/** The positionIncrement determines the position of this token + * relative to the previous Token in a {@link TokenStream}, used in phrase + * searching. + * + *

    The default value is one. + * + *

    Some common uses for this are:

      + * + *
    • Set it to zero to put multiple terms in the same position. This is + * useful if, e.g., a word has multiple stems. Searches for phrases + * including either stem will match. In this case, all but the first stem's + * increment should be set to zero: the increment of the first instance + * should be one. Repeating a token with an increment of zero can also be + * used to boost the scores of matches on that token. + * + *
    • Set it to values greater than one to inhibit exact phrase matches. + * If, for example, one does not want phrases to match across removed stop + * words, then one could build a stop word filter that removes stop words and + * also sets the increment to the number of stop words removed before each + * non-stop word. Then exact phrase queries will only match when the terms + * occur with no intervening stop words. + * + *
    + * + *

    + * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. + * + * @see org.apache.lucene.index.TermPositions + */ +public class PositionIncrementAttributeImpl extends AttributeImpl implements PositionIncrementAttribute, Cloneable, Serializable { + private int positionIncrement = 1; + + /** Set the position increment. The default value is one. + * + * @param positionIncrement the distance from the prior term + */ + public void setPositionIncrement(int positionIncrement) { + if (positionIncrement < 0) + throw new IllegalArgumentException + ("Increment must be zero or greater: " + positionIncrement); + this.positionIncrement = positionIncrement; + } + + /** Returns the position increment of this Token. + * @see #setPositionIncrement + */ + public int getPositionIncrement() { + return positionIncrement; + } + + public void clear() { + this.positionIncrement = 1; + } + + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof PositionIncrementAttributeImpl) { + return positionIncrement == ((PositionIncrementAttributeImpl) other).positionIncrement; + } + + return false; + } + + public int hashCode() { + return positionIncrement; + } + + public void copyTo(AttributeImpl target) { + PositionIncrementAttribute t = (PositionIncrementAttribute) target; + t.setPositionIncrement(positionIncrement); + } + +} diff --git a/src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java b/src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java index 0642148f32e..72db22819d4 100644 --- a/src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java +++ b/src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java @@ -17,9 +17,6 @@ package org.apache.lucene.analysis.tokenattributes; * limitations under the License. */ -import java.io.Serializable; - -import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Attribute; /** @@ -30,12 +27,7 @@ import org.apache.lucene.util.Attribute; * The APIs introduced in these classes with Lucene 2.9 might change in the future. * We will make our best efforts to keep the APIs backwards-compatible. */ -public class TermAttribute extends Attribute implements Cloneable, Serializable { - private static int MIN_BUFFER_SIZE = 10; - - private char[] termBuffer; - private int termLength; - +public interface TermAttribute extends Attribute { /** Returns the Token's term text. * * This method has a performance penalty @@ -45,38 +37,20 @@ public class TermAttribute extends Attribute implements Cloneable, Serializable * String, use this method, which is nothing more than * a convenience call to new String(token.termBuffer(), 0, token.termLength()) */ - public String term() { - initTermBuffer(); - return new String(termBuffer, 0, termLength); - } - + public String term(); + /** Copies the contents of buffer, starting at offset for * length characters, into the termBuffer array. 
* @param buffer the buffer to copy * @param offset the index in the buffer of the first character to copy * @param length the number of characters to copy */ - public void setTermBuffer(char[] buffer, int offset, int length) { - char[] newCharBuffer = growTermBuffer(length); - if (newCharBuffer != null) { - termBuffer = newCharBuffer; - } - System.arraycopy(buffer, offset, termBuffer, 0, length); - termLength = length; - } + public void setTermBuffer(char[] buffer, int offset, int length); /** Copies the contents of buffer into the termBuffer array. * @param buffer the buffer to copy */ - public void setTermBuffer(String buffer) { - int length = buffer.length(); - char[] newCharBuffer = growTermBuffer(length); - if (newCharBuffer != null) { - termBuffer = newCharBuffer; - } - buffer.getChars(0, length, termBuffer, 0); - termLength = length; - } + public void setTermBuffer(String buffer); /** Copies the contents of buffer, starting at offset and continuing * for length characters, into the termBuffer array. @@ -84,17 +58,8 @@ public class TermAttribute extends Attribute implements Cloneable, Serializable * @param offset the index in the buffer of the first character to copy * @param length the number of characters to copy */ - public void setTermBuffer(String buffer, int offset, int length) { - assert offset <= buffer.length(); - assert offset + length <= buffer.length(); - char[] newCharBuffer = growTermBuffer(length); - if (newCharBuffer != null) { - termBuffer = newCharBuffer; - } - buffer.getChars(offset, offset + length, termBuffer, 0); - termLength = length; - } - + public void setTermBuffer(String buffer, int offset, int length); + /** Returns the internal termBuffer character array which * you can then directly alter. If the array is too * small for your token, use {@link @@ -102,10 +67,7 @@ public class TermAttribute extends Attribute implements Cloneable, Serializable * altering the buffer be sure to call {@link * #setTermLength} to record the number of valid * characters that were placed into the termBuffer. */ - public char[] termBuffer() { - initTermBuffer(); - return termBuffer; - } + public char[] termBuffer(); /** Grows the termBuffer to at least size newSize, preserving the * existing content. Note: If the next operation is to change @@ -117,63 +79,12 @@ public class TermAttribute extends Attribute implements Cloneable, Serializable * @param newSize minimum size of the new termBuffer * @return newly created termBuffer with length >= newSize */ - public char[] resizeTermBuffer(int newSize) { - char[] newCharBuffer = growTermBuffer(newSize); - if (termBuffer == null) { - // If there were termText, then preserve it. - // note that if termBuffer is null then newCharBuffer cannot be null - assert newCharBuffer != null; - termBuffer = newCharBuffer; - } else if (newCharBuffer != null) { - // Note: if newCharBuffer != null then termBuffer needs to grow. 
- // If there were a termBuffer, then preserve it - System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length); - termBuffer = newCharBuffer; - } - return termBuffer; - } - - /** Allocates a buffer char[] of at least newSize - * @param newSize minimum size of the buffer - * @return newly created buffer with length >= newSize or null if the current termBuffer is big enough - */ - private char[] growTermBuffer(int newSize) { - if (termBuffer != null) { - if (termBuffer.length >= newSize) - // Already big enough - return null; - else - // Not big enough; create a new array with slight - // over allocation: - return new char[ArrayUtil.getNextSize(newSize)]; - } else { - - // determine the best size - // The buffer is always at least MIN_BUFFER_SIZE - if (newSize < MIN_BUFFER_SIZE) { - newSize = MIN_BUFFER_SIZE; - } - - return new char[newSize]; - } - } - - // TODO: once we remove the deprecated termText() method - // and switch entirely to char[] termBuffer we don't need - // to use this method anymore - private void initTermBuffer() { - if (termBuffer == null) { - termBuffer = new char[MIN_BUFFER_SIZE]; - termLength = 0; - } - } + public char[] resizeTermBuffer(int newSize); /** Return number of valid characters (length of the term) * in the termBuffer array. */ - public int termLength() { - return termLength; - } - + public int termLength(); + /** Set number of valid characters (length of the term) in * the termBuffer array. Use this to truncate the termBuffer * or to synchronize with external manipulation of the termBuffer. @@ -181,61 +92,5 @@ public class TermAttribute extends Attribute implements Cloneable, Serializable * use {@link #resizeTermBuffer(int)} first. * @param length the truncated length */ - public void setTermLength(int length) { - initTermBuffer(); - if (length > termBuffer.length) - throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")"); - termLength = length; - } - - public int hashCode() { - initTermBuffer(); - int code = termLength; - code = code * 31 + ArrayUtil.hashCode(termBuffer, 0, termLength); - return code; - } - - public void clear() { - termLength = 0; - } - - public Object clone() { - TermAttribute t = (TermAttribute)super.clone(); - // Do a deep clone - if (termBuffer != null) { - t.termBuffer = (char[]) termBuffer.clone(); - } - return t; - } - - public boolean equals(Object other) { - if (other == this) { - return true; - } - - if (other instanceof TermAttribute) { - initTermBuffer(); - TermAttribute o = ((TermAttribute) other); - o.initTermBuffer(); - - for(int i=0;i + * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. + */ +public class TermAttributeImpl extends AttributeImpl implements TermAttribute, Cloneable, Serializable { + private static int MIN_BUFFER_SIZE = 10; + + private char[] termBuffer; + private int termLength; + + /** Returns the Token's term text. + * + * This method has a performance penalty + * because the text is stored internally in a char[]. If + * possible, use {@link #termBuffer()} and {@link + * #termLength()} directly instead. 
If you really need a + * String, use this method, which is nothing more than + * a convenience call to new String(token.termBuffer(), 0, token.termLength()) + */ + public String term() { + initTermBuffer(); + return new String(termBuffer, 0, termLength); + } + + /** Copies the contents of buffer, starting at offset for + * length characters, into the termBuffer array. + * @param buffer the buffer to copy + * @param offset the index in the buffer of the first character to copy + * @param length the number of characters to copy + */ + public void setTermBuffer(char[] buffer, int offset, int length) { + char[] newCharBuffer = growTermBuffer(length); + if (newCharBuffer != null) { + termBuffer = newCharBuffer; + } + System.arraycopy(buffer, offset, termBuffer, 0, length); + termLength = length; + } + + /** Copies the contents of buffer into the termBuffer array. + * @param buffer the buffer to copy + */ + public void setTermBuffer(String buffer) { + int length = buffer.length(); + char[] newCharBuffer = growTermBuffer(length); + if (newCharBuffer != null) { + termBuffer = newCharBuffer; + } + buffer.getChars(0, length, termBuffer, 0); + termLength = length; + } + + /** Copies the contents of buffer, starting at offset and continuing + * for length characters, into the termBuffer array. + * @param buffer the buffer to copy + * @param offset the index in the buffer of the first character to copy + * @param length the number of characters to copy + */ + public void setTermBuffer(String buffer, int offset, int length) { + assert offset <= buffer.length(); + assert offset + length <= buffer.length(); + char[] newCharBuffer = growTermBuffer(length); + if (newCharBuffer != null) { + termBuffer = newCharBuffer; + } + buffer.getChars(offset, offset + length, termBuffer, 0); + termLength = length; + } + + /** Returns the internal termBuffer character array which + * you can then directly alter. If the array is too + * small for your token, use {@link + * #resizeTermBuffer(int)} to increase it. After + * altering the buffer be sure to call {@link + * #setTermLength} to record the number of valid + * characters that were placed into the termBuffer. */ + public char[] termBuffer() { + initTermBuffer(); + return termBuffer; + } + + /** Grows the termBuffer to at least size newSize, preserving the + * existing content. Note: If the next operation is to change + * the contents of the term buffer use + * {@link #setTermBuffer(char[], int, int)}, + * {@link #setTermBuffer(String)}, or + * {@link #setTermBuffer(String, int, int)} + * to optimally combine the resize with the setting of the termBuffer. + * @param newSize minimum size of the new termBuffer + * @return newly created termBuffer with length >= newSize + */ + public char[] resizeTermBuffer(int newSize) { + char[] newCharBuffer = growTermBuffer(newSize); + if (termBuffer == null) { + // If there were termText, then preserve it. + // note that if termBuffer is null then newCharBuffer cannot be null + assert newCharBuffer != null; + termBuffer = newCharBuffer; + } else if (newCharBuffer != null) { + // Note: if newCharBuffer != null then termBuffer needs to grow. 
+ // If there were a termBuffer, then preserve it + System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length); + termBuffer = newCharBuffer; + } + return termBuffer; + } + + /** Allocates a buffer char[] of at least newSize + * @param newSize minimum size of the buffer + * @return newly created buffer with length >= newSize or null if the current termBuffer is big enough + */ + private char[] growTermBuffer(int newSize) { + if (termBuffer != null) { + if (termBuffer.length >= newSize) + // Already big enough + return null; + else + // Not big enough; create a new array with slight + // over allocation: + return new char[ArrayUtil.getNextSize(newSize)]; + } else { + + // determine the best size + // The buffer is always at least MIN_BUFFER_SIZE + if (newSize < MIN_BUFFER_SIZE) { + newSize = MIN_BUFFER_SIZE; + } + + return new char[newSize]; + } + } + + // TODO: once we remove the deprecated termText() method + // and switch entirely to char[] termBuffer we don't need + // to use this method anymore + private void initTermBuffer() { + if (termBuffer == null) { + termBuffer = new char[MIN_BUFFER_SIZE]; + termLength = 0; + } + } + + /** Return number of valid characters (length of the term) + * in the termBuffer array. */ + public int termLength() { + return termLength; + } + + /** Set number of valid characters (length of the term) in + * the termBuffer array. Use this to truncate the termBuffer + * or to synchronize with external manipulation of the termBuffer. + * Note: to grow the size of the array, + * use {@link #resizeTermBuffer(int)} first. + * @param length the truncated length + */ + public void setTermLength(int length) { + initTermBuffer(); + if (length > termBuffer.length) + throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")"); + termLength = length; + } + + public int hashCode() { + initTermBuffer(); + int code = termLength; + code = code * 31 + ArrayUtil.hashCode(termBuffer, 0, termLength); + return code; + } + + public void clear() { + termLength = 0; + } + + public Object clone() { + TermAttributeImpl t = (TermAttributeImpl)super.clone(); + // Do a deep clone + if (termBuffer != null) { + t.termBuffer = (char[]) termBuffer.clone(); + } + return t; + } + + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof TermAttribute) { + initTermBuffer(); + TermAttributeImpl o = ((TermAttributeImpl) other); + o.initTermBuffer(); + + for(int i=0;i + * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. + */ +public class TypeAttributeImpl extends AttributeImpl implements TypeAttribute, Cloneable, Serializable { + private String type; + public static final String DEFAULT_TYPE = "word"; + + public TypeAttributeImpl() { + this(DEFAULT_TYPE); + } + + public TypeAttributeImpl(String type) { + this.type = type; + } + + /** Returns this Token's lexical type. Defaults to "word". */ + public String type() { + return type; + } + + /** Set the lexical type. 
+ @see #type() */ + public void setType(String type) { + this.type = type; + } + + public void clear() { + type = DEFAULT_TYPE; + } + + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof TypeAttributeImpl) { + return type.equals(((TypeAttributeImpl) other).type); + } + + return false; + } + + public int hashCode() { + return type.hashCode(); + } + + public void copyTo(AttributeImpl target) { + TypeAttribute t = (TypeAttribute) target; + t.setType(new String(type)); + } +} diff --git a/src/java/org/apache/lucene/index/DocInverterPerField.java b/src/java/org/apache/lucene/index/DocInverterPerField.java index 140cac026b6..00e19038532 100644 --- a/src/java/org/apache/lucene/index/DocInverterPerField.java +++ b/src/java/org/apache/lucene/index/DocInverterPerField.java @@ -20,7 +20,6 @@ package org.apache.lucene.index; import java.io.IOException; import java.io.Reader; import org.apache.lucene.document.Fieldable; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; @@ -83,7 +82,6 @@ final class DocInverterPerField extends DocFieldConsumerPerField { final int valueLength = stringValue.length(); perThread.singleTokenTokenStream.reinit(stringValue, 0, valueLength); fieldState.attributeSource = perThread.singleTokenTokenStream; - perThread.localTokenStream.reset(); consumer.start(field); boolean success = false; @@ -132,21 +130,15 @@ final class DocInverterPerField extends DocFieldConsumerPerField { try { int offsetEnd = fieldState.offset-1; - boolean useNewTokenStreamAPI = stream.useNewAPI(); - Token localToken = null; - - if (useNewTokenStreamAPI) { - fieldState.attributeSource = stream; - } else { - fieldState.attributeSource = perThread.localTokenStream; - localToken = perThread.localToken; - } - - consumer.start(field); + boolean hasMoreTokens = stream.incrementToken(); + + fieldState.attributeSource = stream; OffsetAttribute offsetAttribute = (OffsetAttribute) fieldState.attributeSource.addAttribute(OffsetAttribute.class); PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class); + consumer.start(field); + for(;;) { // If we hit an exception in stream.next below @@ -155,14 +147,8 @@ final class DocInverterPerField extends DocFieldConsumerPerField { // non-aborting and (above) this one document // will be marked as deleted, but still // consume a docID - Token token = null; - if (useNewTokenStreamAPI) { - if (!stream.incrementToken()) break; - } else { - token = stream.next(localToken); - if (token == null) break; - perThread.localTokenStream.set(token); - } + + if (!hasMoreTokens) break; final int posIncr = posIncrAttribute.getPositionIncrement(); fieldState.position += posIncr; @@ -194,6 +180,8 @@ final class DocInverterPerField extends DocFieldConsumerPerField { docState.infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens"); break; } + + hasMoreTokens = stream.incrementToken(); } fieldState.offset = offsetEnd+1; } finally { diff --git a/src/java/org/apache/lucene/index/DocInverterPerThread.java b/src/java/org/apache/lucene/index/DocInverterPerThread.java index b2a8737e1ec..0a85be5aef8 100644 --- a/src/java/org/apache/lucene/index/DocInverterPerThread.java +++ 
b/src/java/org/apache/lucene/index/DocInverterPerThread.java @@ -19,15 +19,9 @@ package org.apache.lucene.index; import java.io.IOException; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; -import org.apache.lucene.util.Attribute; /** This is a DocFieldConsumer that inverts each field, * separately, from a Document, and accepts a @@ -37,10 +31,8 @@ final class DocInverterPerThread extends DocFieldConsumerPerThread { final DocInverter docInverter; final InvertedDocConsumerPerThread consumer; final InvertedDocEndConsumerPerThread endConsumer; - final Token localToken = new Token(); //TODO: change to SingleTokenTokenStream after Token was removed final SingleTokenTokenStream singleTokenTokenStream = new SingleTokenTokenStream(); - final BackwardsCompatibilityStream localTokenStream = new BackwardsCompatibilityStream(); static class SingleTokenTokenStream extends TokenStream { TermAttribute termAttribute; @@ -55,76 +47,13 @@ final class DocInverterPerThread extends DocFieldConsumerPerThread { termAttribute.setTermBuffer(stringValue); offsetAttribute.setOffset(startOffset, endOffset); } + + // dummy implementation: this stream is never iterated over, the consumer reads its attributes directly, so calling incrementToken() is an error + public boolean incrementToken() { + throw new UnsupportedOperationException(); + } } - /** This stream wrapper is only used to maintain backwards compatibility with the - * old TokenStream API and can be removed in Lucene 3.0 - * @deprecated - */ - static class BackwardsCompatibilityStream extends TokenStream { - private Token token; - - TermAttribute termAttribute = new TermAttribute() { - public String term() { - return token.term(); - } - - public char[] termBuffer() { - return token.termBuffer(); - } - - public int termLength() { - return token.termLength(); - } - }; - OffsetAttribute offsetAttribute = new OffsetAttribute() { - public int startOffset() { - return token.startOffset(); - } - - public int endOffset() { - return token.endOffset(); - } - }; - - PositionIncrementAttribute positionIncrementAttribute = new PositionIncrementAttribute() { - public int getPositionIncrement() { - return token.getPositionIncrement(); - } - }; - - FlagsAttribute flagsAttribute = new FlagsAttribute() { - public int getFlags() { - return token.getFlags(); - } - }; - - PayloadAttribute payloadAttribute = new PayloadAttribute() { - public Payload getPayload() { - return token.getPayload(); - } - }; - - TypeAttribute typeAttribute = new TypeAttribute() { - public String type() { - return token.type(); - } - }; - - BackwardsCompatibilityStream() { - attributes.put(TermAttribute.class, termAttribute); - attributes.put(OffsetAttribute.class, offsetAttribute); - attributes.put(PositionIncrementAttribute.class, positionIncrementAttribute); - attributes.put(FlagsAttribute.class, flagsAttribute); - attributes.put(PayloadAttribute.class, payloadAttribute); - attributes.put(TypeAttribute.class, typeAttribute); - } - - public void set(Token token) { - this.token = token; - } - }; - final DocumentsWriter.DocState docState; final FieldInvertState fieldState = new FieldInvertState(); diff
--git a/src/java/org/apache/lucene/queryParser/QueryParser.java b/src/java/org/apache/lucene/queryParser/QueryParser.java index 45a5dbde592..809f060b524 100644 --- a/src/java/org/apache/lucene/queryParser/QueryParser.java +++ b/src/java/org/apache/lucene/queryParser/QueryParser.java @@ -531,66 +531,41 @@ public class QueryParser implements QueryParserConstants { PositionIncrementAttribute posIncrAtt = null; int numTokens = 0; - org.apache.lucene.analysis.Token reusableToken = null; - org.apache.lucene.analysis.Token nextToken = null; - - - boolean useNewAPI = TokenStream.useNewAPIDefault(); - - if (useNewAPI) { - boolean success = false; - try { - buffer.reset(); - success = true; - } catch (IOException e) { - // success==false if we hit an exception + boolean success = false; + try { + buffer.reset(); + success = true; + } catch (IOException e) { + // success==false if we hit an exception + } + if (success) { + if (buffer.hasAttribute(TermAttribute.class)) { + termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class); } - if (success) { - if (buffer.hasAttribute(TermAttribute.class)) { - termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class); - } - if (buffer.hasAttribute(PositionIncrementAttribute.class)) { - posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class); - } + if (buffer.hasAttribute(PositionIncrementAttribute.class)) { + posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class); } - } else { - reusableToken = new org.apache.lucene.analysis.Token(); } int positionCount = 0; boolean severalTokensAtSamePosition = false; - if (useNewAPI) { - if (termAtt != null) { - try { - while (buffer.incrementToken()) { - numTokens++; - int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1; - if (positionIncrement != 0) { - positionCount += positionIncrement; - } else { - severalTokensAtSamePosition = true; - } + boolean hasMoreTokens = false; + if (termAtt != null) { + try { + hasMoreTokens = buffer.incrementToken(); + while (hasMoreTokens) { + numTokens++; + int positionIncrement = (posIncrAtt != null) ? 
posIncrAtt.getPositionIncrement() : 1; + if (positionIncrement != 0) { + positionCount += positionIncrement; + } else { + severalTokensAtSamePosition = true; } - } catch (IOException e) { - // ignore + hasMoreTokens = buffer.incrementToken(); } - } - } else { - while (true) { - try { - nextToken = buffer.next(reusableToken); - } - catch (IOException e) { - nextToken = null; - } - if (nextToken == null) - break; - numTokens++; - if (nextToken.getPositionIncrement() != 0) - positionCount += nextToken.getPositionIncrement(); - else - severalTokensAtSamePosition = true; + } catch (IOException e) { + // ignore } } try { @@ -609,16 +584,9 @@ public class QueryParser implements QueryParserConstants { else if (numTokens == 1) { String term = null; try { - - if (useNewAPI) { - boolean hasNext = buffer.incrementToken(); - assert hasNext == true; - term = termAtt.term(); - } else { - nextToken = buffer.next(reusableToken); - assert nextToken != null; - term = nextToken.term(); - } + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + term = termAtt.term(); } catch (IOException e) { // safe to ignore, because we know the number of tokens } @@ -631,15 +599,9 @@ public class QueryParser implements QueryParserConstants { for (int i = 0; i < numTokens; i++) { String term = null; try { - if (useNewAPI) { - boolean hasNext = buffer.incrementToken(); - assert hasNext == true; - term = termAtt.term(); - } else { - nextToken = buffer.next(reusableToken); - assert nextToken != null; - term = nextToken.term(); - } + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + term = termAtt.term(); } catch (IOException e) { // safe to ignore, because we know the number of tokens } @@ -660,18 +622,11 @@ public class QueryParser implements QueryParserConstants { String term = null; int positionIncrement = 1; try { - if (useNewAPI) { - boolean hasNext = buffer.incrementToken(); - assert hasNext == true; - term = termAtt.term(); - if (posIncrAtt != null) { - positionIncrement = posIncrAtt.getPositionIncrement(); - } - } else { - nextToken = buffer.next(reusableToken); - assert nextToken != null; - term = nextToken.term(); - positionIncrement = nextToken.getPositionIncrement(); + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + term = termAtt.term(); + if (posIncrAtt != null) { + positionIncrement = posIncrAtt.getPositionIncrement(); } } catch (IOException e) { // safe to ignore, because we know the number of tokens @@ -707,19 +662,11 @@ public class QueryParser implements QueryParserConstants { int positionIncrement = 1; try { - if (useNewAPI) { - - boolean hasNext = buffer.incrementToken(); - assert hasNext == true; - term = termAtt.term(); - if (posIncrAtt != null) { - positionIncrement = posIncrAtt.getPositionIncrement(); - } - } else { - nextToken = buffer.next(reusableToken); - assert nextToken != null; - term = nextToken.term(); - positionIncrement = nextToken.getPositionIncrement(); + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + term = termAtt.term(); + if (posIncrAtt != null) { + positionIncrement = posIncrAtt.getPositionIncrement(); } } catch (IOException e) { // safe to ignore, because we know the number of tokens @@ -1625,12 +1572,6 @@ public class QueryParser implements QueryParserConstants { finally { jj_save(0, xla); } } - private boolean jj_3R_3() { - if (jj_scan_token(STAR)) return true; - if (jj_scan_token(COLON)) return true; - return false; - } - private boolean jj_3R_2() { if (jj_scan_token(TERM)) return true; if 
(jj_scan_token(COLON)) return true; @@ -1647,6 +1588,12 @@ public class QueryParser implements QueryParserConstants { return false; } + private boolean jj_3R_3() { + if (jj_scan_token(STAR)) return true; + if (jj_scan_token(COLON)) return true; + return false; + } + /** Generated Token Manager. */ public QueryParserTokenManager token_source; /** Current token. */ diff --git a/src/java/org/apache/lucene/queryParser/QueryParser.jj b/src/java/org/apache/lucene/queryParser/QueryParser.jj index cfd60b3eb3e..b9fdbb9bcd7 100644 --- a/src/java/org/apache/lucene/queryParser/QueryParser.jj +++ b/src/java/org/apache/lucene/queryParser/QueryParser.jj @@ -555,67 +555,42 @@ public class QueryParser { PositionIncrementAttribute posIncrAtt = null; int numTokens = 0; - org.apache.lucene.analysis.Token reusableToken = null; - org.apache.lucene.analysis.Token nextToken = null; - - - boolean useNewAPI = TokenStream.useNewAPIDefault(); - - if (useNewAPI) { - boolean success = false; - try { - buffer.reset(); - success = true; - } catch (IOException e) { - // success==false if we hit an exception - } - if (success) { - if (buffer.hasAttribute(TermAttribute.class)) { - termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class); - } - if (buffer.hasAttribute(PositionIncrementAttribute.class)) { - posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class); - } - } - } else { - reusableToken = new org.apache.lucene.analysis.Token(); + boolean success = false; + try { + buffer.reset(); + success = true; + } catch (IOException e) { + // success==false if we hit an exception } - + if (success) { + if (buffer.hasAttribute(TermAttribute.class)) { + termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class); + } + if (buffer.hasAttribute(PositionIncrementAttribute.class)) { + posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class); + } + } + int positionCount = 0; boolean severalTokensAtSamePosition = false; - if (useNewAPI) { - if (termAtt != null) { - try { - while (buffer.incrementToken()) { - numTokens++; - int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1; - if (positionIncrement != 0) { - positionCount += positionIncrement; - } else { - severalTokensAtSamePosition = true; - } + boolean hasMoreTokens = false; + if (termAtt != null) { + try { + hasMoreTokens = buffer.incrementToken(); + while (hasMoreTokens) { + numTokens++; + int positionIncrement = (posIncrAtt != null) ? 
posIncrAtt.getPositionIncrement() : 1; + if (positionIncrement != 0) { + positionCount += positionIncrement; + } else { + severalTokensAtSamePosition = true; } - } catch (IOException e) { - // ignore + hasMoreTokens = buffer.incrementToken(); } + } catch (IOException e) { + // ignore } - } else { - while (true) { - try { - nextToken = buffer.next(reusableToken); - } - catch (IOException e) { - nextToken = null; - } - if (nextToken == null) - break; - numTokens++; - if (nextToken.getPositionIncrement() != 0) - positionCount += nextToken.getPositionIncrement(); - else - severalTokensAtSamePosition = true; - } } try { // rewind the buffer stream @@ -627,22 +602,15 @@ public class QueryParser { catch (IOException e) { // ignore } - + if (numTokens == 0) return null; else if (numTokens == 1) { String term = null; try { - - if (useNewAPI) { - boolean hasNext = buffer.incrementToken(); - assert hasNext == true; - term = termAtt.term(); - } else { - nextToken = buffer.next(reusableToken); - assert nextToken != null; - term = nextToken.term(); - } + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + term = termAtt.term(); } catch (IOException e) { // safe to ignore, because we know the number of tokens } @@ -655,19 +623,13 @@ public class QueryParser { for (int i = 0; i < numTokens; i++) { String term = null; try { - if (useNewAPI) { - boolean hasNext = buffer.incrementToken(); - assert hasNext == true; - term = termAtt.term(); - } else { - nextToken = buffer.next(reusableToken); - assert nextToken != null; - term = nextToken.term(); - } + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + term = termAtt.term(); } catch (IOException e) { // safe to ignore, because we know the number of tokens } - + Query currentQuery = newTermQuery( new Term(field, term)); q.add(currentQuery, BooleanClause.Occur.SHOULD); @@ -684,18 +646,11 @@ public class QueryParser { String term = null; int positionIncrement = 1; try { - if (useNewAPI) { - boolean hasNext = buffer.incrementToken(); - assert hasNext == true; - term = termAtt.term(); - if (posIncrAtt != null) { - positionIncrement = posIncrAtt.getPositionIncrement(); - } - } else { - nextToken = buffer.next(reusableToken); - assert nextToken != null; - term = nextToken.term(); - positionIncrement = nextToken.getPositionIncrement(); + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + term = termAtt.term(); + if (posIncrAtt != null) { + positionIncrement = posIncrAtt.getPositionIncrement(); } } catch (IOException e) { // safe to ignore, because we know the number of tokens @@ -724,26 +679,18 @@ public class QueryParser { PhraseQuery pq = newPhraseQuery(); pq.setSlop(phraseSlop); int position = -1; - - + + for (int i = 0; i < numTokens; i++) { String term = null; int positionIncrement = 1; - try { - if (useNewAPI) { - - boolean hasNext = buffer.incrementToken(); - assert hasNext == true; - term = termAtt.term(); - if (posIncrAtt != null) { - positionIncrement = posIncrAtt.getPositionIncrement(); - } - } else { - nextToken = buffer.next(reusableToken); - assert nextToken != null; - term = nextToken.term(); - positionIncrement = nextToken.getPositionIncrement(); + try { + boolean hasNext = buffer.incrementToken(); + assert hasNext == true; + term = termAtt.term(); + if (posIncrAtt != null) { + positionIncrement = posIncrAtt.getPositionIncrement(); } } catch (IOException e) { // safe to ignore, because we know the number of tokens diff --git a/src/java/org/apache/lucene/search/QueryTermVector.java 
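Both the generated QueryParser.java and the QueryParser.jj grammar now consume the stream the same way: reset it, look the attributes up once, then loop on incrementToken(). The same pattern reappears in QueryTermVector below; in isolation it reduces to something like this sketch (the helper name is invented and imports are elided):

    // Sketch: collecting all terms of a TokenStream with the new API;
    // the attribute instance is looked up once and reused for every token.
    static List collectTerms(TokenStream stream) throws IOException {
      List terms = new ArrayList();
      stream.reset();
      TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
      while (stream.incrementToken()) {
        terms.add(termAtt.term());
      }
      return terms;
    }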
b/src/java/org/apache/lucene/search/QueryTermVector.java index 3070896a248..5e74dd84f43 100644 --- a/src/java/org/apache/lucene/search/QueryTermVector.java +++ b/src/java/org/apache/lucene/search/QueryTermVector.java @@ -27,7 +27,6 @@ import java.util.List; import java.util.Map; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.index.TermFreqVector; @@ -59,17 +58,15 @@ public class QueryTermVector implements TermFreqVector { { List terms = new ArrayList(); try { - if (stream.useNewAPI()) { - stream.reset(); - TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class); - while (stream.incrementToken()) { - terms.add(termAtt.term()); - } - } else { - final Token reusableToken = new Token(); - for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { - terms.add(nextToken.term()); - } + boolean hasMoreTokens = false; + + stream.reset(); + TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class); + + hasMoreTokens = stream.incrementToken(); + while (hasMoreTokens) { + terms.add(termAtt.term()); + hasMoreTokens = stream.incrementToken(); } processTerms((String[])terms.toArray(new String[terms.size()])); } catch (IOException e) { diff --git a/src/java/org/apache/lucene/util/Attribute.java b/src/java/org/apache/lucene/util/Attribute.java index ad5de1eb651..0a39ef0e26b 100644 --- a/src/java/org/apache/lucene/util/Attribute.java +++ b/src/java/org/apache/lucene/util/Attribute.java @@ -17,79 +17,14 @@ package org.apache.lucene.util; * limitations under the License. */ -import java.io.Serializable; - /** - * Base class for Attributes that can be added to a - * {@link org.apache.lucene.util.AttributeSource}. - *

    - * Attributes are used to add data in a dynamic, yet type-safe way to a source - * of usually streamed objects, e. g. a {@link org.apache.lucene.analysis.TokenStream}. + * Base interface for attributes. + * *

    * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. * The APIs introduced in these classes with Lucene 2.9 might change in the future. * We will make our best efforts to keep the APIs backwards-compatible. */ -public abstract class Attribute implements Cloneable, Serializable { - /** - * Clears the values in this Attribute and resets it to its - * default value. - */ - public abstract void clear(); - - /** - * Subclasses must implement this method and should follow a syntax - * similar to this one: - * - *

    -   *   public String toString() {
    -   *     return "start=" + startOffset + ",end=" + endOffset;
    -   *   }
    -   * 
    - */ - public abstract String toString(); - - /** - * Subclasses must implement this method and should compute - * a hashCode similar to this: - *
    -   *   public int hashCode() {
    -   *     int code = startOffset;
    -   *     code = code * 31 + endOffset;
    -   *     return code;
    -   *   }
    -   * 
    - * - * see also {@link #equals(Object)} - */ - public abstract int hashCode(); - - /** - * All values used for computation of {@link #hashCode()} - * should be checked here for equality. - * - * see also {@link Object#equals(Object)} - */ - public abstract boolean equals(Object other); - - /** - * Copies the values from this Attribute into the passed-in - * target attribute. The type of the target must match the type - * of this attribute. - */ - public abstract void copyTo(Attribute target); - - /** - * Shallow clone. Subclasses must override this if they - * need to clone any members deeply, - */ - public Object clone() { - Object clone = null; - try { - clone = super.clone(); - } catch (CloneNotSupportedException e) { - throw new RuntimeException(e); // shouldn't happen - } - return clone; - } +public interface Attribute { + public void clear(); } diff --git a/src/java/org/apache/lucene/util/AttributeImpl.java b/src/java/org/apache/lucene/util/AttributeImpl.java new file mode 100644 index 00000000000..976cbec7a04 --- /dev/null +++ b/src/java/org/apache/lucene/util/AttributeImpl.java @@ -0,0 +1,123 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; +import java.lang.reflect.Field; + +/** + * Base class for Attributes that can be added to a + * {@link org.apache.lucene.util.AttributeSource}. + *

    + * Attributes are used to add data in a dynamic, yet type-safe way to a source + * of usually streamed objects, e. g. a {@link org.apache.lucene.analysis.TokenStream}. + *

    + * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental. + * The APIs introduced in these classes with Lucene 2.9 might change in the future. + * We will make our best efforts to keep the APIs backwards-compatible. + */ +public abstract class AttributeImpl implements Cloneable, Serializable { + /** + * Clears the values in this Attribute and resets it to its + * default value. + */ + public abstract void clear(); + + /** + * The default implementation of this method accesses all declared + * fields of this object and prints the values in the following syntax: + * + *

    +   *   public String toString() {
    +   *     return "start=" + startOffset + ",end=" + endOffset;
    +   *   }
    +   * 
    + * + * This method may be overridden by subclasses. + */ + public String toString() { + StringBuffer buffer = new StringBuffer(); + Class clazz = this.getClass(); + Field[] fields = clazz.getDeclaredFields(); + try { + for (int i = 0; i < fields.length; i++) { + Field f = fields[i]; + f.setAccessible(true); + Object value = f.get(this); + if (value == null) { + buffer.append(f.getName() + "=null"); + } else { + buffer.append(f.getName() + "=" + value); + } + if (i < fields.length - 1) { + buffer.append(','); + } + } + } catch (IllegalAccessException e) { + // this should never happen, because we're just accessing fields + // from 'this' + throw new RuntimeException(e); + } + + return buffer.toString(); + } + + /** + * Subclasses must implement this method and should compute + * a hashCode similar to this: + *
    +   *   public int hashCode() {
    +   *     int code = startOffset;
    +   *     code = code * 31 + endOffset;
    +   *     return code;
    +   *   }
    +   * 
    + * + * see also {@link #equals(Object)} + */ + public abstract int hashCode(); + + /** + * All values used for computation of {@link #hashCode()} + * should be checked here for equality. + * + * see also {@link Object#equals(Object)} + */ + public abstract boolean equals(Object other); + + /** + * Copies the values from this Attribute into the passed-in + * target attribute. The type of the target must match the type + * of this attribute. + */ + public abstract void copyTo(AttributeImpl target); + + /** + * Shallow clone. Subclasses must override this if they + * need to clone any members deeply, + */ + public Object clone() { + Object clone = null; + try { + clone = super.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); // shouldn't happen + } + return clone; + } +} diff --git a/src/java/org/apache/lucene/util/AttributeSource.java b/src/java/org/apache/lucene/util/AttributeSource.java index 1ac06a941c8..f596c8ee0fc 100644 --- a/src/java/org/apache/lucene/util/AttributeSource.java +++ b/src/java/org/apache/lucene/util/AttributeSource.java @@ -18,14 +18,17 @@ package org.apache.lucene.util; */ import java.util.Iterator; +import java.util.Collections; import java.util.LinkedHashMap; +import java.util.IdentityHashMap; +import java.util.LinkedList; import java.util.Map; +import java.util.Map.Entry; -import org.apache.lucene.analysis.TokenStream; - +import org.apache.lucene.analysis.TokenStream; // for javadocs /** - * An AttributeSource contains a list of different {@link Attribute}s, + * An AttributeSource contains a list of different {@link AttributeImpl}s, * and methods to add and get them. There can only be a single instance * of an attribute in the same AttributeSource instance. This is ensured * by passing in the actual type of the Attribute (Class<Attribute>) to @@ -40,43 +43,147 @@ import org.apache.lucene.analysis.TokenStream; */ public class AttributeSource { /** - * An AttributeAcceptor defines only a single method {@link #accept(Class)}. - * It can be used for e. g. buffering purposes to specify which attributes - * to buffer. + * An AttributeFactory creates instances of {@link AttributeImpl}s. */ - public static abstract class AttributeAcceptor { - /** Return true, to accept this attribute; false otherwise */ - public abstract boolean accept(Class attClass); + public static abstract class AttributeFactory { + /** + * returns an {@link AttributeImpl} for the supplied {@link Attribute} interface class. + */ + public abstract AttributeImpl createAttributeInstance(Class attClass); + + /** + * This is the default factory that creates {@link AttributeImpl}s using the + * class name of the supplied {@link Attribute} interface class by appending Impl to it. 
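Because DefaultAttributeFactory resolves an Attribute interface by loading the class named after the interface plus "Impl", a user-defined attribute only works with the default factory if its interface and implementation follow that convention and live in the same package. A hedged sketch of such a pair, with all names invented:

    // Sketch: a custom attribute pair following the naming convention that
    // DefaultAttributeFactory expects. All names here are invented.
    public interface LanguageAttribute extends Attribute {
      public void setLanguage(String language);
      public String getLanguage();
    }

    public class LanguageAttributeImpl extends AttributeImpl implements LanguageAttribute {
      private String language = "en";

      public void setLanguage(String language) { this.language = language; }
      public String getLanguage() { return language; }

      // called by AttributeSource.clearAttributes(): reset to the default value
      public void clear() { language = "en"; }

      public void copyTo(AttributeImpl target) {
        ((LanguageAttribute) target).setLanguage(language);
      }

      public boolean equals(Object other) {
        return other instanceof LanguageAttributeImpl
            && language.equals(((LanguageAttributeImpl) other).language);
      }

      public int hashCode() { return language.hashCode(); }
    }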
+ */ + public static final AttributeFactory DEFAULT_ATTRIBUTE_FACTORY = new DefaultAttributeFactory(); + + private static final class DefaultAttributeFactory extends AttributeFactory { + private static final IdentityHashMap/*,Class>*/ attClassImplMap = new IdentityHashMap(); + + private DefaultAttributeFactory() {} + + public AttributeImpl createAttributeInstance(Class attClass) { + try { + return (AttributeImpl) getClassForInterface(attClass).newInstance(); + } catch (InstantiationException e) { + throw new IllegalArgumentException("Could not instantiate class " + attClass); + } catch (IllegalAccessException e) { + throw new IllegalArgumentException("Could not instantiate class " + attClass); + } + } + + private static Class getClassForInterface(Class attClass) { + synchronized(attClassImplMap) { + Class clazz = (Class) attClassImplMap.get(attClass); + if (clazz == null) { + try { + attClassImplMap.put(attClass, clazz = Class.forName(attClass.getName() + "Impl")); + } catch (ClassNotFoundException e) { + throw new IllegalArgumentException("Could not find implementing class for " + attClass.getName()); + } + } + return clazz; + } + } + } } + + // These two maps must always be in sync!!! + // So they are private, final and read-only from the outside (read-only iterators) + private final Map/*,AttributeImpl>*/ attributes; + private final Map/*,AttributeImpl>*/ attributeImpls; + + private AttributeFactory factory; /** - * Default AttributeAcceptor that accepts all attributes. + * An AttributeSource using the default attribute factory {@link AttributeFactory#DEFAULT_ATTRIBUTE_FACTORY}. */ - public static final AttributeAcceptor AllAcceptor = new AttributeAcceptor() { - public boolean accept(Class attClass) {return true;} - }; - - /** - * Holds the Class<Attribute> -> Attribute mapping - */ - protected Map attributes; - public AttributeSource() { - this.attributes = new LinkedHashMap(); + this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); } + /** + * An AttributeSource that uses the same attributes as the supplied one. + */ public AttributeSource(AttributeSource input) { if (input == null) { throw new IllegalArgumentException("input AttributeSource must not be null"); } this.attributes = input.attributes; + this.attributeImpls = input.attributeImpls; + this.factory = input.factory; } - /** Returns an iterator that iterates the attributes + /** + * An AttributeSource using the supplied {@link AttributeFactory} for creating new {@link Attribute} instances. + */ + public AttributeSource(AttributeFactory factory) { + this.attributes = new LinkedHashMap(); + this.attributeImpls = new LinkedHashMap(); + this.factory = factory; + } + + /** + * Returns the used AttributeFactory. + */ + public AttributeFactory getAttributeFactory() { + return this.factory; + } + + /** Returns a new iterator that iterates the attribute classes * in the same order they were added in. */ - public Iterator getAttributesIterator() { - return attributes.values().iterator(); + public Iterator/*>*/ getAttributeClassesIterator() { + return Collections.unmodifiableSet(attributes.keySet()).iterator(); + } + + /** Returns a new iterator that iterates all unique Attribute implementations. + * This iterator may contain fewer entries than {@link #getAttributeClassesIterator}, + * if one instance implements more than one Attribute interface.
+ */ + public Iterator/**/ getAttributeImplsIterator() { + return Collections.unmodifiableCollection(attributeImpls.values()).iterator(); + } + + /** a cache that stores all interfaces for known implementation classes for performance (slow reflection) */ + private static final IdentityHashMap/*,LinkedList>>*/ knownImplClasses = new IdentityHashMap(); + + /** Adds a custom AttributeImpl instance with one or more Attribute interfaces. */ + public void addAttributeImpl(final AttributeImpl att) { + final Class clazz = att.getClass(); + if (attributeImpls.containsKey(clazz)) return; + LinkedList foundInterfaces; + synchronized(knownImplClasses) { + foundInterfaces = (LinkedList) knownImplClasses.get(clazz); + if (foundInterfaces == null) { + knownImplClasses.put(clazz, foundInterfaces=new LinkedList()); + // find all interfaces that this attribute instance implements + // and that extend the Attribute interface + Class actClazz = clazz; + do { + Class[] interfaces = actClazz.getInterfaces(); + for (int i = 0; i < interfaces.length; i++) { + final Class curInterface = interfaces[i]; + if (Attribute.class.isAssignableFrom(curInterface)) { + foundInterfaces.add(curInterface); + } + } + actClazz = actClazz.getSuperclass(); + } while (actClazz != null); + } + } + + // add all interfaces of this AttributeImpl to the maps + for (Iterator it = foundInterfaces.iterator(); it.hasNext(); ) { + final Class curInterface = (Class) it.next(); + // Attribute is a superclass of this interface + if (!attributes.containsKey(curInterface)) { + // invalidate state to force recomputation in captureState() + this.currentState = null; + attributes.put(curInterface, att); + attributeImpls.put(clazz, att); + } + } } /** @@ -85,18 +192,11 @@ public class AttributeSource { * already in this AttributeSource and returns it. Otherwise a * new instance is created, added to this AttributeSource and returned. */ - public Attribute addAttribute(Class attClass) { - Attribute att = (Attribute) attributes.get(attClass); + public AttributeImpl addAttribute(Class attClass) { + AttributeImpl att = (AttributeImpl) attributes.get(attClass); if (att == null) { - try { - att = (Attribute) attClass.newInstance(); - } catch (InstantiationException e) { - throw new IllegalArgumentException("Could not instantiate class " + attClass); - } catch (IllegalAccessException e) { - throw new IllegalArgumentException("Could not instantiate class " + attClass); - } - - attributes.put(attClass, att); + att = this.factory.createAttributeInstance(attClass); + addAttributeImpl(att); } return att; } @@ -121,10 +221,10 @@ public class AttributeSource { * @throws IllegalArgumentException if this AttributeSource does not contain the * Attribute */ - public Attribute getAttribute(Class attClass) { - Attribute att = (Attribute) this.attributes.get(attClass); + public AttributeImpl getAttribute(Class attClass) { + AttributeImpl att = (AttributeImpl) this.attributes.get(attClass); if (att == null) { - throw new IllegalArgumentException("This token does not have the attribute '" + attClass + "'."); + throw new IllegalArgumentException("This AttributeSource does not have the attribute '" + attClass + "'."); } return att; @@ -132,52 +232,72 @@ public class AttributeSource { /** * Resets all Attributes in this AttributeSource by calling - * {@link Attribute#clear()} on each Attribute. + * {@link AttributeImpl#clear()} on each Attribute implementation. 
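addAttributeImpl() registers a single instance under every Attribute interface it implements, so one object can back several attributes at once. Usage might look like this sketch, where CombinedAttributeImpl stands for any hypothetical implementation of both interfaces:

    // Sketch: one implementation object serving two attribute interfaces.
    // CombinedAttributeImpl is hypothetical; it would extend AttributeImpl
    // and implement both TermAttribute and OffsetAttribute.
    AttributeSource source = new AttributeSource();
    source.addAttributeImpl(new CombinedAttributeImpl());
    // both lookups now return the same underlying instance
    TermAttribute termAtt = (TermAttribute) source.getAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) source.getAttribute(OffsetAttribute.class);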
*/ public void clearAttributes() { - Iterator it = getAttributesIterator(); + Iterator it = getAttributeImplsIterator(); while (it.hasNext()) { - ((Attribute) it.next()).clear(); + ((AttributeImpl) it.next()).clear(); } } /** - * Captures the current state of the passed in TokenStream. - *

    - * This state will contain all of the passed in TokenStream's - * {@link Attribute}s. If only a subset of the attributes is needed - * please use {@link #captureState(AttributeAcceptor)} + * This class holds the state of an AttributeSource. + * @see #captureState + * @see #restoreState */ - public AttributeSource captureState() { - return captureState(AllAcceptor); - } - - /** - * Captures the current state of the passed in TokenStream. - *

- * This state will contain all of the passed in TokenStream's - * {@link Attribute}s which the {@link AttributeAcceptor} accepts. */ - public AttributeSource captureState(AttributeAcceptor acceptor) { - AttributeSource state = new AttributeSource(); - - Iterator it = getAttributesIterator(); - while(it.hasNext()) { - Attribute att = (Attribute) it.next(); - if (acceptor.accept(att.getClass())) { - Attribute clone = (Attribute) att.clone(); - state.attributes.put(att.getClass(), clone); - } - } + public static final class State implements Cloneable { + private AttributeImpl attribute; + private State next; - return state; + public Object clone() { + State clone = new State(); + clone.attribute = (AttributeImpl) attribute.clone(); + + if (next != null) { + clone.next = (State) next.clone(); + } + + return clone; + } + } + + private State currentState = null; + + private void computeCurrentState() { + currentState = new State(); + State c = currentState; + Iterator it = getAttributeImplsIterator(); + c.attribute = (AttributeImpl) it.next(); + while (it.hasNext()) { + c.next = new State(); + c = c.next; + c.attribute = (AttributeImpl) it.next(); + } } /** - * Restores this state by copying the values of all attributes - * that this state contains into the attributes of the targetStream. + * Captures the state of all Attributes. The return value can be passed to + * {@link #restoreState} to restore the state of this or another AttributeSource. + */ + public State captureState() { + if (!hasAttributes()) { + return null; + } + + if (currentState == null) { + computeCurrentState(); + } + return (State) this.currentState.clone(); + } + + /** + * Restores this state by copying the values of all attribute implementations + * that this state contains into the attribute implementations of the targetStream. * The targetStream must contain a corresponding instance for each argument - contained in this state. + contained in this state (e.g. it is not possible to restore the state of + an AttributeSource containing a TermAttribute into an AttributeSource using + a Token instance as implementation). *
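captureState() and restoreState() replace the old approach of cloning the whole AttributeSource with a lightweight linked list of cloned implementations. A typical consumer is a buffering TokenFilter; a minimal sketch follows (the filter name is invented, and a real filter would also set the position increment of the repeated token to 0):

    // Sketch: a filter that emits every token twice via state capture.
    final class RepeatFilter extends TokenFilter {
      private AttributeSource.State pending = null;

      RepeatFilter(TokenStream input) { super(input); }

      public boolean incrementToken() throws IOException {
        if (pending != null) {
          restoreState(pending); // replay the captured attribute values
          pending = null;
          return true;
        }
        if (!input.incrementToken()) return false;
        pending = captureState(); // snapshot of all attribute values
        return true;
      }
    }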

    * Note that this method does not affect attributes of the targetStream * that are not contained in this state. In other words, if for example @@ -186,19 +306,22 @@ public class AttributeSource { * reset its value to the default, in which case the caller should first * call {@link TokenStream#clearAttributes()} on the targetStream. */ - public void restoreState(AttributeSource target) { - Iterator it = getAttributesIterator(); - while (it.hasNext()) { - Attribute att = (Attribute) it.next(); - Attribute targetAtt = target.getAttribute(att.getClass()); - att.copyTo(targetAtt); - } + public void restoreState(State state) { + if (state == null) return; + + do { + AttributeImpl targetImpl = (AttributeImpl) attributeImpls.get(state.attribute.getClass()); + if (targetImpl == null) + throw new IllegalArgumentException("State contains an AttributeImpl that is not in this AttributeSource"); + state.attribute.copyTo(targetImpl); + state = state.next; + } while (state != null); } - + public int hashCode() { int code = 0; if (hasAttributes()) { - Iterator it = getAttributesIterator(); + Iterator it = getAttributeImplsIterator(); while (it.hasNext()) { code = code * 31 + it.next().hashCode(); } @@ -220,16 +343,17 @@ public class AttributeSource { return false; } - if (attributes.size() != other.attributes.size()) { + if (this.attributeImpls.size() != other.attributeImpls.size()) { return false; } - Iterator it = getAttributesIterator(); - while (it.hasNext()) { - Class attName = it.next().getClass(); - - Attribute otherAtt = (Attribute) other.attributes.get(attName); - if (otherAtt == null || !otherAtt.equals(attributes.get(attName))) { + // it is only equal if all attribute impls are the same in the same order + Iterator thisIt = this.getAttributeImplsIterator(); + Iterator otherIt = other.getAttributeImplsIterator(); + while (thisIt.hasNext() && otherIt.hasNext()) { + AttributeImpl thisAtt = (AttributeImpl) thisIt.next(); + AttributeImpl otherAtt = (AttributeImpl) otherIt.next(); + if (otherAtt.getClass() != thisAtt.getClass() || !otherAtt.equals(thisAtt)) { return false; } } @@ -240,38 +364,48 @@ public class AttributeSource { } else return false; } - -// TODO: Java 1.5 -// private Map, Attribute> attributes; -// public T addAttribute(Class attClass) { -// T att = (T) attributes.get(attClass); -// if (att == null) { -// try { -// att = attClass.newInstance(); -// } catch (InstantiationException e) { -// throw new IllegalArgumentException("Could not instantiate class " + attClass); -// } catch (IllegalAccessException e) { -// throw new IllegalArgumentException("Could not instantiate class " + attClass); -// } -// -// attributes.put(attClass, att); -// } -// return att; -// } -// -// public boolean hasAttribute(Class attClass) { -// return this.attributes.containsKey(attClass); -// } -// -// public T getAttribute(Class attClass) { -// Attribute att = this.attributes.get(attClass); -// if (att == null) { -// throw new IllegalArgumentException("This token does not have the attribute '" + attClass + "'."); -// } -// -// return (T) att; -// } -// + public String toString() { + StringBuffer sb = new StringBuffer(); + sb.append('('); + + if (hasAttributes()) { + Iterator it = getAttributeImplsIterator(); + if (it.hasNext()) { + sb.append(it.next().toString()); + } + while (it.hasNext()) { + sb.append(','); + sb.append(it.next().toString()); + } + } + sb.append(')'); + return sb.toString(); + } + + /** + * Performs a clone of all {@link AttributeImpl} instances returned in a new + * AttributeSource 
instance. This method can be used to e.g. create another TokenStream + * with exactly the same attributes (using {@link #AttributeSource(AttributeSource)}) + */ + public AttributeSource cloneAttributes() { + AttributeSource clone = new AttributeSource(this.factory); + + // first clone the impls + Iterator/**/ implIt = getAttributeImplsIterator(); + while (implIt.hasNext()) { + AttributeImpl impl = (AttributeImpl) implIt.next(); + clone.attributeImpls.put(impl.getClass(), impl.clone()); + } + + // now the interfaces + Iterator/*, AttributeImpl>>*/ attIt = this.attributes.entrySet().iterator(); + while (attIt.hasNext()) { + Entry/*, AttributeImpl>*/ entry = (Entry/*, AttributeImpl>*/) attIt.next(); + clone.attributes.put(entry.getKey(), clone.attributeImpls.get(entry.getValue().getClass())); + } + + return clone; + } } diff --git a/src/test/org/apache/lucene/analysis/TestASCIIFoldingFilter.java b/src/test/org/apache/lucene/analysis/TestASCIIFoldingFilter.java index ca7e1c53b02..d2c949f6533 100644 --- a/src/test/org/apache/lucene/analysis/TestASCIIFoldingFilter.java +++ b/src/test/org/apache/lucene/analysis/TestASCIIFoldingFilter.java @@ -17,6 +17,7 @@ package org.apache.lucene.analysis; * limitations under the License. */ +import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.LuceneTestCase; import java.io.StringReader; @@ -34,84 +35,84 @@ public class TestASCIIFoldingFilter extends LuceneTestCase { +" ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl")); ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream); - final Token reusableToken = new Token(); + TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class); - assertEquals("Des", filter.next(reusableToken).term()); - assertEquals("mot", filter.next(reusableToken).term()); - assertEquals("cles", filter.next(reusableToken).term()); - assertEquals("A", filter.next(reusableToken).term()); - assertEquals("LA", filter.next(reusableToken).term()); - assertEquals("CHAINE", filter.next(reusableToken).term()); - assertEquals("A", filter.next(reusableToken).term()); - assertEquals("A", filter.next(reusableToken).term()); - assertEquals("A", filter.next(reusableToken).term()); - assertEquals("A", filter.next(reusableToken).term()); - assertEquals("A", filter.next(reusableToken).term()); - assertEquals("A", filter.next(reusableToken).term()); - assertEquals("AE", filter.next(reusableToken).term()); - assertEquals("C", filter.next(reusableToken).term()); - assertEquals("E", filter.next(reusableToken).term()); - assertEquals("E", filter.next(reusableToken).term()); - assertEquals("E", filter.next(reusableToken).term()); - assertEquals("E", filter.next(reusableToken).term()); - assertEquals("I", filter.next(reusableToken).term()); - assertEquals("I", filter.next(reusableToken).term()); - assertEquals("I", filter.next(reusableToken).term()); - assertEquals("I", filter.next(reusableToken).term()); - assertEquals("IJ", filter.next(reusableToken).term()); - assertEquals("D", filter.next(reusableToken).term()); - assertEquals("N", filter.next(reusableToken).term()); - assertEquals("O", filter.next(reusableToken).term()); - assertEquals("O", filter.next(reusableToken).term()); - assertEquals("O", filter.next(reusableToken).term()); - assertEquals("O", filter.next(reusableToken).term()); - assertEquals("O", filter.next(reusableToken).term()); - assertEquals("O", filter.next(reusableToken).term()); - assertEquals("OE", filter.next(reusableToken).term()); - assertEquals("TH", 
filter.next(reusableToken).term()); - assertEquals("U", filter.next(reusableToken).term()); - assertEquals("U", filter.next(reusableToken).term()); - assertEquals("U", filter.next(reusableToken).term()); - assertEquals("U", filter.next(reusableToken).term()); - assertEquals("Y", filter.next(reusableToken).term()); - assertEquals("Y", filter.next(reusableToken).term()); - assertEquals("a", filter.next(reusableToken).term()); - assertEquals("a", filter.next(reusableToken).term()); - assertEquals("a", filter.next(reusableToken).term()); - assertEquals("a", filter.next(reusableToken).term()); - assertEquals("a", filter.next(reusableToken).term()); - assertEquals("a", filter.next(reusableToken).term()); - assertEquals("ae", filter.next(reusableToken).term()); - assertEquals("c", filter.next(reusableToken).term()); - assertEquals("e", filter.next(reusableToken).term()); - assertEquals("e", filter.next(reusableToken).term()); - assertEquals("e", filter.next(reusableToken).term()); - assertEquals("e", filter.next(reusableToken).term()); - assertEquals("i", filter.next(reusableToken).term()); - assertEquals("i", filter.next(reusableToken).term()); - assertEquals("i", filter.next(reusableToken).term()); - assertEquals("i", filter.next(reusableToken).term()); - assertEquals("ij", filter.next(reusableToken).term()); - assertEquals("d", filter.next(reusableToken).term()); - assertEquals("n", filter.next(reusableToken).term()); - assertEquals("o", filter.next(reusableToken).term()); - assertEquals("o", filter.next(reusableToken).term()); - assertEquals("o", filter.next(reusableToken).term()); - assertEquals("o", filter.next(reusableToken).term()); - assertEquals("o", filter.next(reusableToken).term()); - assertEquals("o", filter.next(reusableToken).term()); - assertEquals("oe", filter.next(reusableToken).term()); - assertEquals("ss", filter.next(reusableToken).term()); - assertEquals("th", filter.next(reusableToken).term()); - assertEquals("u", filter.next(reusableToken).term()); - assertEquals("u", filter.next(reusableToken).term()); - assertEquals("u", filter.next(reusableToken).term()); - assertEquals("u", filter.next(reusableToken).term()); - assertEquals("y", filter.next(reusableToken).term()); - assertEquals("y", filter.next(reusableToken).term()); - assertEquals("fi", filter.next(reusableToken).term()); - assertEquals("fl", filter.next(reusableToken).term()); - assertNull(filter.next(reusableToken)); + assertTermEquals("Des", filter, termAtt); + assertTermEquals("mot", filter, termAtt); + assertTermEquals("cles", filter, termAtt); + assertTermEquals("A", filter, termAtt); + assertTermEquals("LA", filter, termAtt); + assertTermEquals("CHAINE", filter, termAtt); + assertTermEquals("A", filter, termAtt); + assertTermEquals("A", filter, termAtt); + assertTermEquals("A", filter, termAtt); + assertTermEquals("A", filter, termAtt); + assertTermEquals("A", filter, termAtt); + assertTermEquals("A", filter, termAtt); + assertTermEquals("AE", filter, termAtt); + assertTermEquals("C", filter, termAtt); + assertTermEquals("E", filter, termAtt); + assertTermEquals("E", filter, termAtt); + assertTermEquals("E", filter, termAtt); + assertTermEquals("E", filter, termAtt); + assertTermEquals("I", filter, termAtt); + assertTermEquals("I", filter, termAtt); + assertTermEquals("I", filter, termAtt); + assertTermEquals("I", filter, termAtt); + assertTermEquals("IJ", filter, termAtt); + assertTermEquals("D", filter, termAtt); + assertTermEquals("N", filter, termAtt); + assertTermEquals("O", filter, termAtt); + 
assertTermEquals("O", filter, termAtt); + assertTermEquals("O", filter, termAtt); + assertTermEquals("O", filter, termAtt); + assertTermEquals("O", filter, termAtt); + assertTermEquals("O", filter, termAtt); + assertTermEquals("OE", filter, termAtt); + assertTermEquals("TH", filter, termAtt); + assertTermEquals("U", filter, termAtt); + assertTermEquals("U", filter, termAtt); + assertTermEquals("U", filter, termAtt); + assertTermEquals("U", filter, termAtt); + assertTermEquals("Y", filter, termAtt); + assertTermEquals("Y", filter, termAtt); + assertTermEquals("a", filter, termAtt); + assertTermEquals("a", filter, termAtt); + assertTermEquals("a", filter, termAtt); + assertTermEquals("a", filter, termAtt); + assertTermEquals("a", filter, termAtt); + assertTermEquals("a", filter, termAtt); + assertTermEquals("ae", filter, termAtt); + assertTermEquals("c", filter, termAtt); + assertTermEquals("e", filter, termAtt); + assertTermEquals("e", filter, termAtt); + assertTermEquals("e", filter, termAtt); + assertTermEquals("e", filter, termAtt); + assertTermEquals("i", filter, termAtt); + assertTermEquals("i", filter, termAtt); + assertTermEquals("i", filter, termAtt); + assertTermEquals("i", filter, termAtt); + assertTermEquals("ij", filter, termAtt); + assertTermEquals("d", filter, termAtt); + assertTermEquals("n", filter, termAtt); + assertTermEquals("o", filter, termAtt); + assertTermEquals("o", filter, termAtt); + assertTermEquals("o", filter, termAtt); + assertTermEquals("o", filter, termAtt); + assertTermEquals("o", filter, termAtt); + assertTermEquals("o", filter, termAtt); + assertTermEquals("oe", filter, termAtt); + assertTermEquals("ss", filter, termAtt); + assertTermEquals("th", filter, termAtt); + assertTermEquals("u", filter, termAtt); + assertTermEquals("u", filter, termAtt); + assertTermEquals("u", filter, termAtt); + assertTermEquals("u", filter, termAtt); + assertTermEquals("y", filter, termAtt); + assertTermEquals("y", filter, termAtt); + assertTermEquals("fi", filter, termAtt); + assertTermEquals("fl", filter, termAtt); + assertFalse(filter.incrementToken()); } @@ -1891,11 +1892,16 @@ public class TestASCIIFoldingFilter extends LuceneTestCase { TokenStream stream = new WhitespaceTokenizer(new StringReader(inputText.toString())); ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream); - final Token reusableToken = new Token(); + TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class); Iterator expectedIter = expectedOutputTokens.iterator(); - while (expectedIter.hasNext()) { - assertEquals(expectedIter.next(), filter.next(reusableToken).term()); + while (expectedIter.hasNext()) {; + assertTermEquals((String)expectedIter.next(), filter, termAtt); } - assertNull(filter.next(reusableToken)); + assertFalse(filter.incrementToken()); + } + + void assertTermEquals(String expected, TokenStream stream, TermAttribute termAtt) throws Exception { + assertTrue(stream.incrementToken()); + assertEquals(expected, termAtt.term()); } } diff --git a/src/test/org/apache/lucene/analysis/TestNumericTokenStream.java b/src/test/org/apache/lucene/analysis/TestNumericTokenStream.java index e9f73019512..37abbb26529 100644 --- a/src/test/org/apache/lucene/analysis/TestNumericTokenStream.java +++ b/src/test/org/apache/lucene/analysis/TestNumericTokenStream.java @@ -27,9 +27,8 @@ public class TestNumericTokenStream extends LuceneTestCase { static final long lvalue = 4573245871874382L; static final int ivalue = 123456; - public void testLongStreamNewAPI() throws Exception { + public 
void testLongStream() throws Exception { final NumericTokenStream stream=new NumericTokenStream().setLongValue(lvalue); - stream.setUseNewAPI(true); // use getAttribute to test if attributes really exist, if not an IAE will be thrown final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class); final TypeAttribute typeAtt = (TypeAttribute) stream.getAttribute(TypeAttribute.class); @@ -40,22 +39,9 @@ public class TestNumericTokenStream extends LuceneTestCase { } assertFalse("No more tokens available", stream.incrementToken()); } - - public void testLongStreamOldAPI() throws Exception { - final NumericTokenStream stream=new NumericTokenStream().setLongValue(lvalue); - stream.setUseNewAPI(false); - Token tok=new Token(); - for (int shift=0; shift<64; shift+=NumericUtils.PRECISION_STEP_DEFAULT) { - assertNotNull("New token is available", tok=stream.next(tok)); - assertEquals("Term is correctly encoded", NumericUtils.longToPrefixCoded(lvalue, shift), tok.term()); - assertEquals("Type correct", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, tok.type()); - } - assertNull("No more tokens available", stream.next(tok)); - } - - public void testIntStreamNewAPI() throws Exception { + public void testIntStream() throws Exception { final NumericTokenStream stream=new NumericTokenStream().setIntValue(ivalue); - stream.setUseNewAPI(true); // use getAttribute to test if attributes really exist, if not an IAE will be thrown final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class); final TypeAttribute typeAtt = (TypeAttribute) stream.getAttribute(TypeAttribute.class); @@ -67,18 +53,6 @@ public class TestNumericTokenStream extends LuceneTestCase { assertFalse("No more tokens available", stream.incrementToken()); } - public void testIntStreamOldAPI() throws Exception { - final NumericTokenStream stream=new NumericTokenStream().setIntValue(ivalue); - stream.setUseNewAPI(false); - Token tok=new Token(); - for (int shift=0; shift<32; shift+=NumericUtils.PRECISION_STEP_DEFAULT) { - assertNotNull("New token is available", tok=stream.next(tok)); - assertEquals("Term is correctly encoded", NumericUtils.intToPrefixCoded(ivalue, shift), tok.term()); - assertEquals("Type correct", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, tok.type()); - } - assertNull("No more tokens available", stream.next(tok)); - } - public void testNotInitialized() throws Exception { final NumericTokenStream stream=new NumericTokenStream(); @@ -89,21 +63,12 @@ // pass } - stream.setUseNewAPI(true); try { stream.incrementToken(); fail("incrementToken() should not succeed."); } catch (IllegalStateException e) { // pass } - - stream.setUseNewAPI(false); - try { - stream.next(new Token()); - fail("next() should not succeed."); - } catch (IllegalStateException e) { - // pass - } } } diff --git a/src/test/org/apache/lucene/analysis/TestTeeSinkTokenFilter.java b/src/test/org/apache/lucene/analysis/TestTeeSinkTokenFilter.java new file mode 100644 index 00000000000..812ab8bd4aa --- /dev/null +++ b/src/test/org/apache/lucene/analysis/TestTeeSinkTokenFilter.java @@ -0,0 +1,267 @@ +package org.apache.lucene.analysis; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.English; +import org.apache.lucene.util.LuceneTestCase; + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; + +/** + * tests for the TestTeeSinkTokenFilter + */ +public class TestTeeSinkTokenFilter extends LuceneTestCase { + protected StringBuffer buffer1; + protected StringBuffer buffer2; + protected String[] tokens1; + protected String[] tokens2; + + + public TestTeeSinkTokenFilter(String s) { + super(s); + } + + protected void setUp() throws Exception { + super.setUp(); + tokens1 = new String[]{"The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs"}; + tokens2 = new String[]{"The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch"}; + buffer1 = new StringBuffer(); + + for (int i = 0; i < tokens1.length; i++) { + buffer1.append(tokens1[i]).append(' '); + } + buffer2 = new StringBuffer(); + for (int i = 0; i < tokens2.length; i++) { + buffer2.append(tokens2[i]).append(' '); + } + } + + static final TeeSinkTokenFilter.SinkFilter theFilter = new TeeSinkTokenFilter.SinkFilter() { + public boolean accept(AttributeSource a) { + TermAttribute termAtt = (TermAttribute) a.getAttribute(TermAttribute.class); + return termAtt.term().equalsIgnoreCase("The"); + } + }; + + static final TeeSinkTokenFilter.SinkFilter dogFilter = new TeeSinkTokenFilter.SinkFilter() { + public boolean accept(AttributeSource a) { + TermAttribute termAtt = (TermAttribute) a.getAttribute(TermAttribute.class); + return termAtt.term().equalsIgnoreCase("Dogs"); + } + }; + + + public void testGeneral() throws IOException { + final TeeSinkTokenFilter source = new TeeSinkTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString()))); + final TokenStream sink1 = source.newSinkTokenStream(); + final TokenStream sink2 = source.newSinkTokenStream(theFilter); + int i = 0; + TermAttribute termAtt = (TermAttribute) source.getAttribute(TermAttribute.class); + while (source.incrementToken()) { + assertEquals(tokens1[i], termAtt.term()); + i++; + } + assertEquals(tokens1.length, i); + + i = 0; + termAtt = (TermAttribute) sink1.getAttribute(TermAttribute.class); + while (sink1.incrementToken()) { + assertEquals(tokens1[i], termAtt.term()); + i++; + } + assertEquals(tokens1.length, i); + + i = 0; + termAtt = (TermAttribute) sink2.getAttribute(TermAttribute.class); + while (sink2.incrementToken()) { + assertTrue(termAtt.term().equalsIgnoreCase("The")); + i++; + } + assertEquals("there should be two times 'the' in the stream", 2, i); + } + + public void testMultipleSources() throws Exception { + final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString()))); + final TeeSinkTokenFilter.SinkTokenStream dogDetector = 
tee1.newSinkTokenStream(dogFilter); + final TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter); + final TokenStream source1 = new CachingTokenFilter(tee1); + + final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new StringReader(buffer2.toString()))); + tee2.addSinkTokenStream(dogDetector); + tee2.addSinkTokenStream(theDetector); + final TokenStream source2 = tee2; + + int i = 0; + TermAttribute termAtt = (TermAttribute) source1.getAttribute(TermAttribute.class); + while (source1.incrementToken()) { + assertEquals(tokens1[i], termAtt.term()); + i++; + } + assertEquals(tokens1.length, i); + i = 0; + termAtt = (TermAttribute) source2.getAttribute(TermAttribute.class); + while (source2.incrementToken()) { + assertEquals(tokens2[i], termAtt.term()); + i++; + } + assertEquals(tokens2.length, i); + i = 0; + termAtt = (TermAttribute) theDetector.getAttribute(TermAttribute.class); + while (theDetector.incrementToken()) { + assertTrue("'" + termAtt.term() + "' is not equal to 'The'", termAtt.term().equalsIgnoreCase("The")); + i++; + } + assertEquals("there must be 4 times 'The' in the stream", 4, i); + i = 0; + termAtt = (TermAttribute) dogDetector.getAttribute(TermAttribute.class); + while (dogDetector.incrementToken()) { + assertTrue("'" + termAtt.term() + "' is not equal to 'Dogs'", termAtt.term().equalsIgnoreCase("Dogs")); + i++; + } + assertEquals("there must be 2 times 'Dog' in the stream", 2, i); + + source1.reset(); + TokenStream lowerCasing = new LowerCaseFilter(source1); + i = 0; + termAtt = (TermAttribute) lowerCasing.getAttribute(TermAttribute.class); + while (lowerCasing.incrementToken()) { + assertEquals(tokens1[i].toLowerCase(), termAtt.term()); + i++; + } + assertEquals(i, tokens1.length); + } + + /** + * Not an explicit test, just useful to print out some info on performance + * + * @throws Exception + */ + public void performance() throws Exception { + int[] tokCount = {100, 500, 1000, 2000, 5000, 10000}; + int[] modCounts = {1, 2, 5, 10, 20, 50, 100, 200, 500}; + for (int k = 0; k < tokCount.length; k++) { + StringBuffer buffer = new StringBuffer(); + System.out.println("-----Tokens: " + tokCount[k] + "-----"); + for (int i = 0; i < tokCount[k]; i++) { + buffer.append(English.intToEnglish(i).toUpperCase()).append(' '); + } + //make sure we produce the same tokens + TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString())))); + TokenStream sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(100)); + teeStream.consumeAllTokens(); + TokenStream stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), 100); + TermAttribute tfTok = (TermAttribute) stream.addAttribute(TermAttribute.class); + TermAttribute sinkTok = (TermAttribute) sink.addAttribute(TermAttribute.class); + for (int i=0; stream.incrementToken(); i++) { + assertTrue(sink.incrementToken()); + assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.equals(sinkTok) == true); + } + + //simulate two fields, each being analyzed once, for 20 documents + for (int j = 0; j < modCounts.length; j++) { + int tfPos = 0; + long start = System.currentTimeMillis(); + for (int i = 0; i < 20; i++) { + stream = new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))); + PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class); + while 
(stream.incrementToken()) { + tfPos += posIncrAtt.getPositionIncrement(); + } + stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), modCounts[j]); + posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class); + while (stream.incrementToken()) { + tfPos += posIncrAtt.getPositionIncrement(); + } + } + long finish = System.currentTimeMillis(); + System.out.println("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms"); + int sinkPos = 0; + //simulate one field with one sink + start = System.currentTimeMillis(); + for (int i = 0; i < 20; i++) { + teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString())))); + sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(modCounts[j])); + PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) teeStream.getAttribute(PositionIncrementAttribute.class); + while (teeStream.incrementToken()) { + sinkPos += posIncrAtt.getPositionIncrement(); + } + //System.out.println("Modulo--------"); + posIncrAtt = (PositionIncrementAttribute) sink.getAttribute(PositionIncrementAttribute.class); + while (sink.incrementToken()) { + sinkPos += posIncrAtt.getPositionIncrement(); + } + } + finish = System.currentTimeMillis(); + System.out.println("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms"); + assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos); + + } + System.out.println("- End Tokens: " + tokCount[k] + "-----"); + } + + } + + + class ModuloTokenFilter extends TokenFilter { + + int modCount; + + ModuloTokenFilter(TokenStream input, int mc) { + super(input); + modCount = mc; + } + + int count = 0; + + //return every 100 tokens + public boolean incrementToken() throws IOException { + boolean hasNext; + for (hasNext = input.incrementToken(); + hasNext && count % modCount != 0; + hasNext = input.incrementToken()) { + count++; + } + count++; + return hasNext; + } + } + + class ModuloSinkFilter implements TeeSinkTokenFilter.SinkFilter { + int count = 0; + int modCount; + + ModuloSinkFilter(int mc) { + modCount = mc; + } + + public boolean accept(AttributeSource a) { + boolean b = (a != null && count % modCount == 0); + count++; + return b; + } + + } +} + diff --git a/src/test/org/apache/lucene/analysis/TestTeeTokenFilter.java b/src/test/org/apache/lucene/analysis/TestTeeTokenFilter.java index 9f4d506a549..7ed6298751b 100644 --- a/src/test/org/apache/lucene/analysis/TestTeeTokenFilter.java +++ b/src/test/org/apache/lucene/analysis/TestTeeTokenFilter.java @@ -18,9 +18,6 @@ package org.apache.lucene.analysis; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.English; import org.apache.lucene.util.LuceneTestCase; @@ -43,8 +40,7 @@ public class TestTeeTokenFilter extends LuceneTestCase { super(s); } - protected void setUp() throws Exception { - super.setUp(); + protected void setUp() { tokens1 = new String[]{"The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs"}; tokens2 = new String[]{"The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch"}; buffer1 = new StringBuffer(); @@ -66,29 +62,24 @@ public class TestTeeTokenFilter extends 
LuceneTestCase { public void test() throws IOException { SinkTokenizer sink1 = new SinkTokenizer(null) { - public void add(AttributeSource a) throws IOException { - TermAttribute termAtt = null; - if (a.hasAttribute(TermAttribute.class)) { - termAtt = (TermAttribute) a.getAttribute(TermAttribute.class); - } - if (termAtt != null && termAtt.term().equalsIgnoreCase("The")) { - super.add(a); + public void add(Token t) { + if (t != null && t.term().equalsIgnoreCase("The")) { + super.add(t); } } }; TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), sink1); int i = 0; - TermAttribute termAtt = (TermAttribute) source.getAttribute(TermAttribute.class); - while (source.incrementToken()) { - assertTrue(termAtt.term() + " is not equal to " + tokens1[i], termAtt.term().equals(tokens1[i]) == true); + final Token reusableToken = new Token(); + for (Token nextToken = source.next(reusableToken); nextToken != null; nextToken = source.next(reusableToken)) { + assertTrue(nextToken.term() + " is not equal to " + tokens1[i], nextToken.term().equals(tokens1[i]) == true); i++; } assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length); assertTrue("sink1 Size: " + sink1.getTokens().size() + " is not: " + 2, sink1.getTokens().size() == 2); i = 0; - termAtt = (TermAttribute) sink1.getAttribute(TermAttribute.class); - while (sink1.incrementToken()) { - assertTrue(termAtt.term() + " is not equal to " + "The", termAtt.term().equalsIgnoreCase("The") == true); + for (Token token = sink1.next(reusableToken); token != null; token = sink1.next(reusableToken)) { + assertTrue(token.term() + " is not equal to " + "The", token.term().equalsIgnoreCase("The") == true); i++; } assertTrue(i + " does not equal: " + sink1.getTokens().size(), i == sink1.getTokens().size()); @@ -96,67 +87,55 @@ public class TestTeeTokenFilter extends LuceneTestCase { public void testMultipleSources() throws Exception { SinkTokenizer theDetector = new SinkTokenizer(null) { - public void add(AttributeSource a) throws IOException { - TermAttribute termAtt = null; - if (a.hasAttribute(TermAttribute.class)) { - termAtt = (TermAttribute) a.getAttribute(TermAttribute.class); - } - if (termAtt != null && termAtt.term().equalsIgnoreCase("The")) { - super.add(a); + public void add(Token t) { + if (t != null && t.term().equalsIgnoreCase("The")) { + super.add(t); } } }; - SinkTokenizer dogDetector = new SinkTokenizer(null) { - public void add(AttributeSource a) throws IOException { - TermAttribute termAtt = null; - if (a.hasAttribute(TermAttribute.class)) { - termAtt = (TermAttribute) a.getAttribute(TermAttribute.class); - } - if (termAtt != null && termAtt.term().equalsIgnoreCase("Dogs")) { - super.add(a); + SinkTokenizer dogDetector = new SinkTokenizer(null) { + public void add(Token t) { + if (t != null && t.term().equalsIgnoreCase("Dogs")) { + super.add(t); } } }; TokenStream source1 = new CachingTokenFilter(new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), theDetector), dogDetector)); TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer2.toString())), theDetector), dogDetector); int i = 0; - TermAttribute termAtt = (TermAttribute) source1.getAttribute(TermAttribute.class); - while (source1.incrementToken()) { - assertTrue(termAtt.term() + " is not equal to " + tokens1[i], termAtt.term().equals(tokens1[i]) == true); + final Token reusableToken = new Token(); + for (Token nextToken = 
source1.next(reusableToken); nextToken != null; nextToken = source1.next(reusableToken)) { + assertTrue(nextToken.term() + " is not equal to " + tokens1[i], nextToken.term().equals(tokens1[i]) == true); i++; } assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length); assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 2, theDetector.getTokens().size() == 2); assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 1, dogDetector.getTokens().size() == 1); i = 0; - termAtt = (TermAttribute) source2.getAttribute(TermAttribute.class); - while (source2.incrementToken()) { - assertTrue(termAtt.term() + " is not equal to " + tokens2[i], termAtt.term().equals(tokens2[i]) == true); + for (Token nextToken = source2.next(reusableToken); nextToken != null; nextToken = source2.next(reusableToken)) { + assertTrue(nextToken.term() + " is not equal to " + tokens2[i], nextToken.term().equals(tokens2[i]) == true); i++; } assertTrue(i + " does not equal: " + tokens2.length, i == tokens2.length); assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 4, theDetector.getTokens().size() == 4); assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 2, dogDetector.getTokens().size() == 2); i = 0; - termAtt = (TermAttribute) theDetector.getAttribute(TermAttribute.class); - while (theDetector.incrementToken()) { - assertTrue(termAtt.term() + " is not equal to " + "The", termAtt.term().equalsIgnoreCase("The") == true); + for (Token nextToken = theDetector.next(reusableToken); nextToken != null; nextToken = theDetector.next(reusableToken)) { + assertTrue(nextToken.term() + " is not equal to " + "The", nextToken.term().equalsIgnoreCase("The") == true); i++; } assertTrue(i + " does not equal: " + theDetector.getTokens().size(), i == theDetector.getTokens().size()); i = 0; - termAtt = (TermAttribute) dogDetector.getAttribute(TermAttribute.class); - while (dogDetector.incrementToken()) { - assertTrue(termAtt.term() + " is not equal to " + "Dogs", termAtt.term().equalsIgnoreCase("Dogs") == true); + for (Token nextToken = dogDetector.next(reusableToken); nextToken != null; nextToken = dogDetector.next(reusableToken)) { + assertTrue(nextToken.term() + " is not equal to " + "Dogs", nextToken.term().equalsIgnoreCase("Dogs") == true); i++; } assertTrue(i + " does not equal: " + dogDetector.getTokens().size(), i == dogDetector.getTokens().size()); source1.reset(); TokenStream lowerCasing = new LowerCaseFilter(source1); i = 0; - termAtt = (TermAttribute) lowerCasing.getAttribute(TermAttribute.class); - while (lowerCasing.incrementToken()) { - assertTrue(termAtt.term() + " is not equal to " + tokens1[i].toLowerCase(), termAtt.term().equals(tokens1[i].toLowerCase()) == true); + for (Token nextToken = lowerCasing.next(reusableToken); nextToken != null; nextToken = lowerCasing.next(reusableToken)) { + assertTrue(nextToken.term() + " is not equal to " + tokens1[i].toLowerCase(), nextToken.term().equals(tokens1[i].toLowerCase()) == true); i++; } assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length); @@ -167,7 +146,7 @@ public class TestTeeTokenFilter extends LuceneTestCase { * * @throws Exception */ - public void doTestPerformance() throws Exception { + public void performance() throws Exception { int[] tokCount = {100, 500, 1000, 2000, 5000, 10000}; int[] modCounts = {1, 2, 5, 10, 20, 50, 100, 200, 500}; for (int k = 0; k < tokCount.length; k++) { @@ -178,20 +157,21 @@ public class 
TestTeeTokenFilter extends LuceneTestCase { } //make sure we produce the same tokens ModuloSinkTokenizer sink = new ModuloSinkTokenizer(tokCount[k], 100); + final Token reusableToken = new Token(); TokenStream stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink); - while (stream.incrementToken()) { + while (stream.next(reusableToken) != null) { } stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), 100); List tmp = new ArrayList(); - while (stream.incrementToken()) { - tmp.add(stream.captureState()); + for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { + tmp.add(nextToken.clone()); } List sinkList = sink.getTokens(); assertTrue("tmp Size: " + tmp.size() + " is not: " + sinkList.size(), tmp.size() == sinkList.size()); for (int i = 0; i < tmp.size(); i++) { - AttributeSource tfTok = (AttributeSource) tmp.get(i); - AttributeSource sinkTok = (AttributeSource) sinkList.get(i); - assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.equals(sinkTok) == true); + Token tfTok = (Token) tmp.get(i); + Token sinkTok = (Token) sinkList.get(i); + assertTrue(tfTok.term() + " is not equal to " + sinkTok.term() + " at token: " + i, tfTok.term().equals(sinkTok.term()) == true); } //simulate two fields, each being analyzed once, for 20 documents @@ -200,14 +180,12 @@ public class TestTeeTokenFilter extends LuceneTestCase { long start = System.currentTimeMillis(); for (int i = 0; i < 20; i++) { stream = new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))); - PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class); - while (stream.incrementToken()) { - tfPos += posIncrAtt.getPositionIncrement(); + for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { + tfPos += nextToken.getPositionIncrement(); } stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), modCounts[j]); - posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class); - while (stream.incrementToken()) { - tfPos += posIncrAtt.getPositionIncrement(); + for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { + tfPos += nextToken.getPositionIncrement(); } } long finish = System.currentTimeMillis(); @@ -218,15 +196,13 @@ public class TestTeeTokenFilter extends LuceneTestCase { for (int i = 0; i < 20; i++) { sink = new ModuloSinkTokenizer(tokCount[k], modCounts[j]); stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink); - PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class); - while (stream.incrementToken()) { - sinkPos += posIncrAtt.getPositionIncrement(); + for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) { + sinkPos += nextToken.getPositionIncrement(); } //System.out.println("Modulo--------"); stream = sink; - posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class); - while (stream.incrementToken()) { - sinkPos += posIncrAtt.getPositionIncrement(); + for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = 
stream.next(reusableToken)) { + sinkPos += nextToken.getPositionIncrement(); } } finish = System.currentTimeMillis(); @@ -252,15 +228,15 @@ public class TestTeeTokenFilter extends LuceneTestCase { int count = 0; //return every 100 tokens - public boolean incrementToken() throws IOException { - boolean hasNext; - for (hasNext = input.incrementToken(); - hasNext && count % modCount != 0; - hasNext = input.incrementToken()) { + public Token next(final Token reusableToken) throws IOException { + Token nextToken = null; + for (nextToken = input.next(reusableToken); + nextToken != null && count % modCount != 0; + nextToken = input.next(reusableToken)) { count++; } count++; - return hasNext; + return nextToken; } } @@ -274,9 +250,9 @@ public class TestTeeTokenFilter extends LuceneTestCase { lst = new ArrayList(numToks % mc); } - public void add(AttributeSource a) throws IOException { - if (a != null && count % modCount == 0) { - super.add(a); + public void add(Token t) { + if (t != null && count % modCount == 0) { + super.add(t); } count++; } diff --git a/src/test/org/apache/lucene/analysis/TestTokenStreamBWComp.java b/src/test/org/apache/lucene/analysis/TestTokenStreamBWComp.java new file mode 100644 index 00000000000..7781226cb8f --- /dev/null +++ b/src/test/org/apache/lucene/analysis/TestTokenStreamBWComp.java @@ -0,0 +1,311 @@ +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.StringReader; + +import org.apache.lucene.index.Payload; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.analysis.tokenattributes.*; + +/** This class tests some special cases of backwards compatibility when using the new TokenStream API with old analyzers */ +public class TestTokenStreamBWComp extends LuceneTestCase { + + private final String doc = "This is the new TokenStream api"; + private final String[] stopwords = new String[] {"is", "the", "this"}; + + public static class POSToken extends Token { + public static final int PROPERNOUN = 1; + public static final int NO_NOUN = 2; + + private int partOfSpeech; + + public void setPartOfSpeech(int pos) { + partOfSpeech = pos; + } + + public int getPartOfSpeech() { + return this.partOfSpeech; + } + } + + static class PartOfSpeechTaggingFilter extends TokenFilter { + + protected PartOfSpeechTaggingFilter(TokenStream input) { + super(input); + } + + public Token next() throws IOException { + Token t = input.next(); + if (t == null) return null; + + POSToken pt = new POSToken(); + pt.reinit(t); + if (pt.termLength() > 0) { + if (Character.isUpperCase(pt.termBuffer()[0])) { + pt.setPartOfSpeech(POSToken.PROPERNOUN); + } else { + pt.setPartOfSpeech(POSToken.NO_NOUN); + } + } + return pt; + } + + } + + static class PartOfSpeechAnnotatingFilter extends TokenFilter { + public final static byte PROPER_NOUN_ANNOTATION = 1; + + + protected PartOfSpeechAnnotatingFilter(TokenStream input) { + super(input); + } + + public Token next() throws IOException { + Token t = input.next(); + if (t == null) return null; + + if (t instanceof POSToken) { + POSToken pt = (POSToken) t; + if (pt.getPartOfSpeech() == POSToken.PROPERNOUN) { + pt.setPayload(new Payload(new byte[] {PROPER_NOUN_ANNOTATION})); + } + return pt; + } else { + return t; + } + } + + } + + // test the chain: the one and only term "TokenStream" should be declared as a proper noun: + + public void testTeeSinkCustomTokenNewAPI() throws IOException { + testTeeSinkCustomToken(0); + } + + public void testTeeSinkCustomTokenOldAPI() throws IOException { + testTeeSinkCustomToken(1); + } + + public void testTeeSinkCustomTokenVeryOldAPI() throws IOException { + testTeeSinkCustomToken(2); + } + + private void testTeeSinkCustomToken(int api) throws IOException { + TokenStream stream = new WhitespaceTokenizer(new StringReader(doc)); + stream = new PartOfSpeechTaggingFilter(stream); + stream = new LowerCaseFilter(stream); + stream = new StopFilter(stream, stopwords); + + SinkTokenizer sink = new SinkTokenizer(); + TokenStream stream1 = new PartOfSpeechAnnotatingFilter(sink); + + stream = new TeeTokenFilter(stream, sink); + stream = new PartOfSpeechAnnotatingFilter(stream); + + switch (api) { + case 0: + consumeStreamNewAPI(stream); + consumeStreamNewAPI(stream1); + break; + case 1: + consumeStreamOldAPI(stream); + consumeStreamOldAPI(stream1); + break; + case 2: + consumeStreamVeryOldAPI(stream); + consumeStreamVeryOldAPI(stream1); + break; + } + } + + // test that caching of the special custom POSToken works in all cases + + public void testCachingCustomTokenNewAPI() throws IOException { + testCachingCustomToken(0); + } + + public void testCachingCustomTokenOldAPI() throws IOException { + testCachingCustomToken(1); + } + + public void testCachingCustomTokenVeryOldAPI() throws IOException { + testCachingCustomToken(2); + } + + public void testCachingCustomTokenMixed() throws IOException { + testCachingCustomToken(3); + } + + private void
testCachingCustomToken(int api) throws IOException { + TokenStream stream = new WhitespaceTokenizer(new StringReader(doc)); + stream = new PartOfSpeechTaggingFilter(stream); + stream = new LowerCaseFilter(stream); + stream = new StopFilter(stream, stopwords); + stream = new CachingTokenFilter(stream); // <- the caching is done before the annotating! + stream = new PartOfSpeechAnnotatingFilter(stream); + + switch (api) { + case 0: + consumeStreamNewAPI(stream); + consumeStreamNewAPI(stream); + break; + case 1: + consumeStreamOldAPI(stream); + consumeStreamOldAPI(stream); + break; + case 2: + consumeStreamVeryOldAPI(stream); + consumeStreamVeryOldAPI(stream); + break; + case 3: + consumeStreamNewAPI(stream); + consumeStreamOldAPI(stream); + consumeStreamVeryOldAPI(stream); + consumeStreamNewAPI(stream); + consumeStreamVeryOldAPI(stream); + break; + } + } + + private static void consumeStreamNewAPI(TokenStream stream) throws IOException { + stream.reset(); + PayloadAttribute payloadAtt = (PayloadAttribute) stream.addAttribute(PayloadAttribute.class); + TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class); + + while (stream.incrementToken()) { + String term = termAtt.term(); + Payload p = payloadAtt.getPayload(); + if (p != null && p.getData().length == 1 && p.getData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION) { + assertTrue("only TokenStream is a proper noun", "tokenstream".equals(term)); + } else { + assertFalse("all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)", "tokenstream".equals(term)); + } + } + } + + private static void consumeStreamOldAPI(TokenStream stream) throws IOException { + stream.reset(); + Token reusableToken = new Token(); + + while ((reusableToken = stream.next(reusableToken)) != null) { + String term = reusableToken.term(); + Payload p = reusableToken.getPayload(); + if (p != null && p.getData().length == 1 && p.getData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION) { + assertTrue("only TokenStream is a proper noun", "tokenstream".equals(term)); + } else { + assertFalse("all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)", "tokenstream".equals(term)); + } + } + } + + private static void consumeStreamVeryOldAPI(TokenStream stream) throws IOException { + stream.reset(); + + Token token; + while ((token = stream.next()) != null) { + String term = token.term(); + Payload p = token.getPayload(); + if (p != null && p.getData().length == 1 && p.getData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION) { + assertTrue("only TokenStream is a proper noun", "tokenstream".equals(term)); + } else { + assertFalse("all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)", "tokenstream".equals(term)); + } + } + } + + // test if tokenization fails, if only the new API is allowed and an old TokenStream is in the chain + public void testOnlyNewAPI() throws IOException { + TokenStream.setOnlyUseNewAPI(true); + try { + + // this should fail with UOE + try { + TokenStream stream = new WhitespaceTokenizer(new StringReader(doc)); + stream = new PartOfSpeechTaggingFilter(stream); // <-- this one is evil! 
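+ // (Editorial note, added for illustration and not part of the original patch:
+ // PartOfSpeechTaggingFilter above overrides only the deprecated next() method,
+ // so once onlyUseNewAPI is enabled the compatibility layer cannot supply
+ // incrementToken() for it, and consuming the chain throws the
+ // UnsupportedOperationException asserted below.)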
+ stream = new LowerCaseFilter(stream); + stream = new StopFilter(stream, stopwords); + while (stream.incrementToken()); fail("If only the new API is allowed, this should fail with an UOE"); + } catch (UnsupportedOperationException uoe) { + assertTrue((PartOfSpeechTaggingFilter.class.getName()+" does not implement incrementToken() which is needed for onlyUseNewAPI.").equals(uoe.getMessage())); + } + + // this should pass, as all core token streams support the new API + TokenStream stream = new WhitespaceTokenizer(new StringReader(doc)); + stream = new LowerCaseFilter(stream); + stream = new StopFilter(stream, stopwords); + while (stream.incrementToken()); + + // Test that all attributes are implemented by their dedicated implementation classes, not by Token/TokenWrapper + assertTrue("TermAttribute is implemented by TermAttributeImpl", + stream.addAttribute(TermAttribute.class) instanceof TermAttributeImpl); + assertTrue("OffsetAttribute is implemented by OffsetAttributeImpl", + stream.addAttribute(OffsetAttribute.class) instanceof OffsetAttributeImpl); + assertTrue("FlagsAttribute is implemented by FlagsAttributeImpl", + stream.addAttribute(FlagsAttribute.class) instanceof FlagsAttributeImpl); + assertTrue("PayloadAttribute is implemented by PayloadAttributeImpl", + stream.addAttribute(PayloadAttribute.class) instanceof PayloadAttributeImpl); + assertTrue("PositionIncrementAttribute is implemented by PositionIncrementAttributeImpl", + stream.addAttribute(PositionIncrementAttribute.class) instanceof PositionIncrementAttributeImpl); + assertTrue("TypeAttribute is implemented by TypeAttributeImpl", + stream.addAttribute(TypeAttribute.class) instanceof TypeAttributeImpl); + + // Test that the wrapper API (onlyUseNewAPI==false) uses TokenWrapper + // as the attribute instance. + // TokenWrapper encapsulates a Token instance that can be exchanged + // for another Token instance without changing the AttributeImpl instance + // itself.
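+ // (Illustrative sketch, not part of the original patch: consumer code written
+ // against the new API is identical in both modes, e.g.
+ //   TermAttribute term = (TermAttribute) stream.addAttribute(TermAttribute.class);
+ //   while (stream.incrementToken()) { String s = term.term(); }
+ // only the concrete attribute instance differs: TokenWrapper in wrapper mode,
+ // as asserted below, and TermAttributeImpl in pure new-API mode, as asserted above.)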
+ TokenStream.setOnlyUseNewAPI(false); + stream = new WhitespaceTokenizer(new StringReader(doc)); + assertTrue("TermAttribute is implemented by TokenWrapper", + stream.addAttribute(TermAttribute.class) instanceof TokenWrapper); + assertTrue("OffsetAttribute is implemented by TokenWrapper", + stream.addAttribute(OffsetAttribute.class) instanceof TokenWrapper); + assertTrue("FlagsAttribute is implemented by TokenWrapper", + stream.addAttribute(FlagsAttribute.class) instanceof TokenWrapper); + assertTrue("PayloadAttribute is implemented by TokenWrapper", + stream.addAttribute(PayloadAttribute.class) instanceof TokenWrapper); + assertTrue("PositionIncrementAttribute is implemented by TokenWrapper", + stream.addAttribute(PositionIncrementAttribute.class) instanceof TokenWrapper); + assertTrue("TypeAttribute is implemented by TokenWrapper", + stream.addAttribute(TypeAttribute.class) instanceof TokenWrapper); + + } finally { + TokenStream.setOnlyUseNewAPI(false); + } + } + + public void testOverridesAny() throws Exception { + try { + TokenStream stream = new WhitespaceTokenizer(new StringReader(doc)); + stream = new TokenFilter(stream) { + // we implement nothing, only un-abstract it + }; + stream = new LowerCaseFilter(stream); + stream = new StopFilter(stream, stopwords); + while (stream.incrementToken()); + fail("One TokenFilter does not override any of the required methods, so it should fail."); + } catch (UnsupportedOperationException uoe) { + assertTrue(uoe.getMessage().endsWith("does not implement any of incrementToken(), next(Token), next().")); + } + } + +} diff --git a/src/test/org/apache/lucene/index/TestDocumentWriter.java b/src/test/org/apache/lucene/index/TestDocumentWriter.java index 72c4548040a..6eb8aa7f267 100644 --- a/src/test/org/apache/lucene/index/TestDocumentWriter.java +++ b/src/test/org/apache/lucene/index/TestDocumentWriter.java @@ -141,11 +141,11 @@ public class TestDocumentWriter extends LuceneTestCase { public TokenStream tokenStream(String fieldName, Reader reader) { return new TokenFilter(new WhitespaceTokenizer(reader)) { boolean first=true; - AttributeSource state; + AttributeSource.State state; public boolean incrementToken() throws IOException { if (state != null) { - state.restoreState(this); + restoreState(state); payloadAtt.setPayload(null); posIncrAtt.setPositionIncrement(0); termAtt.setTermBuffer(new char[]{'b'}, 0, 1); diff --git a/src/test/org/apache/lucene/index/TestIndexWriter.java b/src/test/org/apache/lucene/index/TestIndexWriter.java index 2e7e8deedbc..4db25a4be46 100644 --- a/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -28,6 +28,7 @@ import java.util.Iterator; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.SinkTokenizer; +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceAnalyzer; @@ -3521,47 +3522,21 @@ public class TestIndexWriter extends LuceneTestCase } } - private static class MyAnalyzer extends Analyzer { - - public TokenStream tokenStream(String fieldName, Reader reader) { - TokenStream s = new WhitespaceTokenizer(reader); - s.addAttribute(PositionIncrementAttribute.class); - return s; - } - - } - // LUCENE-1255 public void testNegativePositions() throws Throwable { SinkTokenizer tokens = new SinkTokenizer(); - tokens.addAttribute(TermAttribute.class); - tokens.addAttribute(PositionIncrementAttribute.class); - - 
AttributeSource state = new AttributeSource(); - TermAttribute termAtt = (TermAttribute) state.addAttribute(TermAttribute.class); - PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class); - termAtt.setTermBuffer("a"); - posIncrAtt.setPositionIncrement(0); - tokens.add(state); - - state = new AttributeSource(); - termAtt = (TermAttribute) state.addAttribute(TermAttribute.class); - posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class); - - termAtt.setTermBuffer("b"); - posIncrAtt.setPositionIncrement(1); - tokens.add(state); - - state = new AttributeSource(); - termAtt = (TermAttribute) state.addAttribute(TermAttribute.class); - posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class); - - termAtt.setTermBuffer("c"); - posIncrAtt.setPositionIncrement(1); - tokens.add(state); + Token t = new Token(); + t.setTermBuffer("a"); + t.setPositionIncrement(0); + tokens.add(t); + t.setTermBuffer("b"); + t.setPositionIncrement(1); + tokens.add(t); + t.setTermBuffer("c"); + tokens.add(t); MockRAMDirectory dir = new MockRAMDirectory(); - IndexWriter w = new IndexWriter(dir, new MyAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED); + IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED); Document doc = new Document(); doc.add(new Field("field", tokens)); w.addDocument(doc); diff --git a/src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java b/src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java index c8e334a7445..80d04f2d696 100644 --- a/src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java +++ b/src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java @@ -17,6 +17,7 @@ package org.apache.lucene.queryParser; * limitations under the License. */ +import java.io.IOException; import java.io.Reader; import java.util.HashMap; import java.util.Map; @@ -317,8 +318,8 @@ public class TestMultiFieldQueryParser extends LuceneTestCase { } private static class EmptyTokenStream extends TokenStream { - public Token next(final Token reusableToken) { - return null; + public boolean incrementToken() throws IOException { + return false; } } } diff --git a/src/test/org/apache/lucene/util/LuceneTestCase.java b/src/test/org/apache/lucene/util/LuceneTestCase.java index d734c11a47f..c33238b4f10 100644 --- a/src/test/org/apache/lucene/util/LuceneTestCase.java +++ b/src/test/org/apache/lucene/util/LuceneTestCase.java @@ -44,7 +44,6 @@ public abstract class LuceneTestCase extends TestCase { protected void setUp() throws Exception { ConcurrentMergeScheduler.setTestMode(); - TokenStream.setUseNewAPIDefault(true); } protected void tearDown() throws Exception { diff --git a/src/test/org/apache/lucene/util/TestAttributeSource.java b/src/test/org/apache/lucene/util/TestAttributeSource.java new file mode 100644 index 00000000000..f2be4168bff --- /dev/null +++ b/src/test/org/apache/lucene/util/TestAttributeSource.java @@ -0,0 +1,122 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.tokenattributes.*; + +import java.util.Iterator; + +public class TestAttributeSource extends LuceneTestCase { + + public void testCaptureState() { + // init a first instance + AttributeSource src = new AttributeSource(); + TermAttribute termAtt = (TermAttribute) src.addAttribute(TermAttribute.class); + TypeAttribute typeAtt = (TypeAttribute) src.addAttribute(TypeAttribute.class); + termAtt.setTermBuffer("TestTerm"); + typeAtt.setType("TestType"); + final int hashCode = src.hashCode(); + + AttributeSource.State state = src.captureState(); + + // modify the attributes + termAtt.setTermBuffer("AnotherTestTerm"); + typeAtt.setType("AnotherTestType"); + assertTrue("Hash code should be different", hashCode != src.hashCode()); + + src.restoreState(state); + assertEquals("TestTerm", termAtt.term()); + assertEquals("TestType", typeAtt.type()); + assertEquals("Hash code should be equal after restore", hashCode, src.hashCode()); + + // restore into an exact configured copy + AttributeSource copy = new AttributeSource(); + copy.addAttribute(TermAttribute.class); + copy.addAttribute(TypeAttribute.class); + copy.restoreState(state); + assertEquals("Both AttributeSources should have same hashCode after restore", src.hashCode(), copy.hashCode()); + assertEquals("Both AttributeSources should be equal after restore", src, copy); + + // init a second instance (with attributes in different order and one additional attribute) + AttributeSource src2 = new AttributeSource(); + typeAtt = (TypeAttribute) src2.addAttribute(TypeAttribute.class); + FlagsAttribute flagsAtt = (FlagsAttribute) src2.addAttribute(FlagsAttribute.class); + termAtt = (TermAttribute) src2.addAttribute(TermAttribute.class); + flagsAtt.setFlags(12345); + + src2.restoreState(state); + assertEquals("TestTerm", termAtt.term()); + assertEquals("TestType", typeAtt.type()); + assertEquals("FlagsAttribute should not be touched", 12345, flagsAtt.getFlags()); + + // init a third instance missing one Attribute + AttributeSource src3 = new AttributeSource(); + termAtt = (TermAttribute) src3.addAttribute(TermAttribute.class); + try { + src3.restoreState(state); + fail("The third instance is missing the TypeAttribute, so restoreState() should throw IllegalArgumentException"); + } catch (IllegalArgumentException iae) { + // pass + } + } + + public void testCloneAttributes() { + final AttributeSource src = new AttributeSource(); + final TermAttribute termAtt = (TermAttribute) src.addAttribute(TermAttribute.class); + final TypeAttribute typeAtt = (TypeAttribute) src.addAttribute(TypeAttribute.class); + termAtt.setTermBuffer("TestTerm"); + typeAtt.setType("TestType"); + + final AttributeSource clone = src.cloneAttributes(); + final Iterator it = clone.getAttributeClassesIterator(); + assertEquals("TermAttribute must be the first attribute", TermAttribute.class, it.next()); + assertEquals("TypeAttribute must be the second attribute", TypeAttribute.class, it.next()); + assertFalse("No more attributes", it.hasNext()); + + final TermAttribute termAtt2 = (TermAttribute) 
clone.getAttribute(TermAttribute.class); + final TypeAttribute typeAtt2 = (TypeAttribute) clone.getAttribute(TypeAttribute.class); + assertNotSame("TermAttribute of original and clone must be different instances", termAtt2, termAtt); + assertNotSame("TypeAttribute of original and clone must be different instances", typeAtt2, typeAtt); + assertEquals("TermAttribute of original and clone must be equal", termAtt2, termAtt); + assertEquals("TypeAttribute of original and clone must be equal", typeAtt2, typeAtt); + } + + public void testToStringAndMultiAttributeImplementations() { + AttributeSource src = new AttributeSource(); + TermAttribute termAtt = (TermAttribute) src.addAttribute(TermAttribute.class); + TypeAttribute typeAtt = (TypeAttribute) src.addAttribute(TypeAttribute.class); + termAtt.setTermBuffer("TestTerm"); + typeAtt.setType("TestType"); + assertEquals("Attributes should appear in original order", "("+termAtt.toString()+","+typeAtt.toString()+")", src.toString()); + + src = new AttributeSource(); + src.addAttributeImpl(new Token()); + // this should not add a new attribute as Token implements TermAttribute, too + termAtt = (TermAttribute) src.addAttribute(TermAttribute.class); + assertTrue("TermAttribute should be implemented by Token", termAtt instanceof Token); + // get the Token attribute and check that it is the only one + final Iterator it = src.getAttributeImplsIterator(); + Token tok = (Token) it.next(); + assertFalse("There should be only one attribute implementation instance", it.hasNext()); + + termAtt.setTermBuffer("TestTerm"); + assertEquals("Token should only be printed once", "("+tok.toString()+")", src.toString()); + +}