mirror of https://github.com/apache/lucene.git
LUCENE-1693: Various improvements to the new TokenStream API.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@797665 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent f758b4d259
commit 31a5f0edcc
CHANGES.txt | 22
@@ -64,6 +64,22 @@ Changes in backwards compatibility policy
     process.  It is not recommended to implement it, but rather extend
     Searcher.  (Shai Erera via Mike McCandless)
 
+ 4. LUCENE-1422, LUCENE-1693: The new TokenStream API (see below) using
+    Attributes has some backwards-compatibility breaks in rare cases.
+    We did our best to make the transition as easy as possible. You should
+    not have problems if your tokenizers still implement next(Token) or
+    next(); the calls are automatically wrapped. The indexer and query parser
+    use the new API via incrementToken() calls. All core TokenStreams
+    are implemented using the new API. You can mix old- and new-API
+    style TokenFilters/TokenStreams. Problems only occur when you have done
+    the following:
+    You have overridden next(Token) or next() in one of the non-abstract core
+    TokenStreams/-Filters. These classes should normally be final, but some
+    of them are not. In this case, next(Token)/next() would never be called.
+    To fail early with a hard compile/runtime error, the next(Token)/next()
+    methods in these TokenStreams/-Filters were made final.
+    (Michael Busch, Uwe Schindler)
+
 Changes in runtime behavior
 
  1. LUCENE-1424: QueryParser now by default uses constant score query
@@ -156,14 +172,16 @@ API Changes
     and deprecate FSDirectory.getDirectory().  FSDirectory instances
     are not required to be singletons per path. (yonik)
 
- 4. LUCENE-1422: New TokenStream API that uses a new class called
+ 4. LUCENE-1422, LUCENE-1693: New TokenStream API that uses a new class called
     AttributeSource instead of the now deprecated Token class. All attributes
     that the Token class had have been moved into separate classes:
     TermAttribute, OffsetAttribute, PositionIncrementAttribute,
     PayloadAttribute, TypeAttribute and FlagsAttribute. The new API
     is much more flexible; it allows combining the Attributes arbitrarily
     and also defining custom Attributes. The new API has the same performance
-    as the old next(Token) approach. (Michael Busch)
+    as the old next(Token) approach.
+    For conformance with this new API, Tee-/SinkTokenizer was deprecated
+    and replaced by a new TeeSinkTokenFilter. (Michael Busch, Uwe Schindler)
 
  5. LUCENE-1467: Add nextDoc() and next(int) methods to OpenBitSetIterator.
     These methods can be used to avoid additional calls to doc().
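For orientation, here is a minimal sketch (illustration only, not part of this commit) of consuming a stream through the new API described in the entry above; the tokenizer and sample text are arbitrary choices:

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class NewApiConsumerSketch {
      public static void main(String[] args) throws IOException {
        TokenStream stream = new WhitespaceTokenizer(new StringReader("hello world"));

        // New API: register the attribute once; incrementToken() refills it in place.
        TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
        while (stream.incrementToken()) {
          System.out.println(termAtt.term());
        }
        stream.close();

        // Old API equivalent (deprecated; wrapped automatically by this commit):
        //   for (Token t = stream.next(new Token()); t != null; t = stream.next(t))
        //     System.out.println(t.term());
      }
    }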
ASCIIFoldingFilter.java
@@ -1,5 +1,8 @@
 package org.apache.lucene.analysis;
 
+import java.io.IOException;
+
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.util.ArrayUtil;
 
 /**
@@ -53,24 +56,21 @@ import org.apache.lucene.util.ArrayUtil;
  * accents from Latin1 characters.  For example, 'à' will be replaced by
  * 'a'.
  */
-public class ASCIIFoldingFilter extends TokenFilter {
+public final class ASCIIFoldingFilter extends TokenFilter {
   public ASCIIFoldingFilter(TokenStream input)
   {
     super(input);
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
   }
 
   private char[] output = new char[512];
   private int outputPos;
+  private TermAttribute termAtt;
 
-  public Token next(Token result)
-    throws java.io.IOException
-  {
-    result = input.next(result);
-
-    if (result != null)
-    {
-      final char[] buffer = result.termBuffer();
-      final int length = result.termLength();
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      final char[] buffer = termAtt.termBuffer();
+      final int length = termAtt.termLength();
 
       // If no characters actually require rewriting then we
       // just return token as-is:
@@ -79,13 +79,13 @@ public class ASCIIFoldingFilter extends TokenFilter {
         if (c >= '\u0080')
         {
           foldToASCII(buffer, length);
-          result.setTermBuffer(output, 0, outputPos);
+          termAtt.setTermBuffer(output, 0, outputPos);
           break;
         }
       }
-      return result;
+      return true;
     } else {
-      return null;
+      return false;
     }
   }
 
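The two hunks above are the general conversion recipe: fetch attribute references in the constructor, replace next(Token) with incrementToken(), and return true/false instead of a Token or null. As an illustration only, the same recipe applied to a made-up filter (UpperCaseFilter is hypothetical, not a Lucene class):

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public final class UpperCaseFilter extends TokenFilter {
      private final TermAttribute termAtt;

      public UpperCaseFilter(TokenStream input) {
        super(input);
        // register once, up front, exactly like ASCIIFoldingFilter above
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
      }

      public boolean incrementToken() throws IOException {
        if (!input.incrementToken())
          return false;                      // was: return null
        final char[] buffer = termAtt.termBuffer();
        final int length = termAtt.termLength();
        for (int i = 0; i < length; i++)
          buffer[i] = Character.toUpperCase(buffer[i]);
        return true;                         // was: return result
      }
    }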
CachingTokenFilter.java
@@ -25,24 +25,35 @@ import java.util.List;
 import org.apache.lucene.util.AttributeSource;
 
 /**
- * This class can be used if the Tokens of a TokenStream
+ * This class can be used if the token attributes of a TokenStream
  * are intended to be consumed more than once. It caches
- * all Tokens locally in a List.
+ * all token attribute states locally in a List.
  *
- * CachingTokenFilter implements the optional method
+ * <P>CachingTokenFilter implements the optional method
  * {@link TokenStream#reset()}, which repositions the
  * stream to the first Token.
  *
 */
 public class CachingTokenFilter extends TokenFilter {
-  private List cache;
-  private Iterator iterator;
+  private List cache = null;
+  private Iterator iterator = null;
 
   public CachingTokenFilter(TokenStream input) {
     super(input);
   }
 
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws IOException {
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws IOException {
+    return super.next();
+  }
+
-  public boolean incrementToken() throws IOException {
+  public final boolean incrementToken() throws IOException {
     if (cache == null) {
       // fill cache lazily
       cache = new LinkedList();
@@ -51,34 +62,14 @@ public class CachingTokenFilter extends TokenFilter {
     }
 
     if (!iterator.hasNext()) {
-      // the cache is exhausted, return null
+      // the cache is exhausted, return false
      return false;
    }
    // Since the TokenFilter can be reset, the tokens need to be preserved as immutable.
-    AttributeSource state = (AttributeSource) iterator.next();
-    state.restoreState(this);
+    restoreState((AttributeSource.State) iterator.next());
    return true;
  }
-
-  /** @deprecated */
-  public Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    if (cache == null) {
-      // fill cache lazily
-      cache = new LinkedList();
-      fillCache(reusableToken);
-      iterator = cache.iterator();
-    }
-
-    if (!iterator.hasNext()) {
-      // the cache is exhausted, return null
-      return null;
-    }
-    // Since the TokenFilter can be reset, the tokens need to be preserved as immutable.
-    Token nextToken = (Token) iterator.next();
-    return (Token) nextToken.clone();
-  }
 
   public void reset() throws IOException {
     if(cache != null) {
       iterator = cache.iterator();
@@ -90,12 +81,5 @@ public class CachingTokenFilter extends TokenFilter {
       cache.add(captureState());
     }
   }
-
-  /** @deprecated */
-  private void fillCache(final Token reusableToken) throws IOException {
-    for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
-      cache.add(nextToken.clone());
-    }
-  }
 
 }
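A short sketch (not from the commit) of why the cached states matter: the same filter can be consumed twice, with reset() replaying the captured states on the second pass:

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.CachingTokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class ConsumeTwiceSketch {
      public static void main(String[] args) throws IOException {
        TokenStream ts = new CachingTokenFilter(
            new WhitespaceTokenizer(new StringReader("one two three")));
        TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);

        int count = 0;
        while (ts.incrementToken()) count++;  // first pass fills the cache lazily

        ts.reset();                           // reposition to the first cached state
        while (ts.incrementToken())           // second pass replays captured states
          System.out.println(termAtt.term());
        System.out.println(count + " tokens");
      }
    }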
CharTokenizer.java
@@ -94,49 +94,16 @@ public abstract class CharTokenizer extends Tokenizer {
     return true;
   }
 
-  /** @deprecated */
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
   public final Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    reusableToken.clear();
-    int length = 0;
-    int start = bufferIndex;
-    char[] buffer = reusableToken.termBuffer();
-    while (true) {
-
-      if (bufferIndex >= dataLen) {
-        offset += dataLen;
-        dataLen = input.read(ioBuffer);
-        if (dataLen == -1) {
-          if (length > 0)
-            break;
-          else
-            return null;
-        }
-        bufferIndex = 0;
-      }
-
-      final char c = ioBuffer[bufferIndex++];
-
-      if (isTokenChar(c)) {               // if it's a token char
-
-        if (length == 0)                  // start of token
-          start = offset + bufferIndex - 1;
-        else if (length == buffer.length)
-          buffer = reusableToken.resizeTermBuffer(1+length);
-
-        buffer[length++] = normalize(c);  // buffer it, normalized
-
-        if (length == MAX_WORD_LEN)       // buffer overflow!
-          break;
-
-      } else if (length > 0)              // at non-Letter w/ chars
-        break;                            // return 'em
-    }
-
-    reusableToken.setTermLength(length);
-    reusableToken.setStartOffset(input.correctOffset(start));
-    reusableToken.setEndOffset(input.correctOffset(start+length));
-    return reusableToken;
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws IOException {
+    return super.next();
   }
 
   public void reset(Reader input) throws IOException {
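Because the tokenization loop now lives in incrementToken() and both next() variants only delegate to the backwards compatibility layer, a subclass just supplies isTokenChar() (and optionally normalize()) and works with either API. A hypothetical subclass, for illustration:

    import java.io.Reader;

    import org.apache.lucene.analysis.CharTokenizer;

    /** Hypothetical example: emits runs of letters and digits as tokens. */
    public class AlnumTokenizer extends CharTokenizer {
      public AlnumTokenizer(Reader in) {
        super(in);
      }

      protected boolean isTokenChar(char c) {
        return Character.isLetterOrDigit(c);
      }
    }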
ISOLatin1AccentFilter.java
@@ -57,27 +57,17 @@ public class ISOLatin1AccentFilter extends TokenFilter {
     } else
       return false;
   }
 
-  /** @deprecated */
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
   public final Token next(final Token reusableToken) throws java.io.IOException {
-    assert reusableToken != null;
-    Token nextToken = input.next(reusableToken);
-    if (nextToken != null) {
-      final char[] buffer = nextToken.termBuffer();
-      final int length = nextToken.termLength();
-      // If no characters actually require rewriting then we
-      // just return token as-is:
-      for(int i=0;i<length;i++) {
-        final char c = buffer[i];
-        if (c >= '\u00c0' && c <= '\uFB06') {
-          removeAccents(buffer, length);
-          nextToken.setTermBuffer(output, 0, outputPos);
-          break;
-        }
-      }
-      return nextToken;
-    } else
-      return null;
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws java.io.IOException {
+    return super.next();
   }
 
   /**
KeywordTokenizer.java
@@ -45,7 +45,7 @@ public class KeywordTokenizer extends Tokenizer {
     offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
   }
 
-  public boolean incrementToken() throws IOException {
+  public final boolean incrementToken() throws IOException {
     if (!done) {
       done = true;
       int upto = 0;
@@ -65,28 +65,16 @@ public class KeywordTokenizer extends Tokenizer {
     return false;
   }
 
-  /** @deprecated */
-  public Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    if (!done) {
-      done = true;
-      int upto = 0;
-      reusableToken.clear();
-      char[] buffer = reusableToken.termBuffer();
-      while (true) {
-        final int length = input.read(buffer, upto, buffer.length-upto);
-        if (length == -1) break;
-        upto += length;
-        if (upto == buffer.length)
-          buffer = reusableToken.resizeTermBuffer(1+buffer.length);
-      }
-      reusableToken.setTermLength(upto);
-      reusableToken.setStartOffset(input.correctOffset(0));
-      reusableToken.setEndOffset(input.correctOffset(upto));
-
-      return reusableToken;
-    }
-    return null;
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws IOException {
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws IOException {
+    return super.next();
   }
 
   public void reset(Reader input) throws IOException {
LengthFilter.java
@@ -61,24 +61,4 @@ public final class LengthFilter extends TokenFilter {
     // reached EOS -- return null
     return false;
   }
-
-  /**
-   * Returns the next input Token whose term() is the right len
-   * @deprecated
-   */
-  public final Token next(final Token reusableToken) throws IOException
-  {
-    assert reusableToken != null;
-    // return the first non-stop word found
-    for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken))
-    {
-      int len = nextToken.termLength();
-      if (len >= min && len <= max) {
-        return nextToken;
-      }
-      // note: else we ignore it but should we index each part of it?
-    }
-    // reached EOS -- return null
-    return null;
-  }
 }
LowerCaseFilter.java
@@ -46,20 +46,4 @@ public final class LowerCaseFilter extends TokenFilter {
     } else
       return false;
   }
-
-  /** @deprecated */
-  public final Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    Token nextToken = input.next(reusableToken);
-    if (nextToken != null) {
-
-      final char[] buffer = nextToken.termBuffer();
-      final int length = nextToken.termLength();
-      for(int i=0;i<length;i++)
-        buffer[i] = Character.toLowerCase(buffer[i]);
-
-      return nextToken;
-    } else
-      return null;
-  }
 }
NumericTokenStream.java
@@ -206,40 +206,6 @@ public final class NumericTokenStream extends TokenStream {
     shift += precisionStep;
     return true;
   }
-
-  // @Override
-  /** @deprecated Will be removed in Lucene 3.0 */
-  public Token next(final Token reusableToken) {
-    assert reusableToken != null;
-    if (valSize == 0)
-      throw new IllegalStateException("call set???Value() before usage");
-    if (shift >= valSize)
-      return null;
-
-    reusableToken.clear();
-
-    final char[] buffer;
-    switch (valSize) {
-      case 64:
-        buffer = reusableToken.resizeTermBuffer(NumericUtils.BUF_SIZE_LONG);
-        reusableToken.setTermLength(NumericUtils.longToPrefixCoded(value, shift, buffer));
-        break;
-
-      case 32:
-        buffer = reusableToken.resizeTermBuffer(NumericUtils.BUF_SIZE_INT);
-        reusableToken.setTermLength(NumericUtils.intToPrefixCoded((int) value, shift, buffer));
-        break;
-
-      default:
-        // should not happen
-        throw new IllegalArgumentException("valSize must be 32 or 64");
-    }
-
-    reusableToken.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC : TOKEN_TYPE_LOWER_PREC);
-    reusableToken.setPositionIncrement((shift == 0) ? 1 : 0);
-    shift += precisionStep;
-    return reusableToken;
-  }
 
   // @Override
   public String toString() {
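For orientation, a sketch (not part of the commit) of the new-API path that replaces the removed next(Token); it assumes the set???Value()-before-use contract enforced above:

    import java.io.IOException;

    import org.apache.lucene.analysis.NumericTokenStream;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class NumericStreamSketch {
      public static void main(String[] args) throws IOException {
        NumericTokenStream stream = new NumericTokenStream(8); // precisionStep = 8
        stream.setLongValue(1234L);                            // must be set before use
        TermAttribute term = (TermAttribute) stream.addAttribute(TermAttribute.class);
        PositionIncrementAttribute posIncr = (PositionIncrementAttribute)
            stream.addAttribute(PositionIncrementAttribute.class);
        while (stream.incrementToken()) {
          // one prefix-coded term per precision level; only the full-precision
          // term reports position increment 1, the lower-precision terms report 0
          System.out.println(posIncr.getPositionIncrement() + " " + term.term());
        }
      }
    }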
PorterStemFilter.java
@@ -57,16 +57,4 @@ public final class PorterStemFilter extends TokenFilter {
     termAtt.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
     return true;
   }
-
-  /** @deprecated */
-  public final Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    Token nextToken = input.next(reusableToken);
-    if (nextToken == null)
-      return null;
-
-    if (stemmer.stem(nextToken.termBuffer(), 0, nextToken.termLength()))
-      nextToken.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
-    return nextToken;
-  }
 }
SinkTokenizer.java
@@ -22,19 +22,21 @@ import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 
+import org.apache.lucene.util.AttributeSource;
+
 /**
  * A SinkTokenizer can be used to cache Tokens for use in an Analyzer
  *
+ * <p/>
+ * WARNING: {@link TeeTokenFilter} and {@link SinkTokenizer} only work with the old TokenStream API.
+ * If you switch to the new API, you need to use {@link TeeSinkTokenFilter} instead, which offers
+ * the same functionality.
  * @see TeeTokenFilter
+ * @deprecated Use {@link TeeSinkTokenFilter} instead
  *
  **/
 public class SinkTokenizer extends Tokenizer {
   protected List/*<Token>*/ lst = new ArrayList/*<Token>*/();
   protected Iterator/*<Token>*/ iter;
 
   public SinkTokenizer(List/*<Token>*/ input) {
     this.lst = input;
     if (this.lst == null) this.lst = new ArrayList/*<Token>*/();
@@ -63,30 +65,10 @@ public class SinkTokenizer extends Tokenizer {
     return lst;
   }
 
-  /**
-   * Increments this stream to the next token out of the list of cached tokens
-   * @throws IOException
-   */
-  public boolean incrementToken() throws IOException {
-    if (iter == null) iter = lst.iterator();
-    // Since this TokenStream can be reset we have to maintain the tokens as immutable
-    if (iter.hasNext()) {
-      AttributeSource state = (AttributeSource) iter.next();
-      state.restoreState(this);
-      return true;
-    }
-    return false;
-  }
-
-  public void add(AttributeSource source) throws IOException {
-    lst.add(source);
-  }
-
   /**
    * Returns the next token out of the list of cached tokens
    * @return The next {@link org.apache.lucene.analysis.Token} in the Sink.
    * @throws IOException
+   * @deprecated
    */
   public Token next(final Token reusableToken) throws IOException {
     assert reusableToken != null;
StopFilter.java
@@ -234,27 +234,6 @@ public final class StopFilter extends TokenFilter {
     return false;
   }
 
-  /**
-   * Returns the next input Token whose term() is not a stop word.
-   * @deprecated
-   */
-  public final Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    // return the first non-stop word found
-    int skippedPositions = 0;
-    for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
-      if (!stopWords.contains(nextToken.termBuffer(), 0, nextToken.termLength())) {
-        if (enablePositionIncrements) {
-          nextToken.setPositionIncrement(nextToken.getPositionIncrement() + skippedPositions);
-        }
-        return nextToken;
-      }
-      skippedPositions += nextToken.getPositionIncrement();
-    }
-    // reached EOS -- return null
-    return null;
-  }
-
   /**
    * @see #setEnablePositionIncrementsDefault(boolean).
    * @deprecated Please specify this when you create the StopFilter
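The removed method's skippedPositions bookkeeping lives on in incrementToken(). A sketch (not from the commit) that makes the behavior visible, assuming the enablePositionIncrements constructor flag referenced by the deprecation note above:

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.StopFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class StopFilterSketch {
      public static void main(String[] args) throws IOException {
        TokenStream ts = new StopFilter(true, // enablePositionIncrements
            new WhitespaceTokenizer(new StringReader("drop the beat")),
            StopFilter.makeStopSet(new String[]{"the"}));
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        PositionIncrementAttribute posIncr = (PositionIncrementAttribute)
            ts.addAttribute(PositionIncrementAttribute.class);
        while (ts.incrementToken()) {
          // "beat" reports increment 2: the skipped stop word is accounted for
          System.out.println(term.term() + " +" + posIncr.getPositionIncrement());
        }
      }
    }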
TeeSinkTokenFilter.java (new file)
@@ -0,0 +1,206 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.lang.ref.WeakReference;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Collections;

import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;

/**
 * This TokenFilter provides the ability to set aside attribute states
 * that have already been analyzed. This is useful in situations where multiple fields share
 * many common analysis steps and then go their separate ways.
 * <p/>
 * It is also useful for doing things like entity extraction or proper noun analysis as
 * part of the analysis workflow and saving off those tokens for use in another field.
 *
 * <pre>
TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader1));
TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream();
TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream();

TeeSinkTokenFilter source2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader2));
source2.addSinkTokenStream(sink1);
source2.addSinkTokenStream(sink2);

TokenStream final1 = new LowerCaseFilter(source1);
TokenStream final2 = source2;
TokenStream final3 = new EntityDetect(sink1);
TokenStream final4 = new URLDetect(sink2);

d.add(new Field("f1", final1));
d.add(new Field("f2", final2));
d.add(new Field("f3", final3));
d.add(new Field("f4", final4));
 * </pre>
 * In this example, <code>sink1</code> and <code>sink2</code> will both get tokens from both
 * <code>reader1</code> and <code>reader2</code> after the whitespace tokenizer,
 * and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
 * It is important that tees are consumed before sinks (in the above example, the tee field names must
 * sort before, i.e. be less than, the sink field names). If you are not sure which stream is consumed
 * first, you can simply add another sink and then pass all tokens to the sinks at once using
 * {@link #consumeAllTokens}. This TokenFilter is exhausted after that. In that case, change
 * the example above to:
 * <pre>
...
TokenStream final1 = new LowerCaseFilter(source1.newSinkTokenStream());
TokenStream final2 = source2.newSinkTokenStream();
sink1.consumeAllTokens();
sink2.consumeAllTokens();
...
 * </pre>
 * In this case, the fields can be added in any order, because the sources are not used anymore and all sinks are ready.
 * <p>Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene.
 */
public final class TeeSinkTokenFilter extends TokenFilter {
  private final List sinks = new LinkedList();

  /**
   * Instantiates a new TeeSinkTokenFilter.
   */
  public TeeSinkTokenFilter(TokenStream input) {
    super(input);
  }

  /**
   * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream.
   */
  public SinkTokenStream newSinkTokenStream() {
    return newSinkTokenStream(ACCEPT_ALL_FILTER);
  }

  /**
   * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream
   * that pass the supplied filter.
   * @see SinkFilter
   */
  public SinkTokenStream newSinkTokenStream(SinkFilter filter) {
    SinkTokenStream sink = new SinkTokenStream(this.cloneAttributes(), filter);
    this.sinks.add(new WeakReference(sink));
    return sink;
  }

  /**
   * Adds a {@link SinkTokenStream} created by another <code>TeeSinkTokenFilter</code>
   * to this one. The supplied stream will also receive all consumed tokens.
   * This method can be used to pass tokens from two different tees to one sink.
   */
  public void addSinkTokenStream(final SinkTokenStream sink) {
    // check that sink has correct factory
    if (!this.getAttributeFactory().equals(sink.getAttributeFactory())) {
      throw new IllegalArgumentException("The supplied sink is not compatible to this tee");
    }
    // add eventually missing attribute impls to the existing sink
    for (Iterator it = this.cloneAttributes().getAttributeImplsIterator(); it.hasNext(); ) {
      sink.addAttributeImpl((AttributeImpl) it.next());
    }
    this.sinks.add(new WeakReference(sink));
  }

  /**
   * <code>TeeSinkTokenFilter</code> passes all tokens to the added sinks
   * when itself is consumed. To be sure that all tokens from the input
   * stream are passed to the sinks, you can call this method.
   * This instance is exhausted after this, but all sinks are instantly available.
   */
  public void consumeAllTokens() throws IOException {
    while (incrementToken());
  }

  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      // capture state lazily - maybe no SinkFilter accepts this state
      AttributeSource.State state = null;
      for (Iterator it = sinks.iterator(); it.hasNext(); ) {
        final SinkTokenStream sink = (SinkTokenStream) ((WeakReference) it.next()).get();
        if (sink != null) {
          if (sink.accept(this)) {
            if (state == null) {
              state = this.captureState();
            }
            sink.addState(state);
          }
        }
      }
      return true;
    }

    return false;
  }

  /**
   * TODO: Missing Docs
   */
  public static interface SinkFilter {
    boolean accept(AttributeSource source);
  }

  public static final class SinkTokenStream extends TokenStream {
    private final List cachedStates = new LinkedList();
    private Iterator it = null;
    private SinkFilter filter;

    private SinkTokenStream(AttributeSource source, SinkFilter filter) {
      super(source);
      this.filter = filter;
    }

    private boolean accept(AttributeSource source) {
      return filter.accept(source);
    }

    private void addState(AttributeSource.State state) {
      if (it != null) {
        throw new IllegalStateException("The tee must be consumed before sinks are consumed.");
      }
      cachedStates.add(state);
    }

    public final boolean incrementToken() throws IOException {
      // lazy init the iterator
      if (it == null) {
        it = cachedStates.iterator();
      }

      if (!it.hasNext()) {
        return false;
      }

      AttributeSource.State state = (State) it.next();
      restoreState(state);
      return true;
    }

    public final void reset() {
      it = cachedStates.iterator();
    }
  }

  private static final SinkFilter ACCEPT_ALL_FILTER = new SinkFilter() {
    public boolean accept(AttributeSource source) {
      return true;
    }
  };

}
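A custom SinkFilter is the hook for selective sinks. A hypothetical implementation (illustration only; LongTermSinkFilter is not part of the commit) that keeps only terms longer than three characters:

    import org.apache.lucene.analysis.TeeSinkTokenFilter;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.AttributeSource;

    final class LongTermSinkFilter implements TeeSinkTokenFilter.SinkFilter {
      public boolean accept(AttributeSource source) {
        // the source passed in is the tee itself, holding the current token's state
        TermAttribute termAtt = (TermAttribute) source.addAttribute(TermAttribute.class);
        return termAtt.termLength() > 3;
      }
    }

It would be plugged in via tee.newSinkTokenStream(new LongTermSinkFilter()), so only accepted states are captured and replayed by that sink.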
TeeTokenFilter.java
@@ -18,7 +18,6 @@
 package org.apache.lucene.analysis;
 
 import java.io.IOException;
-import java.util.Iterator;
 
 
 /**
@@ -30,8 +29,8 @@ import java.util.Iterator;
  * part of the analysis workflow and saving off those tokens for use in another field.
  *
  * <pre>
-SinkTokenizer sink1 = new SinkTokenizer(null);
-SinkTokenizer sink2 = new SinkTokenizer(null);
+SinkTokenizer sink1 = new SinkTokenizer();
+SinkTokenizer sink2 = new SinkTokenizer();
 
 TokenStream source1 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(reader1), sink1), sink2);
 TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(reader2), sink1), sink2);
@@ -46,14 +45,22 @@ d.add(new Field("f2", final2));
 d.add(new Field("f3", final3));
 d.add(new Field("f4", final4));
  * </pre>
- * In this example, sink1 and sink2 will both get tokens from both reader1 and reader2 after whitespace tokenizer
- and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
- Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene
+ * In this example, <code>sink1</code> and <code>sink2</code> will both get tokens from both
+ * <code>reader1</code> and <code>reader2</code> after the whitespace tokenizer,
+ * and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
+ * It is important that tees are consumed before sinks (in the above example, the tee field names must
+ * sort before, i.e. be less than, the sink field names).
+ * Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene
  <p/>
  *
- * See http://issues.apache.org/jira/browse/LUCENE-1058
+ * See <a href="http://issues.apache.org/jira/browse/LUCENE-1058">LUCENE-1058</a>.
+ * <p/>
+ * WARNING: {@link TeeTokenFilter} and {@link SinkTokenizer} only work with the old TokenStream API.
+ * If you switch to the new API, you need to use {@link TeeSinkTokenFilter} instead, which offers
+ * the same functionality.
 
  * @see SinkTokenizer
  *
+ * @deprecated Use {@link TeeSinkTokenFilter} instead
 **/
 public class TeeTokenFilter extends TokenFilter {
   SinkTokenizer sink;
@@ -61,21 +68,8 @@ public class TeeTokenFilter extends TokenFilter {
   public TeeTokenFilter(TokenStream input, SinkTokenizer sink) {
     super(input);
     this.sink = sink;
-    Iterator it = getAttributesIterator();
-    while (it.hasNext()) {
-      sink.addAttribute(it.next().getClass());
-    }
   }
 
-  public boolean incrementToken() throws IOException {
-    if (input.incrementToken()) {
-      sink.add(captureState());
-      return true;
-    }
-    return false;
-  }
-
   /** @deprecated */
   public Token next(final Token reusableToken) throws IOException {
     assert reusableToken != null;
     Token nextToken = input.next(reusableToken);
Token.java
@@ -17,14 +17,19 @@ package org.apache.lucene.analysis;
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.index.Payload;
 import org.apache.lucene.index.TermPositions; // for javadoc
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeImpl;
 
 /**
+  This class is now deprecated and a new TokenStream API was introduced with Lucene 2.9.
+  See Javadocs in {@link TokenStream} for further details.
+  <p>
   A Token is an occurrence of a term from the text of a field.  It consists of
   a term's text, the start and end offset of the term in the text of the field,
   and a type string.
@@ -44,11 +49,13 @@ import org.apache.lucene.util.ArrayUtil;
   {@link TermPositions#getPayload(byte[], int)} to retrieve the payloads from the index.
 
-  <br><br>
+  <p><font color="#FF0000">
   WARNING: The status of the <b>Payloads</b> feature is experimental.
   The APIs introduced here might change in the future and will not be
   supported anymore in such a case.</font>
 
+  <p><b>NOTE:</b> As of 2.9, Token implements all {@link Attribute} interfaces
+  that are part of core Lucene and can be found in the {@code tokenattributes} subpackage.
+  Even though it is not necessary to use Token anymore, with the new TokenStream API it can
+  be used as a convenience class that implements all {@link Attribute}s, which is especially useful
+  to easily switch from the old to the new TokenStream API.
+
   <br><br>
 
   <p><b>NOTE:</b> As of 2.3, Token stores the term text
@@ -118,10 +125,10 @@ import org.apache.lucene.util.ArrayUtil;
   </p>
 
   @see org.apache.lucene.index.Payload
+  @deprecated A new TokenStream API was introduced with Lucene 2.9.
+  See javadocs in {@link TokenStream} for further details.
 */
-public class Token implements Cloneable {
+public class Token extends AttributeImpl
+                   implements Cloneable, TermAttribute, TypeAttribute, PositionIncrementAttribute,
+                              FlagsAttribute, OffsetAttribute, PayloadAttribute {
 
   public static final String DEFAULT_TYPE = "word";
 
@@ -134,7 +141,7 @@ public class Token implements Cloneable {
   /**
    * Characters for the term text.
    * @deprecated This will be made private. Instead, use:
-   * {@link termBuffer()},
+   * {@link #termBuffer()},
    * {@link #setTermBuffer(char[], int, int)},
    * {@link #setTermBuffer(String)}, or
    * {@link #setTermBuffer(String, int, int)}
@@ -144,28 +151,28 @@ public class Token implements Cloneable {
   /**
    * Length of term text in the buffer.
    * @deprecated This will be made private. Instead, use:
-   * {@link termLength()}, or @{link setTermLength(int)}.
+   * {@link #termLength()}, or @{link setTermLength(int)}.
    */
   int termLength;
 
   /**
    * Start in source text.
    * @deprecated This will be made private. Instead, use:
-   * {@link startOffset()}, or @{link setStartOffset(int)}.
+   * {@link #startOffset()}, or @{link setStartOffset(int)}.
    */
   int startOffset;
 
   /**
    * End in source text.
    * @deprecated This will be made private. Instead, use:
-   * {@link endOffset()}, or @{link setEndOffset(int)}.
+   * {@link #endOffset()}, or @{link setEndOffset(int)}.
   */
   int endOffset;
 
   /**
    * The lexical type of the token.
    * @deprecated This will be made private. Instead, use:
-   * {@link type()}, or @{link setType(String)}.
+   * {@link #type()}, or @{link setType(String)}.
    */
   String type = DEFAULT_TYPE;
 
@@ -173,13 +180,13 @@ public class Token implements Cloneable {
 
   /**
    * @deprecated This will be made private. Instead, use:
-   * {@link getPayload()}, or @{link setPayload(Payload)}.
+   * {@link #getPayload()}, or @{link setPayload(Payload)}.
    */
   Payload payload;
 
   /**
    * @deprecated This will be made private. Instead, use:
-   * {@link getPositionIncrement()}, or @{link setPositionIncrement(String)}.
+   * {@link #getPositionIncrement()}, or @{link setPositionIncrement(String)}.
    */
   int positionIncrement = 1;
 
@@ -561,6 +568,13 @@ public class Token implements Cloneable {
   public void setEndOffset(int offset) {
     this.endOffset = offset;
   }
 
+  /** Set the starting and ending offset.
+    @see #startOffset() and #endOffset()*/
+  public void setOffset(int startOffset, int endOffset) {
+    this.startOffset = startOffset;
+    this.endOffset = endOffset;
+  }
+
   /** Returns this Token's lexical type.  Defaults to "word". */
   public final String type() {
@@ -640,19 +654,15 @@ public class Token implements Cloneable {
   }
 
   public Object clone() {
-    try {
-      Token t = (Token)super.clone();
-      // Do a deep clone
-      if (termBuffer != null) {
-        t.termBuffer = (char[]) termBuffer.clone();
-      }
-      if (payload != null) {
-        t.setPayload((Payload) payload.clone());
-      }
-      return t;
-    } catch (CloneNotSupportedException e) {
-      throw new RuntimeException(e); // shouldn't happen
+    Token t = (Token)super.clone();
+    // Do a deep clone
+    if (termBuffer != null) {
+      t.termBuffer = (char[]) termBuffer.clone();
+    }
+    if (payload != null) {
+      t.setPayload((Payload) payload.clone());
     }
+    return t;
   }
 
   /** Makes a clone, but replaces the term buffer &
@@ -862,4 +872,9 @@ public class Token implements Cloneable {
     type = prototype.type;
     payload = prototype.payload;
   }
+
+  public void copyTo(AttributeImpl target) {
+    Token to = (Token) target;
+    to.reinit(this);
+  }
 }
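Because Token now extends AttributeImpl and implements every core attribute interface, a single instance can serve as all attribute views at once, which is what eases switching between the APIs. A small sketch (not part of the commit):

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class TokenAsAttributesSketch {
      public static void main(String[] args) {
        Token t = new Token();
        t.setTermBuffer("lucene");
        t.setOffset(0, 6);        // the setOffset(int, int) convenience added above

        // one object, several attribute views:
        TermAttribute term = t;   // Token implements TermAttribute
        OffsetAttribute off = t;  // ...and OffsetAttribute
        System.out.println(term.term() + " [" + off.startOffset() + "," + off.endOffset() + ")");
      }
    }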
TokenFilter.java
@@ -42,7 +42,7 @@ public abstract class TokenFilter extends TokenStream {
     super(input);
     this.input = input;
   }
-
+  
   /** Close the input TokenStream. */
   public void close() throws IOException {
     input.close();
@@ -50,20 +50,6 @@ public abstract class TokenFilter extends TokenStream {
 
   /** Reset the filter as well as the input TokenStream. */
   public void reset() throws IOException {
     super.reset();
     input.reset();
   }
-
-  public boolean useNewAPI() {
-    return input.useNewAPI();
-  }
-
-  /**
-   * Sets whether or not to use the new TokenStream API. Settings this
-   * will apply to this Filter and all TokenStream/Filters upstream.
-   */
-  public void setUseNewAPI(boolean use) {
-    input.setUseNewAPI(use);
-  }
 
 }
TokenStream.java
@@ -18,10 +18,15 @@ package org.apache.lucene.analysis;
  */
 
 import java.io.IOException;
-import java.util.Iterator;
 
 import org.apache.lucene.index.Payload;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.util.AttributeSource;
 
 /** A TokenStream enumerates the sequence of tokens, either from
@@ -36,13 +41,13 @@ import org.apache.lucene.util.AttributeSource;
   </ul>
   A new TokenStream API is introduced with Lucene 2.9. Since
   2.9 Token is deprecated and the preferred way to store
-  the information of a token is to use {@link Attribute}s.
+  the information of a token is to use {@link AttributeImpl}s.
   <p>
   For that reason TokenStream extends {@link AttributeSource}
-  now. Note that only one instance per {@link Attribute} is
+  now. Note that only one instance per {@link AttributeImpl} is
   created and reused for every token. This approach reduces
   object creations and allows local caching of references to
-  the {@link Attribute}s. See {@link #incrementToken()} for further details.
+  the {@link AttributeImpl}s. See {@link #incrementToken()} for further details.
   <p>
   <b>The workflow of the new TokenStream API is as follows:</b>
   <ol>
@@ -60,19 +65,8 @@ import org.apache.lucene.util.AttributeSource;
   <p>
   Sometimes it is desirable to capture a current state of a
   TokenStream, e. g. for buffering purposes (see {@link CachingTokenFilter},
-  {@link TeeTokenFilter}/{@link SinkTokenizer}). For this usecase
-  {@link AttributeSource#captureState()} and {@link AttributeSource#restoreState(AttributeSource)} can be used.
-  <p>
-  <b>NOTE:</b> In order to enable the new API the method
-  {@link #useNewAPI()} has to be called with useNewAPI=true.
-  Otherwise the deprecated method {@link #next(Token)} will
-  be used by Lucene consumers (indexer and queryparser) to
-  consume the tokens. {@link #next(Token)} will be removed
-  in Lucene 3.0.
-  <p>
-  NOTE: To use the old API subclasses must override {@link #next(Token)}.
-  It's also OK to instead override {@link #next()} but that
-  method is slower compared to {@link #next(Token)}.
+  {@link TeeSinkTokenFilter}). For this usecase
+  {@link AttributeSource#captureState} and {@link AttributeSource#restoreState} can be used.
   <p><font color="#FF0000">
   WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
   The APIs introduced in these classes with Lucene 2.9 might change in the future.
@@ -80,110 +74,203 @@ import org.apache.lucene.util.AttributeSource;
 */
 
 public abstract class TokenStream extends AttributeSource {
-  private static boolean useNewAPIDefault = false;
-  private boolean useNewAPI = useNewAPIDefault;
-
-  protected TokenStream() {
-    super();
-  }
+  /** @deprecated Remove this when old API is removed! */
+  private static final AttributeFactory DEFAULT_TOKEN_WRAPPER_ATTRIBUTE_FACTORY
+    = new TokenWrapperAttributeFactory(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
+
+  /** @deprecated Remove this when old API is removed! */
+  private static final Class[] METHOD_NO_PARAMS = new Class[0];
+
+  /** @deprecated Remove this when old API is removed! */
+  private static final Class[] METHOD_TOKEN_PARAM = new Class[]{Token.class};
+
+  /** @deprecated Remove this when old API is removed! */
+  private final TokenWrapper tokenWrapper;
+
+  /** @deprecated Remove this when old API is removed! */
+  private static boolean onlyUseNewAPI = false;
+
+  /** @deprecated Remove this when old API is removed! */
+  private final boolean
+    hasIncrementToken = isMethodOverridden("incrementToken", METHOD_NO_PARAMS),
+    hasReusableNext = onlyUseNewAPI ? false : isMethodOverridden("next", METHOD_TOKEN_PARAM),
+    hasNext = onlyUseNewAPI ? false : isMethodOverridden("next", METHOD_NO_PARAMS);
+
+  /** @deprecated Remove this when old API is removed! */
+  private boolean isMethodOverridden(String name, Class[] params) {
+    try {
+      return this.getClass().getMethod(name, params).getDeclaringClass() != TokenStream.class;
+    } catch (NoSuchMethodException e) {
+      // should not happen
+      throw new RuntimeException(e);
+    }
+  }
+
+  /** @deprecated Remove this when old API is removed! */
+  private static final class TokenWrapperAttributeFactory extends AttributeFactory {
+    private final AttributeFactory delegate;
+
+    private TokenWrapperAttributeFactory(AttributeFactory delegate) {
+      this.delegate = delegate;
+    }
+
+    public AttributeImpl createAttributeInstance(Class attClass) {
+      return attClass.isAssignableFrom(TokenWrapper.class)
+        ? new TokenWrapper()
+        : delegate.createAttributeInstance(attClass);
+    }
+
+    // this is needed for TeeSinkTokenStream's check for compatibility of AttributeSource,
+    // so two TokenStreams using old API have the same AttributeFactory wrapped by this one.
+    public boolean equals(Object other) {
+      if (this == other) return true;
+      if (other instanceof TokenWrapperAttributeFactory) {
+        final TokenWrapperAttributeFactory af = (TokenWrapperAttributeFactory) other;
+        return this.delegate.equals(af.delegate);
+      }
+      return false;
+    }
+  }
+
+  /**
+   * A TokenStream using the default attribute factory.
+   */
+  protected TokenStream() {
+    super(onlyUseNewAPI
+      ? AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY
+      : TokenStream.DEFAULT_TOKEN_WRAPPER_ATTRIBUTE_FACTORY
+    );
+    tokenWrapper = initTokenWrapper(null);
+    check();
+  }
 
+  /**
+   * A TokenStream that uses the same attributes as the supplied one.
+   */
   protected TokenStream(AttributeSource input) {
     super(input);
-  }
-
-  /**
-   * Returns whether or not the new TokenStream APIs are used
-   * by default.
-   * (see {@link #incrementToken()}, {@link AttributeSource}).
-   */
-  public static boolean useNewAPIDefault() {
-    return useNewAPIDefault;
-  }
-
-  /**
-   * Use this API to enable or disable the new TokenStream API
-   * by default. Can be overridden by calling {@link #setUseNewAPI(boolean)}.
-   * (see {@link #incrementToken()}, {@link AttributeSource}).
-   * <p>
-   * If set to true, the indexer will call {@link #incrementToken()}
-   * to consume Tokens from this stream.
-   * <p>
-   * If set to false, the indexer will call {@link #next(Token)}
-   * instead.
-   */
-  public static void setUseNewAPIDefault(boolean use) {
-    useNewAPIDefault = use;
+    tokenWrapper = initTokenWrapper(input);
+    check();
   }
 
   /**
-   * Returns whether or not the new TokenStream APIs are used
-   * for this stream.
-   * (see {@link #incrementToken()}, {@link AttributeSource}).
+   * A TokenStream using the supplied AttributeFactory for creating new {@link Attribute} instances.
    */
-  public boolean useNewAPI() {
-    return useNewAPI;
+  protected TokenStream(AttributeFactory factory) {
+    super(onlyUseNewAPI
+      ? factory
+      : new TokenWrapperAttributeFactory(factory)
+    );
+    tokenWrapper = initTokenWrapper(null);
+    check();
   }
 
-  /**
-   * Use this API to enable or disable the new TokenStream API
-   * for this stream. Overrides {@link #setUseNewAPIDefault(boolean)}.
-   * (see {@link #incrementToken()}, {@link AttributeSource}).
-   * <p>
-   * If set to true, the indexer will call {@link #incrementToken()}
-   * to consume Tokens from this stream.
-   * <p>
-   * If set to false, the indexer will call {@link #next(Token)}
-   * instead.
-   * <p>
-   * <b>NOTE: All streams and filters in one chain must use the
-   * same API. </b>
-   */
-  public void setUseNewAPI(boolean use) {
-    useNewAPI = use;
-  }
-
-  /**
-   * Consumers (e. g. the indexer) use this method to advance the stream
-   * to the next token. Implementing classes must implement this method
-   * and update the appropriate {@link Attribute}s with content of the
-   * next token.
-   * <p>
-   * This method is called for every token of a document, so an efficient
-   * implementation is crucial for good performance. To avoid calls to
-   * {@link #addAttribute(Class)} and {@link #getAttribute(Class)} and
-   * downcasts, references to all {@link Attribute}s that this stream uses
-   * should be retrieved during instantiation.
-   * <p>
-   * To make sure that filters and consumers know which attributes are available
-   * the attributes must be added during instantiation. Filters and
-   * consumers are not required to check for availability of attributes in {@link #incrementToken()}.
-   *
-   * @return false for end of stream; true otherwise
-   *
-   * <p>
-   * <b>Note that this method will be defined abstract in Lucene 3.0.</b>
-   */
-  public boolean incrementToken() throws IOException {
-    // subclasses must implement this method; will be made abstract in Lucene 3.0
-    return false;
-  }
-
-  /** Returns the next token in the stream, or null at EOS.
-   * @deprecated The returned Token is a "full private copy" (not
-   * re-used across calls to next()) but will be slower
-   * than calling {@link #next(Token)} instead.. */
-  public Token next() throws IOException {
-    final Token reusableToken = new Token();
-    Token nextToken = next(reusableToken);
-
-    if (nextToken != null) {
-      Payload p = nextToken.getPayload();
-      if (p != null) {
-        nextToken.setPayload((Payload) p.clone());
-      }
-    }
-
-    return nextToken;
+  /** @deprecated Remove this when old API is removed! */
+  private TokenWrapper initTokenWrapper(AttributeSource input) {
+    if (onlyUseNewAPI) {
+      // no wrapper needed
+      return null;
+    } else {
+      // if possible get the wrapper from the filter's input stream
+      if (input instanceof TokenStream && ((TokenStream) input).tokenWrapper != null) {
+        return ((TokenStream) input).tokenWrapper;
+      }
+      // check that all attributes are implemented by the same TokenWrapper instance
+      final AttributeImpl att = addAttribute(TermAttribute.class);
+      if (att instanceof TokenWrapper &&
+          addAttribute(TypeAttribute.class) == att &&
+          addAttribute(PositionIncrementAttribute.class) == att &&
+          addAttribute(FlagsAttribute.class) == att &&
+          addAttribute(OffsetAttribute.class) == att &&
+          addAttribute(PayloadAttribute.class) == att
+      ) {
+        return (TokenWrapper) att;
+      } else {
+        throw new UnsupportedOperationException(
+          "If onlyUseNewAPI is disabled, all basic Attributes must be implemented by the internal class "+
+          "TokenWrapper. Please make sure, that all TokenStreams/TokenFilters in this chain have been "+
+          "instantiated with this flag disabled and do not add any custom instances for the basic Attributes!"
+        );
+      }
+    }
+  }
+
+  /** @deprecated Remove this when old API is removed! */
+  private void check() {
+    if (onlyUseNewAPI && !hasIncrementToken) {
+      throw new UnsupportedOperationException(getClass().getName()+" does not implement incrementToken() which is needed for onlyUseNewAPI.");
+    }
+
+    // a TokenStream subclass must at least implement one of the methods!
+    if (!(hasIncrementToken || hasNext || hasReusableNext)) {
+      throw new UnsupportedOperationException(getClass().getName()+" does not implement any of incrementToken(), next(Token), next().");
+    }
+  }
+
+  /**
+   * For extra performance you can globally enable the new {@link #incrementToken}
+   * API using {@link Attribute}s. There will be a small, but in most cases negligible performance
+   * increase by enabling this, but it only works if <b>all</b> TokenStreams and -Filters
+   * use the new API and implement {@link #incrementToken}. This setting can only be enabled
+   * globally.
+   * <P>This setting only affects TokenStreams instantiated after this call. All TokenStreams
+   * already created use the other setting.
+   * <P>All core analyzers are compatible with this setting; if you have own
+   * TokenStreams/-Filters that are also compatible, enable this.
+   * <P>When enabled, tokenization may throw {@link UnsupportedOperationException}s,
+   * if the whole tokenizer chain is not compatible.
+   * <P>The default is <code>false</code>, so there is the fallback to the old API available.
+   * @deprecated This setting will be <code>true</code> per default in Lucene 3.0,
+   * when {@link #incrementToken} is abstract and must always be implemented.
+   */
+  public static void setOnlyUseNewAPI(boolean onlyUseNewAPI) {
+    TokenStream.onlyUseNewAPI = onlyUseNewAPI;
+  }
+
+  /** Returns if only the new API is used.
+   * @see #setOnlyUseNewAPI
+   * @deprecated This setting will be <code>true</code> per default in Lucene 3.0,
+   * when {@link #incrementToken} is abstract and must always be implemented.
+   */
+  public static boolean getOnlyUseNewAPI() {
+    return onlyUseNewAPI;
+  }
+
+  /**
+   * Consumers (e. g. the indexer) use this method to advance the stream
+   * to the next token. Implementing classes must implement this method
+   * and update the appropriate {@link AttributeImpl}s with content of the
+   * next token.
+   * <p>
+   * This method is called for every token of a document, so an efficient
+   * implementation is crucial for good performance. To avoid calls to
+   * {@link #addAttribute(Class)} and {@link #getAttribute(Class)} and
+   * downcasts, references to all {@link AttributeImpl}s that this stream uses
+   * should be retrieved during instantiation.
+   * <p>
+   * To make sure that filters and consumers know which attributes are available,
+   * the attributes must be added during instantiation. Filters and
+   * consumers are not required to check for availability of attributes in {@link #incrementToken()}.
+   *
+   * @return false for end of stream; true otherwise
+   *
+   * <p>
+   * <b>Note that this method will be defined abstract in Lucene 3.0.</b>
+   */
+  public boolean incrementToken() throws IOException {
+    assert !onlyUseNewAPI && tokenWrapper != null;
+
+    final Token token;
+    if (hasReusableNext) {
+      token = next(tokenWrapper.delegate);
+    } else {
+      assert hasNext;
+      token = next();
+    }
+    if (token == null) return false;
+    tokenWrapper.delegate = token;
+    return true;
   }
 
   /** Returns the next token in the stream, or null at EOS.
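The isMethodOverridden() helper above is plain java.lang.reflect: getMethod() resolves to the most-derived declaration, so the declaring class reveals whether a subclass overrides the method. A standalone demonstration (illustration only, class names hypothetical):

    import java.lang.reflect.Method;

    class Base {
      public boolean incrementToken() { return false; }
    }

    class Sub extends Base {
      public boolean incrementToken() { return true; }
    }

    public class OverrideCheckSketch {
      public static void main(String[] args) throws Exception {
        Method base = Base.class.getMethod("incrementToken", new Class[0]);
        Method sub  = Sub.class.getMethod("incrementToken", new Class[0]);
        System.out.println(base.getDeclaringClass()); // class Base
        System.out.println(sub.getDeclaringClass());  // class Sub -- override detected
      }
    }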
@@ -215,12 +302,46 @@ public abstract class TokenStream extends AttributeSource {
   * good idea to assert that it is not null.)
   * @return next token in the stream or null if end-of-stream was hit
   * @deprecated The new {@link #incrementToken()} and {@link AttributeSource}
-  * APIs should be used instead. See also {@link #useNewAPI()}.
+  * APIs should be used instead.
   */
  public Token next(final Token reusableToken) throws IOException {
-    // We don't actually use inputToken, but still add this assert
    assert reusableToken != null;
-    return next();
+
+    if (onlyUseNewAPI)
+      throw new UnsupportedOperationException("This TokenStream only supports the new Attributes API.");
+
+    if (hasIncrementToken) {
+      tokenWrapper.delegate = reusableToken;
+      return incrementToken() ? tokenWrapper.delegate : null;
+    } else {
+      assert hasNext;
+      final Token token = next();
+      if (token == null) return null;
+      tokenWrapper.delegate = token;
+      return token;
+    }
  }
 
+  /** Returns the next token in the stream, or null at EOS.
+   * @deprecated The returned Token is a "full private copy" (not
+   * re-used across calls to next()) but will be slower
+   * than calling {@link #next(Token)} or using the new
+   * {@link #incrementToken()} method with the new
+   * {@link AttributeSource} API.
+   */
+  public Token next() throws IOException {
+    if (onlyUseNewAPI)
+      throw new UnsupportedOperationException("This TokenStream only supports the new Attributes API.");
+
+    if (hasIncrementToken) {
+      return incrementToken() ? ((Token) tokenWrapper.delegate.clone()) : null;
+    } else {
+      assert hasReusableNext;
+      final Token token = next(tokenWrapper.delegate);
+      if (token == null) return null;
+      tokenWrapper.delegate = token;
+      return (Token) token.clone();
+    }
+  }
+
   /** Resets this stream to the beginning. This is an
@@ -240,24 +361,4 @@ public abstract class TokenStream extends AttributeSource {
   /** Releases resources associated with this stream. */
   public void close() throws IOException {}
-
-  public String toString() {
-    StringBuffer sb = new StringBuffer();
-    sb.append('(');
-
-    if (hasAttributes()) {
-      // TODO Java 1.5
-      //Iterator<Attribute> it = attributes.values().iterator();
-      Iterator it = getAttributesIterator();
-      if (it.hasNext()) {
-        sb.append(it.next().toString());
-      }
-      while (it.hasNext()) {
-        sb.append(',');
-        sb.append(it.next().toString());
-      }
-    }
-    sb.append(')');
-    return sb.toString();
-  }
 
 }
|
|
|
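To illustrate the incrementToken() contract documented above (add attributes during instantiation, keep the references, populate them per token), here is a hedged sketch of a minimal stream; the class and its single-token behavior are invented for the example and are not part of this change:

  public final class SingleTermStream extends TokenStream {
    private TermAttribute termAtt;
    private boolean done = false;
    private final String value;

    public SingleTermStream(String value) {
      this.value = value;
      // add the attribute once, during instantiation, and keep the reference
      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    }

    public boolean incrementToken() throws IOException {
      if (done) return false; // end of stream
      done = true;
      termAtt.setTermBuffer(value); // populate the attribute for this token
      return true;
    }
  }
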
@ -0,0 +1,163 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.Payload;
import org.apache.lucene.util.AttributeImpl;

/**
 * This class wraps a Token and supplies a single attribute instance
 * where the delegate token can be replaced.
 * @deprecated Will be removed when the old TokenStream API is removed.
 */
final class TokenWrapper extends AttributeImpl
  implements Cloneable, TermAttribute, TypeAttribute, PositionIncrementAttribute,
             FlagsAttribute, OffsetAttribute, PayloadAttribute {

  Token delegate;

  TokenWrapper() {
    this(new Token());
  }

  TokenWrapper(Token delegate) {
    this.delegate = delegate;
  }

  // TermAttribute:

  public String term() {
    return delegate.term();
  }

  public void setTermBuffer(char[] buffer, int offset, int length) {
    delegate.setTermBuffer(buffer, offset, length);
  }

  public void setTermBuffer(String buffer) {
    delegate.setTermBuffer(buffer);
  }

  public void setTermBuffer(String buffer, int offset, int length) {
    delegate.setTermBuffer(buffer, offset, length);
  }

  public char[] termBuffer() {
    return delegate.termBuffer();
  }

  public char[] resizeTermBuffer(int newSize) {
    return delegate.resizeTermBuffer(newSize);
  }

  public int termLength() {
    return delegate.termLength();
  }

  public void setTermLength(int length) {
    delegate.setTermLength(length);
  }

  // TypeAttribute:

  public String type() {
    return delegate.type();
  }

  public void setType(String type) {
    delegate.setType(type);
  }

  // PositionIncrementAttribute:

  public void setPositionIncrement(int positionIncrement) {
    delegate.setPositionIncrement(positionIncrement);
  }

  public int getPositionIncrement() {
    return delegate.getPositionIncrement();
  }

  // FlagsAttribute:

  public int getFlags() {
    return delegate.getFlags();
  }

  public void setFlags(int flags) {
    delegate.setFlags(flags);
  }

  // OffsetAttribute:

  public int startOffset() {
    return delegate.startOffset();
  }

  public void setOffset(int startOffset, int endOffset) {
    delegate.setOffset(startOffset, endOffset);
  }

  public int endOffset() {
    return delegate.endOffset();
  }

  // PayloadAttribute:

  public Payload getPayload() {
    return delegate.getPayload();
  }

  public void setPayload(Payload payload) {
    delegate.setPayload(payload);
  }

  // AttributeImpl:

  public void clear() {
    delegate.clear();
  }

  public String toString() {
    return delegate.toString();
  }

  public int hashCode() {
    return delegate.hashCode();
  }

  public boolean equals(Object other) {
    if (other instanceof TokenWrapper) {
      return ((TokenWrapper) other).delegate.equals(this.delegate);
    }
    return false;
  }

  public Object clone() {
    return new TokenWrapper((Token) delegate.clone());
  }

  public void copyTo(AttributeImpl target) {
    ((TokenWrapper) target).delegate = (Token) this.delegate.clone();
  }
}

@ -24,17 +24,10 @@ import java.io.IOException;
  <p>
  This is an abstract class.
  <p>
  <b>NOTE:</b> In order to enable the new API the method
  {@link #useNewAPI()} has to be called with useNewAPI=true.
  Otherwise the deprecated method {@link #next(Token)} will
  be used by Lucene consumers (indexer and queryparser) to
  consume the tokens. {@link #next(Token)} will be removed
  in Lucene 3.0.
  <p>
  NOTE: To use the old API subclasses must override {@link #next(Token)}.
  It's also OK to instead override {@link #next()} but that
  method is slower compared to {@link #next(Token)}.
  <p>
  <p>
  NOTE: subclasses overriding {@link #next(Token)} must
  call {@link Token#clear()}.
 * <p><font color="#FF0000">

@ -442,57 +442,73 @@ are retrieved from the input stream in the <code>incrementToken()</code> method.
in the TermAttribute the length of the term can be determined and too short or too long tokens are skipped.
Note how <code>incrementToken()</code> can efficiently access the instance variable; no attribute lookup or downcasting
is necessary. The same is true for the consumer, which can simply use local references to the Attributes.

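For illustration, a minimal consumer following this pattern might look like the sketch below (the analyzer, field name and reader are placeholders, not part of this change):
<pre>
  TokenStream stream = analyzer.tokenStream("body", reader);
  // retrieve the attribute reference once, up front
  TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
  while (stream.incrementToken()) {
    // no per-token attribute lookup or downcast is needed here
    System.out.println("token: " + termAtt.term());
  }
</pre>
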
<h4>Adding a custom Attribute</h4>
Now we're going to implement our own custom Attribute for part-of-speech tagging and consequently
call it <code>PartOfSpeechAttribute</code>:
<code>PartOfSpeechAttribute</code>. First we need to define the interface of the new Attribute:
<pre>
  public static enum PartOfSpeech {
    Noun, Verb, Adjective, Adverb, Pronoun, Preposition, Conjunction, Article, Unknown
  }
  public interface PartOfSpeechAttribute extends Attribute {
    public static enum PartOfSpeech {
      Noun, Verb, Adjective, Adverb, Pronoun, Preposition, Conjunction, Article, Unknown
    }

  public static final class PartOfSpeechAttribute extends Attribute {

    private PartOfSpeech pos = PartOfSpeech.Unknown;

    public void setPartOfSpeech(PartOfSpeech pos) {
      this.pos = pos;
    }

    public PartOfSpeech getPartOfSpeech() {
      return pos;
    }

    public void clear() {
      pos = PartOfSpeech.Unknown;
    }

    public void copyTo(Attribute target) {
      ((PartOfSpeechAttribute) target).pos = pos;
    }

    public boolean equals(Object other) {
      if (other == this) {
        return true;
      }

      if (other instanceof PartOfSpeechAttribute) {
        return pos == ((PartOfSpeechAttribute) other).pos;
      }

      return false;
    }

    public int hashCode() {
      return pos.ordinal();
    }

    public String toString() {
      return "PartOfSpeech=" + pos;
    }
    public void setPartOfSpeech(PartOfSpeech pos);

    public PartOfSpeech getPartOfSpeech();
  }
</pre>
This is a simple Attribute that has only a single variable that stores the part-of-speech of a token. It extends the
new <code>Attribute</code> class and therefore implements its abstract methods <code>clear(), copyTo(), equals(), hashCode(), toString()</code>.

Now we also need to write the implementing class. The name of that class is important here: By default, Lucene
checks if there is a class with the name of the Attribute with the postfix 'Impl'. In this example, we would
consequently call the implementing class <code>PartOfSpeechAttributeImpl</code>. <br/>
This should be the usual behavior. However, there is also an expert API that allows changing these naming conventions:
{@link org.apache.lucene.util.AttributeSource.AttributeFactory}. The factory accepts an Attribute interface as argument
and returns an actual instance. You can implement your own factory if you need to change the default behavior. <br/><br/>

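As an illustration of that expert API, a custom factory might be sketched as follows (hedged: only <code>createAttributeInstance()</code> is the factory contract described above; the class and field names are made up for this example):
<pre>
  public static final class MyAttributeFactory extends AttributeSource.AttributeFactory {
    private final AttributeSource.AttributeFactory delegate;

    MyAttributeFactory(AttributeSource.AttributeFactory delegate) {
      this.delegate = delegate;
    }

    public AttributeImpl createAttributeInstance(Class attClass) {
      if (attClass == PartOfSpeechAttribute.class)
        return new PartOfSpeechAttributeImpl();
      // fall back to the default 'Impl' naming convention
      return delegate.createAttributeInstance(attClass);
    }
  }
</pre>
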
Now here is the actual class that implements our new Attribute. Notice that the class has to extend
{@link org.apache.lucene.util.AttributeImpl}:

<pre>
  public final class PartOfSpeechAttributeImpl extends AttributeImpl
                     implements PartOfSpeechAttribute {

    private PartOfSpeech pos = PartOfSpeech.Unknown;

    public void setPartOfSpeech(PartOfSpeech pos) {
      this.pos = pos;
    }

    public PartOfSpeech getPartOfSpeech() {
      return pos;
    }

    public void clear() {
      pos = PartOfSpeech.Unknown;
    }

    public void copyTo(AttributeImpl target) {
      ((PartOfSpeechAttributeImpl) target).pos = pos;
    }

    public boolean equals(Object other) {
      if (other == this) {
        return true;
      }

      if (other instanceof PartOfSpeechAttributeImpl) {
        return pos == ((PartOfSpeechAttributeImpl) other).pos;
      }

      return false;
    }

    public int hashCode() {
      return pos.ordinal();
    }
  }
</pre>
This is a simple Attribute implementation that has only a single variable storing the part-of-speech of a token. It extends the
new <code>AttributeImpl</code> class and therefore implements its abstract methods <code>clear(), copyTo(), equals(), hashCode()</code>.
Now we need a TokenFilter that can set this new PartOfSpeechAttribute for each token. In this example we show a very naive filter
that tags every word with a leading upper-case letter as a 'Noun' and all other words as 'Unknown'.
<pre>

@ -523,7 +539,9 @@ that tags every word with a leading upper-case letter as a 'Noun' and all other
  }
</pre>
Just like the LengthFilter, this new filter accesses the attributes it needs in the constructor and
stores references in instance variables. Now we need to add the filter to the chain:
stores references in instance variables. Notice how you only need to pass in the interface of the new
Attribute; instantiating the correct implementing class is automatically taken care of.
Now we need to add the filter to the chain:
<pre>
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream stream = new WhitespaceTokenizer(reader);

@ -582,7 +600,8 @@ of a sentence or not. Then the PartOfSpeechTaggingFilter can make use of this kn
as nouns if not the first word of a sentence (we know, this is still not correct behavior, but hey, it's a good exercise).
As a small hint, this is how the new Attribute class could begin:
<pre>
  public class FirstTokenOfSentenceAttribute extends Attribute {
  public class FirstTokenOfSentenceAttributeImpl extends AttributeImpl
                implements FirstTokenOfSentenceAttribute {

    private boolean firstToken;

@ -73,39 +73,4 @@ public final class StandardFilter extends TokenFilter {

    return true;
  }

  /** Returns the next token in the stream, or null at EOS.
   * <p>Removes <tt>'s</tt> from the end of words.
   * <p>Removes dots from acronyms.
   * @deprecated
   */
  public final Token next(final Token reusableToken) throws java.io.IOException {
    assert reusableToken != null;
    Token nextToken = input.next(reusableToken);

    if (nextToken == null)
      return null;

    char[] buffer = nextToken.termBuffer();
    final int bufferLength = nextToken.termLength();
    final String type = nextToken.type();

    if (type == APOSTROPHE_TYPE && // remove 's
        bufferLength >= 2 &&
        buffer[bufferLength-2] == '\'' &&
        (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
      // Strip last 2 characters off
      nextToken.setTermLength(bufferLength - 2);
    } else if (type == ACRONYM_TYPE) { // remove dots
      int upto = 0;
      for(int i=0;i<bufferLength;i++) {
        char c = buffer[i];
        if (c != '.')
          buffer[upto++] = c;
      }
      nextToken.setTermLength(upto);
    }

    return nextToken;
  }
}

@ -147,7 +147,7 @@ public class StandardTokenizer extends Tokenizer {
   *
   * @see org.apache.lucene.analysis.TokenStream#next()
   */
  public boolean incrementToken() throws IOException {
  public final boolean incrementToken() throws IOException {
    int posIncr = 1;

    while(true) {

@ -183,66 +183,33 @@ public class StandardTokenizer extends Tokenizer {
        posIncr++;
    }
  }


  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
   * not be overridden. Delegates to the backwards compatibility layer. */
  public final Token next(final Token reusableToken) throws IOException {
    return super.next(reusableToken);
  }

  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
   * not be overridden. Delegates to the backwards compatibility layer. */
  public final Token next() throws IOException {
    return super.next();
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.lucene.analysis.TokenStream#next()
   * @see org.apache.lucene.analysis.TokenStream#reset()
   */
  /** @deprecated */
  public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    int posIncr = 1;
  public void reset() throws IOException {
    super.reset();
    scanner.yyreset(input);
  }

    while(true) {
      int tokenType = scanner.getNextToken();

      if (tokenType == StandardTokenizerImpl.YYEOF) {
        return null;
      }

      if (scanner.yylength() <= maxTokenLength) {
        reusableToken.clear();
        reusableToken.setPositionIncrement(posIncr);
        scanner.getText(reusableToken);
        final int start = scanner.yychar();
        reusableToken.setStartOffset(input.correctOffset(start));
        reusableToken.setEndOffset(input.correctOffset(start+reusableToken.termLength()));
        // This 'if' should be removed in the next release. For now, it converts
        // invalid acronyms to HOST. When removed, only the 'else' part should
        // remain.
        if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
          if (replaceInvalidAcronym) {
            reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
            reusableToken.setTermLength(reusableToken.termLength() - 1); // remove extra '.'
          } else {
            reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
          }
        } else {
          reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
        }
        return reusableToken;
      } else
        // When we skip a too-long term, we still increment the
        // position increment
        posIncr++;
    }
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.lucene.analysis.TokenStream#reset()
   */
  public void reset() throws IOException {
    super.reset();
    scanner.yyreset(input);
  }

  public void reset(Reader reader) throws IOException {
    setInput(reader);
    reset();
  }
  public void reset(Reader reader) throws IOException {
    setInput(reader);
    reset();
  }

  /**
   * Prior to https://issues.apache.org/jira/browse/LUCENE-1068, StandardTokenizer mischaracterized as acronyms tokens like www.abc.com

@ -17,8 +17,6 @@ package org.apache.lucene.analysis.tokenattributes;
 * limitations under the License.
 */

import java.io.Serializable;

import org.apache.lucene.util.Attribute;

/**

@ -31,9 +29,7 @@ import org.apache.lucene.util.Attribute;
 * We will make our best efforts to keep the APIs backwards-compatible.</font>

 */
public class FlagsAttribute extends Attribute implements Cloneable, Serializable {
  private int flags = 0;

public interface FlagsAttribute extends Attribute {
  /**
   * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
   * <p/>

@ -44,43 +40,10 @@ public class FlagsAttribute extends Attribute implements Cloneable, Serializable
   *
   * @return The bits
   */
  public int getFlags() {
    return flags;
  }
  public int getFlags();

  /**
   * @see #getFlags()
   */
  public void setFlags(int flags) {
    this.flags = flags;
  }

  public void clear() {
    flags = 0;
  }

  public String toString() {
    return "flags=" + flags;
  }

  public boolean equals(Object other) {
    if (this == other) {
      return true;
    }

    if (other instanceof FlagsAttribute) {
      return ((FlagsAttribute) other).flags == flags;
    }

    return false;
  }

  public int hashCode() {
    return flags;
  }

  public void copyTo(Attribute target) {
    FlagsAttribute t = (FlagsAttribute) target;
    t.setFlags(flags);
  }
  public void setFlags(int flags);
}

@ -0,0 +1,82 @@
package org.apache.lucene.analysis.tokenattributes;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Serializable;

import org.apache.lucene.util.AttributeImpl;

/**
 * This attribute can be used to pass different flags down the tokenizer chain,
 * e.g. from one TokenFilter to another one.
 *
 * <p><font color="#FF0000">
 * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
 * The APIs introduced in these classes with Lucene 2.9 might change in the future.
 * We will make our best efforts to keep the APIs backwards-compatible.</font>
 */
public class FlagsAttributeImpl extends AttributeImpl implements FlagsAttribute, Cloneable, Serializable {
  private int flags = 0;

  /**
   * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
   * <p/>
   *
   * Get the bitset for any bits that have been set. This is completely distinct from {@link TypeAttribute#type()}, although they do share similar purposes.
   * The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s.
   *
   * @return The bits
   */
  public int getFlags() {
    return flags;
  }

  /**
   * @see #getFlags()
   */
  public void setFlags(int flags) {
    this.flags = flags;
  }

  public void clear() {
    flags = 0;
  }

  public boolean equals(Object other) {
    if (this == other) {
      return true;
    }

    if (other instanceof FlagsAttributeImpl) {
      return ((FlagsAttributeImpl) other).flags == flags;
    }

    return false;
  }

  public int hashCode() {
    return flags;
  }

  public void copyTo(AttributeImpl target) {
    FlagsAttribute t = (FlagsAttribute) target;
    t.setFlags(flags);
  }
}

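As an illustration of the intent stated in the javadoc above (passing flags from one filter to another), one filter could mark tokens and a downstream filter could react to the mark. A hedged sketch; the filter name and the flag bit are made up for the example, and the flags semantics are left to the application:

  public final class KeywordMarkerFilter extends TokenFilter {
    public static final int KEYWORD_FLAG = 1; // hypothetical flag bit
    private FlagsAttribute flagsAtt;

    public KeywordMarkerFilter(TokenStream input) {
      super(input);
      flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
    }

    public boolean incrementToken() throws IOException {
      if (!input.incrementToken()) return false;
      // set the bit without clobbering flags set by earlier filters
      flagsAtt.setFlags(flagsAtt.getFlags() | KEYWORD_FLAG);
      return true;
    }
  }
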
@ -17,8 +17,6 @@ package org.apache.lucene.analysis.tokenattributes;
 * limitations under the License.
 */

import java.io.Serializable;

import org.apache.lucene.util.Attribute;

/**

@ -29,67 +27,23 @@ import org.apache.lucene.util.Attribute;
 * The APIs introduced in these classes with Lucene 2.9 might change in the future.
 * We will make our best efforts to keep the APIs backwards-compatible.</font>
 */
public class OffsetAttribute extends Attribute implements Cloneable, Serializable {
  private int startOffset;
  private int endOffset;

public interface OffsetAttribute extends Attribute {
  /** Returns this Token's starting offset, the position of the first character
      corresponding to this token in the source text.

      Note that the difference between endOffset() and startOffset() may not be
      equal to termText.length(), as the term text may have been altered by a
      stemmer or some other filter. */
  public int startOffset() {
    return startOffset;
  }
  public int startOffset();


  /** Set the starting and ending offset.
      @see #startOffset() and #endOffset() */
  public void setOffset(int startOffset, int endOffset) {
    this.startOffset = startOffset;
    this.endOffset = endOffset;
  }
  public void setOffset(int startOffset, int endOffset);


  /** Returns this Token's ending offset, one greater than the position of the
      last character corresponding to this token in the source text. The length
      of the token in the source text is (endOffset - startOffset). */
  public int endOffset() {
    return endOffset;
  }


  public void clear() {
    startOffset = 0;
    endOffset = 0;
  }

  public String toString() {
    return "start=" + startOffset + ",end=" + endOffset;
  }

  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }

    if (other instanceof OffsetAttribute) {
      OffsetAttribute o = (OffsetAttribute) other;
      return o.startOffset == startOffset && o.endOffset == endOffset;
    }

    return false;
  }

  public int hashCode() {
    int code = startOffset;
    code = code * 31 + endOffset;
    return code;
  }

  public void copyTo(Attribute target) {
    OffsetAttribute t = (OffsetAttribute) target;
    t.setOffset(startOffset, endOffset);
  }
  public int endOffset();
}

@ -0,0 +1,91 @@
package org.apache.lucene.analysis.tokenattributes;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Serializable;

import org.apache.lucene.util.AttributeImpl;

/**
 * The start and end character offset of a Token.
 *
 * <p><font color="#FF0000">
 * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
 * The APIs introduced in these classes with Lucene 2.9 might change in the future.
 * We will make our best efforts to keep the APIs backwards-compatible.</font>
 */
public class OffsetAttributeImpl extends AttributeImpl implements OffsetAttribute, Cloneable, Serializable {
  private int startOffset;
  private int endOffset;

  /** Returns this Token's starting offset, the position of the first character
      corresponding to this token in the source text.

      Note that the difference between endOffset() and startOffset() may not be
      equal to termText.length(), as the term text may have been altered by a
      stemmer or some other filter. */
  public int startOffset() {
    return startOffset;
  }


  /** Set the starting and ending offset.
      @see #startOffset() and #endOffset() */
  public void setOffset(int startOffset, int endOffset) {
    this.startOffset = startOffset;
    this.endOffset = endOffset;
  }


  /** Returns this Token's ending offset, one greater than the position of the
      last character corresponding to this token in the source text. The length
      of the token in the source text is (endOffset - startOffset). */
  public int endOffset() {
    return endOffset;
  }


  public void clear() {
    startOffset = 0;
    endOffset = 0;
  }

  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }

    if (other instanceof OffsetAttributeImpl) {
      OffsetAttributeImpl o = (OffsetAttributeImpl) other;
      return o.startOffset == startOffset && o.endOffset == endOffset;
    }

    return false;
  }

  public int hashCode() {
    int code = startOffset;
    code = code * 31 + endOffset;
    return code;
  }

  public void copyTo(AttributeImpl target) {
    OffsetAttribute t = (OffsetAttribute) target;
    t.setOffset(startOffset, endOffset);
  }
}

@ -17,8 +17,6 @@ package org.apache.lucene.analysis.tokenattributes;
 * limitations under the License.
 */

import java.io.Serializable;

import org.apache.lucene.index.Payload;
import org.apache.lucene.util.Attribute;


@ -30,80 +28,14 @@ import org.apache.lucene.util.Attribute;
 * The APIs introduced in these classes with Lucene 2.9 might change in the future.
 * We will make our best efforts to keep the APIs backwards-compatible.</font>
 */
public class PayloadAttribute extends Attribute implements Cloneable, Serializable {
  private Payload payload;

  /**
   * Initialize this attribute with no payload.
   */
  public PayloadAttribute() {}

  /**
   * Initialize this attribute with the given payload.
   */
  public PayloadAttribute(Payload payload) {
    this.payload = payload;
  }

public interface PayloadAttribute extends Attribute {
  /**
   * Returns this Token's payload.
   */
  public Payload getPayload() {
    return this.payload;
  }
  public Payload getPayload();

  /**
   * Sets this Token's payload.
   */
  public void setPayload(Payload payload) {
    this.payload = payload;
  }

  public void clear() {
    payload = null;
  }

  public String toString() {
    if (payload == null) {
      return "payload=null";
    }

    return "payload=" + payload.toString();
  }

  public Object clone() {
    PayloadAttribute clone = (PayloadAttribute) super.clone();
    if (payload != null) {
      clone.payload = (Payload) payload.clone();
    }
    return clone;
  }

  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }

    if (other instanceof PayloadAttribute) {
      PayloadAttribute o = (PayloadAttribute) other;
      if (o.payload == null || payload == null) {
        return o.payload == null && payload == null;
      }

      return o.payload.equals(payload);
    }

    return false;
  }

  public int hashCode() {
    return (payload == null) ? 0 : payload.hashCode();
  }

  public void copyTo(Attribute target) {
    PayloadAttribute t = (PayloadAttribute) target;
    t.setPayload((payload == null) ? null : (Payload) payload.clone());
  }


  public void setPayload(Payload payload);
}

@ -0,0 +1,101 @@
package org.apache.lucene.analysis.tokenattributes;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Serializable;

import org.apache.lucene.index.Payload;
import org.apache.lucene.util.AttributeImpl;

/**
 * The payload of a Token. See also {@link Payload}.
 *
 * <p><font color="#FF0000">
 * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
 * The APIs introduced in these classes with Lucene 2.9 might change in the future.
 * We will make our best efforts to keep the APIs backwards-compatible.</font>
 */
public class PayloadAttributeImpl extends AttributeImpl implements PayloadAttribute, Cloneable, Serializable {
  private Payload payload;

  /**
   * Initialize this attribute with no payload.
   */
  public PayloadAttributeImpl() {}

  /**
   * Initialize this attribute with the given payload.
   */
  public PayloadAttributeImpl(Payload payload) {
    this.payload = payload;
  }

  /**
   * Returns this Token's payload.
   */
  public Payload getPayload() {
    return this.payload;
  }

  /**
   * Sets this Token's payload.
   */
  public void setPayload(Payload payload) {
    this.payload = payload;
  }

  public void clear() {
    payload = null;
  }

  public Object clone() {
    PayloadAttributeImpl clone = (PayloadAttributeImpl) super.clone();
    if (payload != null) {
      clone.payload = (Payload) payload.clone();
    }
    return clone;
  }

  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }

    // check against the implementing class, which is also the type we cast to
    if (other instanceof PayloadAttributeImpl) {
      PayloadAttributeImpl o = (PayloadAttributeImpl) other;
      if (o.payload == null || payload == null) {
        return o.payload == null && payload == null;
      }

      return o.payload.equals(payload);
    }

    return false;
  }

  public int hashCode() {
    return (payload == null) ? 0 : payload.hashCode();
  }

  public void copyTo(AttributeImpl target) {
    PayloadAttribute t = (PayloadAttribute) target;
    t.setPayload((payload == null) ? null : (Payload) payload.clone());
  }
}

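A filter that attaches a payload to every token could be sketched like this (hedged: the filter name and the one-byte encoding are invented for the example; only the PayloadAttribute and Payload APIs shown above are assumed):

  public final class MarkerPayloadFilter extends TokenFilter {
    private PayloadAttribute payloadAtt;

    public MarkerPayloadFilter(TokenStream input) {
      super(input);
      payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
    }

    public boolean incrementToken() throws IOException {
      if (!input.incrementToken()) return false;
      // attach an arbitrary one-byte marker to the current token
      payloadAtt.setPayload(new Payload(new byte[] { 1 }));
      return true;
    }
  }
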
@ -17,13 +17,10 @@ package org.apache.lucene.analysis.tokenattributes;
 * limitations under the License.
 */

import java.io.Serializable;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Attribute;

/** The positionIncrement determines the position of this token
 * relative to the previous Token in a {@link TokenStream}, used in phrase
 * relative to the previous Token in a TokenStream, used in phrase
 * searching.
 *
 * <p>The default value is one.

@ -53,54 +50,15 @@ import org.apache.lucene.util.Attribute;
 *
 * @see org.apache.lucene.index.TermPositions
 */
public class PositionIncrementAttribute extends Attribute implements Cloneable, Serializable {
  private int positionIncrement = 1;

public interface PositionIncrementAttribute extends Attribute {
  /** Set the position increment. The default value is one.
   *
   * @param positionIncrement the distance from the prior term
   */
  public void setPositionIncrement(int positionIncrement) {
    if (positionIncrement < 0)
      throw new IllegalArgumentException
        ("Increment must be zero or greater: " + positionIncrement);
    this.positionIncrement = positionIncrement;
  }
  public void setPositionIncrement(int positionIncrement);

  /** Returns the position increment of this Token.
   * @see #setPositionIncrement
   */
  public int getPositionIncrement() {
    return positionIncrement;
  }

  public void clear() {
    this.positionIncrement = 1;
  }

  public String toString() {
    return "positionIncrement=" + positionIncrement;
  }

  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }

    if (other instanceof PositionIncrementAttribute) {
      return positionIncrement == ((PositionIncrementAttribute) other).positionIncrement;
    }

    return false;
  }

  public int hashCode() {
    return positionIncrement;
  }

  public void copyTo(Attribute target) {
    PositionIncrementAttribute t = (PositionIncrementAttribute) target;
    t.setPositionIncrement(positionIncrement);
  }

  public int getPositionIncrement();
}

@ -0,0 +1,102 @@
package org.apache.lucene.analysis.tokenattributes;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Serializable;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.AttributeImpl;

/** The positionIncrement determines the position of this token
 * relative to the previous Token in a {@link TokenStream}, used in phrase
 * searching.
 *
 * <p>The default value is one.
 *
 * <p>Some common uses for this are:<ul>
 *
 * <li>Set it to zero to put multiple terms in the same position. This is
 * useful if, e.g., a word has multiple stems. Searches for phrases
 * including either stem will match. In this case, all but the first stem's
 * increment should be set to zero: the increment of the first instance
 * should be one. Repeating a token with an increment of zero can also be
 * used to boost the scores of matches on that token.
 *
 * <li>Set it to values greater than one to inhibit exact phrase matches.
 * If, for example, one does not want phrases to match across removed stop
 * words, then one could build a stop word filter that removes stop words and
 * also sets the increment to the number of stop words removed before each
 * non-stop word. Then exact phrase queries will only match when the terms
 * occur with no intervening stop words.
 *
 * </ul>
 *
 * <p><font color="#FF0000">
 * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
 * The APIs introduced in these classes with Lucene 2.9 might change in the future.
 * We will make our best efforts to keep the APIs backwards-compatible.</font>
 *
 * @see org.apache.lucene.index.TermPositions
 */
public class PositionIncrementAttributeImpl extends AttributeImpl implements PositionIncrementAttribute, Cloneable, Serializable {
  private int positionIncrement = 1;

  /** Set the position increment. The default value is one.
   *
   * @param positionIncrement the distance from the prior term
   */
  public void setPositionIncrement(int positionIncrement) {
    if (positionIncrement < 0)
      throw new IllegalArgumentException
        ("Increment must be zero or greater: " + positionIncrement);
    this.positionIncrement = positionIncrement;
  }

  /** Returns the position increment of this Token.
   * @see #setPositionIncrement
   */
  public int getPositionIncrement() {
    return positionIncrement;
  }

  public void clear() {
    this.positionIncrement = 1;
  }

  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }

    if (other instanceof PositionIncrementAttributeImpl) {
      return positionIncrement == ((PositionIncrementAttributeImpl) other).positionIncrement;
    }

    return false;
  }

  public int hashCode() {
    return positionIncrement;
  }

  public void copyTo(AttributeImpl target) {
    PositionIncrementAttribute t = (PositionIncrementAttribute) target;
    t.setPositionIncrement(positionIncrement);
  }
}

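To make the first use case in the javadoc above concrete, a filter that injects a synonym at the same position as the original term might look roughly like the sketch below (hedged: the filter name and the lookup helper are invented for the example; a real implementation would consult a synonym map):

  public final class OneSynonymFilter extends TokenFilter {
    private TermAttribute termAtt;
    private PositionIncrementAttribute posIncrAtt;
    private String pendingSynonym; // at most one synonym per token in this sketch

    public OneSynonymFilter(TokenStream input) {
      super(input);
      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
      posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
    }

    public boolean incrementToken() throws IOException {
      if (pendingSynonym != null) {
        // emit the buffered synonym at the same position as the original term
        termAtt.setTermBuffer(pendingSynonym);
        posIncrAtt.setPositionIncrement(0);
        pendingSynonym = null;
        return true;
      }
      if (!input.incrementToken()) return false;
      pendingSynonym = lookupSynonym(termAtt.term()); // hypothetical lookup
      return true;
    }

    private String lookupSynonym(String term) {
      return "car".equals(term) ? "automobile" : null; // toy example
    }
  }
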
@ -17,9 +17,6 @@ package org.apache.lucene.analysis.tokenattributes;
 * limitations under the License.
 */

import java.io.Serializable;

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Attribute;

/**

@ -30,12 +27,7 @@ import org.apache.lucene.util.Attribute;
 * The APIs introduced in these classes with Lucene 2.9 might change in the future.
 * We will make our best efforts to keep the APIs backwards-compatible.</font>
 */
public class TermAttribute extends Attribute implements Cloneable, Serializable {
  private static int MIN_BUFFER_SIZE = 10;

  private char[] termBuffer;
  private int termLength;

public interface TermAttribute extends Attribute {
  /** Returns the Token's term text.
   *
   * This method has a performance penalty

@ -45,38 +37,20 @@ public class TermAttribute extends Attribute implements Cloneable, Serializable
   * String, use this method, which is nothing more than
   * a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b>
   */
  public String term() {
    initTermBuffer();
    return new String(termBuffer, 0, termLength);
  }

  public String term();

  /** Copies the contents of buffer, starting at offset for
   * length characters, into the termBuffer array.
   * @param buffer the buffer to copy
   * @param offset the index in the buffer of the first character to copy
   * @param length the number of characters to copy
   */
  public void setTermBuffer(char[] buffer, int offset, int length) {
    char[] newCharBuffer = growTermBuffer(length);
    if (newCharBuffer != null) {
      termBuffer = newCharBuffer;
    }
    System.arraycopy(buffer, offset, termBuffer, 0, length);
    termLength = length;
  }
  public void setTermBuffer(char[] buffer, int offset, int length);

  /** Copies the contents of buffer into the termBuffer array.
   * @param buffer the buffer to copy
   */
  public void setTermBuffer(String buffer) {
    int length = buffer.length();
    char[] newCharBuffer = growTermBuffer(length);
    if (newCharBuffer != null) {
      termBuffer = newCharBuffer;
    }
    buffer.getChars(0, length, termBuffer, 0);
    termLength = length;
  }
  public void setTermBuffer(String buffer);

  /** Copies the contents of buffer, starting at offset and continuing
   * for length characters, into the termBuffer array.

@ -84,17 +58,8 @@ public class TermAttribute extends Attribute implements Cloneable, Serializable
   * @param offset the index in the buffer of the first character to copy
   * @param length the number of characters to copy
   */
  public void setTermBuffer(String buffer, int offset, int length) {
    assert offset <= buffer.length();
    assert offset + length <= buffer.length();
    char[] newCharBuffer = growTermBuffer(length);
    if (newCharBuffer != null) {
      termBuffer = newCharBuffer;
    }
    buffer.getChars(offset, offset + length, termBuffer, 0);
    termLength = length;
  }

  public void setTermBuffer(String buffer, int offset, int length);

  /** Returns the internal termBuffer character array which
   * you can then directly alter. If the array is too
   * small for your token, use {@link

@ -102,10 +67,7 @@ public class TermAttribute extends Attribute implements Cloneable, Serializable
   * altering the buffer be sure to call {@link
   * #setTermLength} to record the number of valid
   * characters that were placed into the termBuffer. */
  public char[] termBuffer() {
    initTermBuffer();
    return termBuffer;
  }
  public char[] termBuffer();

  /** Grows the termBuffer to at least size newSize, preserving the
   * existing content. Note: If the next operation is to change

@ -117,63 +79,12 @@ public class TermAttribute extends Attribute implements Cloneable, Serializable
   * @param newSize minimum size of the new termBuffer
   * @return newly created termBuffer with length >= newSize
   */
  public char[] resizeTermBuffer(int newSize) {
    char[] newCharBuffer = growTermBuffer(newSize);
    if (termBuffer == null) {
      // If there were termText, then preserve it.
      // note that if termBuffer is null then newCharBuffer cannot be null
      assert newCharBuffer != null;
      termBuffer = newCharBuffer;
    } else if (newCharBuffer != null) {
      // Note: if newCharBuffer != null then termBuffer needs to grow.
      // If there were a termBuffer, then preserve it
      System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
      termBuffer = newCharBuffer;
    }
    return termBuffer;
  }

  /** Allocates a buffer char[] of at least newSize
   * @param newSize minimum size of the buffer
   * @return newly created buffer with length >= newSize or null if the current termBuffer is big enough
   */
  private char[] growTermBuffer(int newSize) {
    if (termBuffer != null) {
      if (termBuffer.length >= newSize)
        // Already big enough
        return null;
      else
        // Not big enough; create a new array with slight
        // over allocation:
        return new char[ArrayUtil.getNextSize(newSize)];
    } else {

      // determine the best size
      // The buffer is always at least MIN_BUFFER_SIZE
      if (newSize < MIN_BUFFER_SIZE) {
        newSize = MIN_BUFFER_SIZE;
      }

      return new char[newSize];
    }
  }

  // TODO: once we remove the deprecated termText() method
  // and switch entirely to char[] termBuffer we don't need
  // to use this method anymore
  private void initTermBuffer() {
    if (termBuffer == null) {
      termBuffer = new char[MIN_BUFFER_SIZE];
      termLength = 0;
    }
  }
  public char[] resizeTermBuffer(int newSize);

  /** Return number of valid characters (length of the term)
   * in the termBuffer array. */
  public int termLength() {
    return termLength;
  }

  public int termLength();

  /** Set number of valid characters (length of the term) in
   * the termBuffer array. Use this to truncate the termBuffer
   * or to synchronize with external manipulation of the termBuffer.

@ -181,61 +92,5 @@ public class TermAttribute extends Attribute implements Cloneable, Serializable
   * use {@link #resizeTermBuffer(int)} first.
   * @param length the truncated length
   */
  public void setTermLength(int length) {
    initTermBuffer();
    if (length > termBuffer.length)
      throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")");
    termLength = length;
  }

  public int hashCode() {
    initTermBuffer();
    int code = termLength;
    code = code * 31 + ArrayUtil.hashCode(termBuffer, 0, termLength);
    return code;
  }

  public void clear() {
    termLength = 0;
  }

  public Object clone() {
    TermAttribute t = (TermAttribute)super.clone();
    // Do a deep clone
    if (termBuffer != null) {
      t.termBuffer = (char[]) termBuffer.clone();
    }
    return t;
  }

  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }

    if (other instanceof TermAttribute) {
      initTermBuffer();
      TermAttribute o = ((TermAttribute) other);
      o.initTermBuffer();

      for(int i=0;i<termLength;i++) {
        if (termBuffer[i] != o.termBuffer[i]) {
          return false;
        }
      }
      return true;
    }

    return false;
  }

  public String toString() {
    initTermBuffer();
    return "term=" + new String(termBuffer, 0, termLength);
  }

  public void copyTo(Attribute target) {
    TermAttribute t = (TermAttribute) target;
    t.setTermBuffer(termBuffer, 0, termLength);
  }
  public void setTermLength(int length);
}

@ -0,0 +1,241 @@
|
|||
package org.apache.lucene.analysis.tokenattributes;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.AttributeImpl;
|
||||
|
||||
/**
|
||||
* The term text of a Token.
|
||||
*
|
||||
* <p><font color="#FF0000">
|
||||
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
|
||||
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
|
||||
* We will make our best efforts to keep the APIs backwards-compatible.</font>
|
||||
*/
|
||||
public class TermAttributeImpl extends AttributeImpl implements TermAttribute, Cloneable, Serializable {
|
||||
private static int MIN_BUFFER_SIZE = 10;
|
||||
|
||||
private char[] termBuffer;
|
||||
private int termLength;
|
||||
|
||||
/** Returns the Token's term text.
|
||||
*
|
||||
* This method has a performance penalty
|
||||
* because the text is stored internally in a char[]. If
|
||||
* possible, use {@link #termBuffer()} and {@link
|
||||
* #termLength()} directly instead. If you really need a
|
||||
* String, use this method, which is nothing more than
|
||||
* a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b>
|
||||
*/
|
||||
public String term() {
|
||||
initTermBuffer();
|
||||
return new String(termBuffer, 0, termLength);
|
||||
}
|
||||
|
||||
/** Copies the contents of buffer, starting at offset for
|
||||
* length characters, into the termBuffer array.
|
||||
* @param buffer the buffer to copy
|
||||
* @param offset the index in the buffer of the first character to copy
|
||||
* @param length the number of characters to copy
|
||||
*/
|
||||
public void setTermBuffer(char[] buffer, int offset, int length) {
|
||||
char[] newCharBuffer = growTermBuffer(length);
|
||||
if (newCharBuffer != null) {
|
||||
termBuffer = newCharBuffer;
|
||||
}
|
||||
System.arraycopy(buffer, offset, termBuffer, 0, length);
|
||||
termLength = length;
|
||||
}
|
||||
|
||||
/** Copies the contents of buffer into the termBuffer array.
|
||||
* @param buffer the buffer to copy
|
||||
*/
|
||||
public void setTermBuffer(String buffer) {
|
||||
int length = buffer.length();
|
||||
char[] newCharBuffer = growTermBuffer(length);
|
||||
if (newCharBuffer != null) {
|
||||
termBuffer = newCharBuffer;
|
||||
}
|
||||
buffer.getChars(0, length, termBuffer, 0);
|
||||
termLength = length;
|
||||
}
|
||||
|
||||
/** Copies the contents of buffer, starting at offset and continuing
|
||||
* for length characters, into the termBuffer array.
|
||||
* @param buffer the buffer to copy
|
||||
* @param offset the index in the buffer of the first character to copy
|
||||
* @param length the number of characters to copy
|
||||
*/
|
||||
public void setTermBuffer(String buffer, int offset, int length) {
|
||||
assert offset <= buffer.length();
|
||||
assert offset + length <= buffer.length();
|
||||
char[] newCharBuffer = growTermBuffer(length);
|
||||
if (newCharBuffer != null) {
|
||||
termBuffer = newCharBuffer;
|
||||
}
|
||||
    buffer.getChars(offset, offset + length, termBuffer, 0);
    termLength = length;
  }

  /** Returns the internal termBuffer character array which
   *  you can then directly alter. If the array is too
   *  small for your token, use {@link
   *  #resizeTermBuffer(int)} to increase it. After
   *  altering the buffer be sure to call {@link
   *  #setTermLength} to record the number of valid
   *  characters that were placed into the termBuffer. */
  public char[] termBuffer() {
    initTermBuffer();
    return termBuffer;
  }

  /** Grows the termBuffer to at least size newSize, preserving the
   *  existing content. Note: If the next operation is to change
   *  the contents of the term buffer use
   *  {@link #setTermBuffer(char[], int, int)},
   *  {@link #setTermBuffer(String)}, or
   *  {@link #setTermBuffer(String, int, int)}
   *  to optimally combine the resize with the setting of the termBuffer.
   *  @param newSize minimum size of the new termBuffer
   *  @return newly created termBuffer with length >= newSize
   */
  public char[] resizeTermBuffer(int newSize) {
    char[] newCharBuffer = growTermBuffer(newSize);
    if (termBuffer == null) {
      // If there was termText, then preserve it.
      // Note that if termBuffer is null then newCharBuffer cannot be null.
      assert newCharBuffer != null;
      termBuffer = newCharBuffer;
    } else if (newCharBuffer != null) {
      // Note: if newCharBuffer != null then termBuffer needs to grow.
      // If there was a termBuffer, then preserve it.
      System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
      termBuffer = newCharBuffer;
    }
    return termBuffer;
  }

  /** Allocates a buffer char[] of at least newSize
   *  @param newSize minimum size of the buffer
   *  @return newly created buffer with length >= newSize, or null if the current termBuffer is big enough
   */
  private char[] growTermBuffer(int newSize) {
    if (termBuffer != null) {
      if (termBuffer.length >= newSize)
        // Already big enough
        return null;
      else
        // Not big enough; create a new array with slight
        // over allocation:
        return new char[ArrayUtil.getNextSize(newSize)];
    } else {
      // determine the best size
      // The buffer is always at least MIN_BUFFER_SIZE
      if (newSize < MIN_BUFFER_SIZE) {
        newSize = MIN_BUFFER_SIZE;
      }
      return new char[newSize];
    }
  }

  // TODO: once we remove the deprecated termText() method
  // and switch entirely to char[] termBuffer we don't need
  // to use this method anymore
  private void initTermBuffer() {
    if (termBuffer == null) {
      termBuffer = new char[MIN_BUFFER_SIZE];
      termLength = 0;
    }
  }

  /** Return number of valid characters (length of the term)
   *  in the termBuffer array. */
  public int termLength() {
    return termLength;
  }

  /** Set number of valid characters (length of the term) in
   *  the termBuffer array. Use this to truncate the termBuffer
   *  or to synchronize with external manipulation of the termBuffer.
   *  Note: to grow the size of the array,
   *  use {@link #resizeTermBuffer(int)} first.
   *  @param length the truncated length
   */
  public void setTermLength(int length) {
    initTermBuffer();
    if (length > termBuffer.length)
      throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")");
    termLength = length;
  }

  public int hashCode() {
    initTermBuffer();
    int code = termLength;
    code = code * 31 + ArrayUtil.hashCode(termBuffer, 0, termLength);
    return code;
  }

  public void clear() {
    termLength = 0;
  }

  public Object clone() {
    TermAttributeImpl t = (TermAttributeImpl)super.clone();
    // Do a deep clone
    if (termBuffer != null) {
      t.termBuffer = (char[]) termBuffer.clone();
    }
    return t;
  }

  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }

    if (other instanceof TermAttributeImpl) {
      initTermBuffer();
      TermAttributeImpl o = ((TermAttributeImpl) other);
      o.initTermBuffer();

      // compare lengths first; otherwise terms sharing a prefix could
      // compare equal, or the loop could read past the shorter buffer
      if (termLength != o.termLength)
        return false;
      for(int i=0;i<termLength;i++) {
        if (termBuffer[i] != o.termBuffer[i]) {
          return false;
        }
      }
      return true;
    }

    return false;
  }

  public String toString() {
    initTermBuffer();
    return "term=" + new String(termBuffer, 0, termLength);
  }

  public void copyTo(AttributeImpl target) {
    TermAttribute t = (TermAttribute) target;
    t.setTermBuffer(termBuffer, 0, termLength);
  }
}
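
For illustration, a minimal sketch of how the buffer accessors above are meant
to be combined in a filter (the upper-casing logic here is a placeholder, not
part of this patch):

  // Read the term in place through TermAttribute and modify it directly.
  // A filter that grows the term would call resizeTermBuffer(newLen) first
  // and setTermLength(newLen) afterwards.
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) return false;
    char[] buffer = termAtt.termBuffer();   // internal array, may be altered
    int length = termAtt.termLength();
    for (int i = 0; i < length; i++) {
      buffer[i] = Character.toUpperCase(buffer[i]);
    }
    return true;
  }
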
@@ -17,8 +17,6 @@ package org.apache.lucene.analysis.tokenattributes;
 * limitations under the License.
 */

import java.io.Serializable;

import org.apache.lucene.util.Attribute;

/**

@@ -29,55 +27,11 @@ import org.apache.lucene.util.Attribute;
 * The APIs introduced in these classes with Lucene 2.9 might change in the future.
 * We will make our best efforts to keep the APIs backwards-compatible.</font>
 */
public class TypeAttribute extends Attribute implements Cloneable, Serializable {
  private String type;
  public static final String DEFAULT_TYPE = "word";

  public TypeAttribute() {
    this(DEFAULT_TYPE);
  }

  public TypeAttribute(String type) {
    this.type = type;
  }

public interface TypeAttribute extends Attribute {
  /** Returns this Token's lexical type. Defaults to "word". */
  public String type() {
    return type;
  }
  public String type();

  /** Set the lexical type.
      @see #type() */
  public void setType(String type) {
    this.type = type;
  }

  public void clear() {
    type = DEFAULT_TYPE;
  }

  public String toString() {
    return "type=" + type;
  }

  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }

    if (other instanceof TypeAttribute) {
      return type.equals(((TypeAttribute) other).type);
    }

    return false;
  }

  public int hashCode() {
    return type.hashCode();
  }

  public void copyTo(Attribute target) {
    TypeAttribute t = (TypeAttribute) target;
    t.setType(new String(type));
  }
  public void setType(String type);
}

@@ -0,0 +1,79 @@
package org.apache.lucene.analysis.tokenattributes;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Serializable;

import org.apache.lucene.util.AttributeImpl;

/**
 * A Token's lexical type. The Default value is "word".
 *
 * <p><font color="#FF0000">
 * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
 * The APIs introduced in these classes with Lucene 2.9 might change in the future.
 * We will make our best efforts to keep the APIs backwards-compatible.</font>
 */
public class TypeAttributeImpl extends AttributeImpl implements TypeAttribute, Cloneable, Serializable {
  private String type;
  public static final String DEFAULT_TYPE = "word";

  public TypeAttributeImpl() {
    this(DEFAULT_TYPE);
  }

  public TypeAttributeImpl(String type) {
    this.type = type;
  }

  /** Returns this Token's lexical type. Defaults to "word". */
  public String type() {
    return type;
  }

  /** Set the lexical type.
      @see #type() */
  public void setType(String type) {
    this.type = type;
  }

  public void clear() {
    type = DEFAULT_TYPE;
  }

  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }

    if (other instanceof TypeAttributeImpl) {
      return type.equals(((TypeAttributeImpl) other).type);
    }

    return false;
  }

  public int hashCode() {
    return type.hashCode();
  }

  public void copyTo(AttributeImpl target) {
    TypeAttribute t = (TypeAttribute) target;
    t.setType(new String(type));
  }
}
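
A short usage sketch for the new interface/implementation split (stream and
the synonym remark are placeholders): callers code against TypeAttribute,
while the AttributeSource supplies a TypeAttributeImpl behind the scenes.

  TypeAttribute typeAtt = (TypeAttribute) stream.addAttribute(TypeAttribute.class);
  while (stream.incrementToken()) {
    if (TypeAttributeImpl.DEFAULT_TYPE.equals(typeAtt.type())) {
      // plain "word" token; a filter could mark synonyms via typeAtt.setType(...)
    }
  }
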
@@ -20,7 +20,6 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

@@ -83,7 +82,6 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
      final int valueLength = stringValue.length();
      perThread.singleTokenTokenStream.reinit(stringValue, 0, valueLength);
      fieldState.attributeSource = perThread.singleTokenTokenStream;
      perThread.localTokenStream.reset();
      consumer.start(field);

      boolean success = false;

@@ -132,21 +130,15 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
      try {
        int offsetEnd = fieldState.offset-1;

        boolean useNewTokenStreamAPI = stream.useNewAPI();
        Token localToken = null;

        if (useNewTokenStreamAPI) {
          fieldState.attributeSource = stream;
        } else {
          fieldState.attributeSource = perThread.localTokenStream;
          localToken = perThread.localToken;
        }

        consumer.start(field);
        boolean hasMoreTokens = stream.incrementToken();

        fieldState.attributeSource = stream;

        OffsetAttribute offsetAttribute = (OffsetAttribute) fieldState.attributeSource.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);

        consumer.start(field);

        for(;;) {

          // If we hit an exception in stream.next below

@@ -155,14 +147,8 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
          // non-aborting and (above) this one document
          // will be marked as deleted, but still
          // consume a docID
          Token token = null;
          if (useNewTokenStreamAPI) {
            if (!stream.incrementToken()) break;
          } else {
            token = stream.next(localToken);
            if (token == null) break;
            perThread.localTokenStream.set(token);
          }

          if (!hasMoreTokens) break;

          final int posIncr = posIncrAttribute.getPositionIncrement();
          fieldState.position += posIncr;

@@ -194,6 +180,8 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
            docState.infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens");
            break;
          }

          hasMoreTokens = stream.incrementToken();
        }
        fieldState.offset = offsetEnd+1;
      } finally {
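
Stripped of the indexer's bookkeeping, the consume loop above follows this
general pattern for any client of the new API (a simplified sketch, not the
indexer's actual code):

  // Fetch the attribute instances once, before iterating; after each
  // successful incrementToken() they hold the values of the current token.
  OffsetAttribute offsetAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);
  PositionIncrementAttribute posIncrAtt =
    (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);

  int position = 0;
  while (stream.incrementToken()) {
    position += posIncrAtt.getPositionIncrement();
    int start = offsetAtt.startOffset();
    int end = offsetAtt.endOffset();
    // hand (position, start, end) to the consumer
  }
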
@@ -19,15 +19,9 @@ package org.apache.lucene.index;

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Attribute;

/** This is a DocFieldConsumer that inverts each field,
 *  separately, from a Document, and accepts a

@@ -37,10 +31,8 @@ final class DocInverterPerThread extends DocFieldConsumerPerThread {
  final DocInverter docInverter;
  final InvertedDocConsumerPerThread consumer;
  final InvertedDocEndConsumerPerThread endConsumer;
  final Token localToken = new Token();
  //TODO: change to SingleTokenTokenStream after Token was removed
  final SingleTokenTokenStream singleTokenTokenStream = new SingleTokenTokenStream();
  final BackwardsCompatibilityStream localTokenStream = new BackwardsCompatibilityStream();

  static class SingleTokenTokenStream extends TokenStream {
    TermAttribute termAttribute;

@@ -55,76 +47,13 @@ final class DocInverterPerThread extends DocFieldConsumerPerThread {
      termAttribute.setTermBuffer(stringValue);
      offsetAttribute.setOffset(startOffset, endOffset);
    }

    // dummy implementation: this class does not implement any iteration
    // method, so incrementToken() must never be called and simply throws
    public boolean incrementToken() {
      throw new UnsupportedOperationException();
    }
  }

  /** This stream wrapper is only used to maintain backwards compatibility with the
   *  old TokenStream API and can be removed in Lucene 3.0
   *  @deprecated
   */
  static class BackwardsCompatibilityStream extends TokenStream {
    private Token token;

    TermAttribute termAttribute = new TermAttribute() {
      public String term() {
        return token.term();
      }

      public char[] termBuffer() {
        return token.termBuffer();
      }

      public int termLength() {
        return token.termLength();
      }
    };
    OffsetAttribute offsetAttribute = new OffsetAttribute() {
      public int startOffset() {
        return token.startOffset();
      }

      public int endOffset() {
        return token.endOffset();
      }
    };

    PositionIncrementAttribute positionIncrementAttribute = new PositionIncrementAttribute() {
      public int getPositionIncrement() {
        return token.getPositionIncrement();
      }
    };

    FlagsAttribute flagsAttribute = new FlagsAttribute() {
      public int getFlags() {
        return token.getFlags();
      }
    };

    PayloadAttribute payloadAttribute = new PayloadAttribute() {
      public Payload getPayload() {
        return token.getPayload();
      }
    };

    TypeAttribute typeAttribute = new TypeAttribute() {
      public String type() {
        return token.type();
      }
    };

    BackwardsCompatibilityStream() {
      attributes.put(TermAttribute.class, termAttribute);
      attributes.put(OffsetAttribute.class, offsetAttribute);
      attributes.put(PositionIncrementAttribute.class, positionIncrementAttribute);
      attributes.put(FlagsAttribute.class, flagsAttribute);
      attributes.put(PayloadAttribute.class, payloadAttribute);
      attributes.put(TypeAttribute.class, typeAttribute);
    }

    public void set(Token token) {
      this.token = token;
    }
  };

  final DocumentsWriter.DocState docState;

  final FieldInvertState fieldState = new FieldInvertState();
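
The wrapper above relies on one trick: each anonymous attribute reads through
to a shared Token, so a token produced by the deprecated next(Token) API is
visible through the new interfaces without copying. Sketched usage (consume()
stands in for any code that reads the attributes; not the shipping code):

  Token token = oldStream.next(new Token());
  while (token != null) {
    localTokenStream.set(token);   // attributes now reflect this token
    consume(localTokenStream);     // reads via TermAttribute, OffsetAttribute, ...
    token = oldStream.next(token);
  }
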
@@ -531,66 +531,41 @@ public class QueryParser implements QueryParserConstants {
    PositionIncrementAttribute posIncrAtt = null;
    int numTokens = 0;

    org.apache.lucene.analysis.Token reusableToken = null;
    org.apache.lucene.analysis.Token nextToken = null;

    boolean useNewAPI = TokenStream.useNewAPIDefault();

    if (useNewAPI) {
      boolean success = false;
      try {
        buffer.reset();
        success = true;
      } catch (IOException e) {
        // success==false if we hit an exception
    boolean success = false;
    try {
      buffer.reset();
      success = true;
    } catch (IOException e) {
      // success==false if we hit an exception
    }
    if (success) {
      if (buffer.hasAttribute(TermAttribute.class)) {
        termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class);
      }
    if (success) {
      if (buffer.hasAttribute(TermAttribute.class)) {
        termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class);
      }
      if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
        posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class);
      }
      if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
        posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class);
      }
    } else {
      reusableToken = new org.apache.lucene.analysis.Token();
    }

    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    if (useNewAPI) {
      if (termAtt != null) {
        try {
          while (buffer.incrementToken()) {
            numTokens++;
            int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
            if (positionIncrement != 0) {
              positionCount += positionIncrement;
            } else {
              severalTokensAtSamePosition = true;
            }
    boolean hasMoreTokens = false;
    if (termAtt != null) {
      try {
        hasMoreTokens = buffer.incrementToken();
        while (hasMoreTokens) {
          numTokens++;
          int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
          if (positionIncrement != 0) {
            positionCount += positionIncrement;
          } else {
            severalTokensAtSamePosition = true;
          }
        } catch (IOException e) {
          // ignore
          hasMoreTokens = buffer.incrementToken();
        }
      }
    } else {
      while (true) {
        try {
          nextToken = buffer.next(reusableToken);
        }
        catch (IOException e) {
          nextToken = null;
        }
        if (nextToken == null)
          break;
        numTokens++;
        if (nextToken.getPositionIncrement() != 0)
          positionCount += nextToken.getPositionIncrement();
        else
          severalTokensAtSamePosition = true;
      } catch (IOException e) {
        // ignore
      }
    }
    try {

@@ -609,16 +584,9 @@ public class QueryParser implements QueryParserConstants {
    else if (numTokens == 1) {
      String term = null;
      try {

        if (useNewAPI) {
          boolean hasNext = buffer.incrementToken();
          assert hasNext == true;
          term = termAtt.term();
        } else {
          nextToken = buffer.next(reusableToken);
          assert nextToken != null;
          term = nextToken.term();
        }
        boolean hasNext = buffer.incrementToken();
        assert hasNext == true;
        term = termAtt.term();
      } catch (IOException e) {
        // safe to ignore, because we know the number of tokens
      }

@@ -631,15 +599,9 @@ public class QueryParser implements QueryParserConstants {
      for (int i = 0; i < numTokens; i++) {
        String term = null;
        try {
          if (useNewAPI) {
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            term = termAtt.term();
          } else {
            nextToken = buffer.next(reusableToken);
            assert nextToken != null;
            term = nextToken.term();
          }
          boolean hasNext = buffer.incrementToken();
          assert hasNext == true;
          term = termAtt.term();
        } catch (IOException e) {
          // safe to ignore, because we know the number of tokens
        }

@@ -660,18 +622,11 @@ public class QueryParser implements QueryParserConstants {
          String term = null;
          int positionIncrement = 1;
          try {
            if (useNewAPI) {
              boolean hasNext = buffer.incrementToken();
              assert hasNext == true;
              term = termAtt.term();
              if (posIncrAtt != null) {
                positionIncrement = posIncrAtt.getPositionIncrement();
              }
            } else {
              nextToken = buffer.next(reusableToken);
              assert nextToken != null;
              term = nextToken.term();
              positionIncrement = nextToken.getPositionIncrement();
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            term = termAtt.term();
            if (posIncrAtt != null) {
              positionIncrement = posIncrAtt.getPositionIncrement();
            }
          } catch (IOException e) {
            // safe to ignore, because we know the number of tokens

@@ -707,19 +662,11 @@ public class QueryParser implements QueryParserConstants {
          int positionIncrement = 1;

          try {
            if (useNewAPI) {

              boolean hasNext = buffer.incrementToken();
              assert hasNext == true;
              term = termAtt.term();
              if (posIncrAtt != null) {
                positionIncrement = posIncrAtt.getPositionIncrement();
              }
            } else {
              nextToken = buffer.next(reusableToken);
              assert nextToken != null;
              term = nextToken.term();
              positionIncrement = nextToken.getPositionIncrement();
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            term = termAtt.term();
            if (posIncrAtt != null) {
              positionIncrement = posIncrAtt.getPositionIncrement();
            }
          } catch (IOException e) {
            // safe to ignore, because we know the number of tokens

@@ -1625,12 +1572,6 @@ public class QueryParser implements QueryParserConstants {
    finally { jj_save(0, xla); }
  }

  private boolean jj_3R_3() {
    if (jj_scan_token(STAR)) return true;
    if (jj_scan_token(COLON)) return true;
    return false;
  }

  private boolean jj_3R_2() {
    if (jj_scan_token(TERM)) return true;
    if (jj_scan_token(COLON)) return true;

@@ -1647,6 +1588,12 @@ public class QueryParser implements QueryParserConstants {
    return false;
  }

  private boolean jj_3R_3() {
    if (jj_scan_token(STAR)) return true;
    if (jj_scan_token(COLON)) return true;
    return false;
  }

  /** Generated Token Manager. */
  public QueryParserTokenManager token_source;
  /** Current token. */
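
The counting pass that replaced the useNewAPI branches boils down to this
pattern (a condensed sketch, not the generated parser code):

  // First pass: count tokens and positions; a position increment of 0 means
  // several tokens are stacked at the same position (e.g. synonyms).
  int numTokens = 0, positionCount = 0;
  boolean severalTokensAtSamePosition = false;
  while (buffer.incrementToken()) {
    numTokens++;
    int inc = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
    if (inc != 0) positionCount += inc;
    else severalTokensAtSamePosition = true;
  }
  // Second pass (after reset()): pull exactly numTokens terms via termAtt.term().
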
@@ -555,67 +555,42 @@ public class QueryParser {
    PositionIncrementAttribute posIncrAtt = null;
    int numTokens = 0;

    org.apache.lucene.analysis.Token reusableToken = null;
    org.apache.lucene.analysis.Token nextToken = null;

    boolean useNewAPI = TokenStream.useNewAPIDefault();

    if (useNewAPI) {
      boolean success = false;
      try {
        buffer.reset();
        success = true;
      } catch (IOException e) {
        // success==false if we hit an exception
      }
      if (success) {
        if (buffer.hasAttribute(TermAttribute.class)) {
          termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class);
        }
        if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
          posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class);
        }
      }
    } else {
      reusableToken = new org.apache.lucene.analysis.Token();
    boolean success = false;
    try {
      buffer.reset();
      success = true;
    } catch (IOException e) {
      // success==false if we hit an exception
    }

    if (success) {
      if (buffer.hasAttribute(TermAttribute.class)) {
        termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class);
      }
      if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
        posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class);
      }
    }

    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    if (useNewAPI) {
      if (termAtt != null) {
        try {
          while (buffer.incrementToken()) {
            numTokens++;
            int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
            if (positionIncrement != 0) {
              positionCount += positionIncrement;
            } else {
              severalTokensAtSamePosition = true;
            }
    boolean hasMoreTokens = false;
    if (termAtt != null) {
      try {
        hasMoreTokens = buffer.incrementToken();
        while (hasMoreTokens) {
          numTokens++;
          int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
          if (positionIncrement != 0) {
            positionCount += positionIncrement;
          } else {
            severalTokensAtSamePosition = true;
          }
        } catch (IOException e) {
          // ignore
          hasMoreTokens = buffer.incrementToken();
        }
      } catch (IOException e) {
        // ignore
      }
    } else {
      while (true) {
        try {
          nextToken = buffer.next(reusableToken);
        }
        catch (IOException e) {
          nextToken = null;
        }
        if (nextToken == null)
          break;
        numTokens++;
        if (nextToken.getPositionIncrement() != 0)
          positionCount += nextToken.getPositionIncrement();
        else
          severalTokensAtSamePosition = true;
      }
    }
    try {
      // rewind the buffer stream

@@ -627,22 +602,15 @@ public class QueryParser {
    catch (IOException e) {
      // ignore
    }

    if (numTokens == 0)
      return null;
    else if (numTokens == 1) {
      String term = null;
      try {

        if (useNewAPI) {
          boolean hasNext = buffer.incrementToken();
          assert hasNext == true;
          term = termAtt.term();
        } else {
          nextToken = buffer.next(reusableToken);
          assert nextToken != null;
          term = nextToken.term();
        }
        boolean hasNext = buffer.incrementToken();
        assert hasNext == true;
        term = termAtt.term();
      } catch (IOException e) {
        // safe to ignore, because we know the number of tokens
      }

@@ -655,19 +623,13 @@ public class QueryParser {
      for (int i = 0; i < numTokens; i++) {
        String term = null;
        try {
          if (useNewAPI) {
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            term = termAtt.term();
          } else {
            nextToken = buffer.next(reusableToken);
            assert nextToken != null;
            term = nextToken.term();
          }
          boolean hasNext = buffer.incrementToken();
          assert hasNext == true;
          term = termAtt.term();
        } catch (IOException e) {
          // safe to ignore, because we know the number of tokens
        }

        Query currentQuery = newTermQuery(
            new Term(field, term));
        q.add(currentQuery, BooleanClause.Occur.SHOULD);

@@ -684,18 +646,11 @@ public class QueryParser {
          String term = null;
          int positionIncrement = 1;
          try {
            if (useNewAPI) {
              boolean hasNext = buffer.incrementToken();
              assert hasNext == true;
              term = termAtt.term();
              if (posIncrAtt != null) {
                positionIncrement = posIncrAtt.getPositionIncrement();
              }
            } else {
              nextToken = buffer.next(reusableToken);
              assert nextToken != null;
              term = nextToken.term();
              positionIncrement = nextToken.getPositionIncrement();
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            term = termAtt.term();
            if (posIncrAtt != null) {
              positionIncrement = posIncrAtt.getPositionIncrement();
            }
          } catch (IOException e) {
            // safe to ignore, because we know the number of tokens

@@ -724,26 +679,18 @@ public class QueryParser {
          PhraseQuery pq = newPhraseQuery();
          pq.setSlop(phraseSlop);
          int position = -1;

          for (int i = 0; i < numTokens; i++) {
            String term = null;
            int positionIncrement = 1;

            try {
              if (useNewAPI) {

                boolean hasNext = buffer.incrementToken();
                assert hasNext == true;
                term = termAtt.term();
                if (posIncrAtt != null) {
                  positionIncrement = posIncrAtt.getPositionIncrement();
                }
              } else {
                nextToken = buffer.next(reusableToken);
                assert nextToken != null;
                term = nextToken.term();
                positionIncrement = nextToken.getPositionIncrement();
            try {
              boolean hasNext = buffer.incrementToken();
              assert hasNext == true;
              term = termAtt.term();
              if (posIncrAtt != null) {
                positionIncrement = posIncrAtt.getPositionIncrement();
              }
            } catch (IOException e) {
              // safe to ignore, because we know the number of tokens
@@ -27,7 +27,6 @@ import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.TermFreqVector;

@@ -59,17 +58,15 @@ public class QueryTermVector implements TermFreqVector {
    {
      List terms = new ArrayList();
      try {
        if (stream.useNewAPI()) {
          stream.reset();
          TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
          while (stream.incrementToken()) {
            terms.add(termAtt.term());
          }
        } else {
          final Token reusableToken = new Token();
          for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
            terms.add(nextToken.term());
          }
        boolean hasMoreTokens = false;

        stream.reset();
        TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);

        hasMoreTokens = stream.incrementToken();
        while (hasMoreTokens) {
          terms.add(termAtt.term());
          hasMoreTokens = stream.incrementToken();
        }
        processTerms((String[])terms.toArray(new String[terms.size()]));
      } catch (IOException e) {
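
The same extraction idiom works for any client that only needs the terms of a
stream (sketch; analyzer and text are placeholders):

  TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
  TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
  List terms = new ArrayList();
  while (stream.incrementToken()) {
    terms.add(termAtt.term());
  }
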
@@ -17,79 +17,14 @@ package org.apache.lucene.util;
 * limitations under the License.
 */

import java.io.Serializable;

/**
 * Base class for Attributes that can be added to a
 * {@link org.apache.lucene.util.AttributeSource}.
 * <p>
 * Attributes are used to add data in a dynamic, yet type-safe way to a source
 * of usually streamed objects, e. g. a {@link org.apache.lucene.analysis.TokenStream}.
 * Base interface for attributes.
 *
 * <p><font color="#FF0000">
 * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
 * The APIs introduced in these classes with Lucene 2.9 might change in the future.
 * We will make our best efforts to keep the APIs backwards-compatible.</font>
 */
public abstract class Attribute implements Cloneable, Serializable {
  /**
   * Clears the values in this Attribute and resets it to its
   * default value.
   */
  public abstract void clear();

  /**
   * Subclasses must implement this method and should follow a syntax
   * similar to this one:
   *
   * <pre>
   *   public String toString() {
   *     return "start=" + startOffset + ",end=" + endOffset;
   *   }
   * </pre>
   */
  public abstract String toString();

  /**
   * Subclasses must implement this method and should compute
   * a hashCode similar to this:
   * <pre>
   *   public int hashCode() {
   *     int code = startOffset;
   *     code = code * 31 + endOffset;
   *     return code;
   *   }
   * </pre>
   *
   * see also {@link #equals(Object)}
   */
  public abstract int hashCode();

  /**
   * All values used for computation of {@link #hashCode()}
   * should be checked here for equality.
   *
   * see also {@link Object#equals(Object)}
   */
  public abstract boolean equals(Object other);

  /**
   * Copies the values from this Attribute into the passed-in
   * target attribute. The type of the target must match the type
   * of this attribute.
   */
  public abstract void copyTo(Attribute target);

  /**
   * Shallow clone. Subclasses must override this if they
   * need to clone any members deeply,
   */
  public Object clone() {
    Object clone = null;
    try {
      clone = super.clone();
    } catch (CloneNotSupportedException e) {
      throw new RuntimeException(e); // shouldn't happen
    }
    return clone;
  }
public interface Attribute {
  public void clear();
}

@@ -0,0 +1,123 @@
package org.apache.lucene.util;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Serializable;
import java.lang.reflect.Field;

/**
 * Base class for Attributes that can be added to a
 * {@link org.apache.lucene.util.AttributeSource}.
 * <p>
 * Attributes are used to add data in a dynamic, yet type-safe way to a source
 * of usually streamed objects, e. g. a {@link org.apache.lucene.analysis.TokenStream}.
 * <p><font color="#FF0000">
 * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
 * The APIs introduced in these classes with Lucene 2.9 might change in the future.
 * We will make our best efforts to keep the APIs backwards-compatible.</font>
 */
public abstract class AttributeImpl implements Cloneable, Serializable {
  /**
   * Clears the values in this Attribute and resets it to its
   * default value.
   */
  public abstract void clear();

  /**
   * The default implementation of this method accesses all declared
   * fields of this object and prints the values in the following syntax:
   *
   * <pre>
   *   public String toString() {
   *     return "start=" + startOffset + ",end=" + endOffset;
   *   }
   * </pre>
   *
   * This method may be overridden by subclasses.
   */
  public String toString() {
    StringBuffer buffer = new StringBuffer();
    Class clazz = this.getClass();
    Field[] fields = clazz.getDeclaredFields();
    try {
      for (int i = 0; i < fields.length; i++) {
        Field f = fields[i];
        f.setAccessible(true);
        Object value = f.get(this);
        if (value == null) {
          buffer.append(f.getName() + "=null");
        } else {
          buffer.append(f.getName() + "=" + value);
        }
        if (i < fields.length - 1) {
          buffer.append(',');
        }
      }
    } catch (IllegalAccessException e) {
      // this should never happen, because we're just accessing fields
      // from 'this'
      throw new RuntimeException(e);
    }

    return buffer.toString();
  }

  /**
   * Subclasses must implement this method and should compute
   * a hashCode similar to this:
   * <pre>
   *   public int hashCode() {
   *     int code = startOffset;
   *     code = code * 31 + endOffset;
   *     return code;
   *   }
   * </pre>
   *
   * see also {@link #equals(Object)}
   */
  public abstract int hashCode();

  /**
   * All values used for computation of {@link #hashCode()}
   * should be checked here for equality.
   *
   * see also {@link Object#equals(Object)}
   */
  public abstract boolean equals(Object other);

  /**
   * Copies the values from this Attribute into the passed-in
   * target attribute. The type of the target must match the type
   * of this attribute.
   */
  public abstract void copyTo(AttributeImpl target);

  /**
   * Shallow clone. Subclasses must override this if they
   * need to clone any members deeply.
   */
  public Object clone() {
    Object clone = null;
    try {
      clone = super.clone();
    } catch (CloneNotSupportedException e) {
      throw new RuntimeException(e); // shouldn't happen
    }
    return clone;
  }
}

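Under this split, a user-defined attribute is an interface extending Attribute
plus a class extending AttributeImpl whose name ends in "Impl" -- the suffix
the default factory looks up. A minimal hypothetical pair (PartOfSpeech is an
invented example, not part of this patch):

  public interface PartOfSpeechAttribute extends Attribute {
    public void setPartOfSpeech(String pos);
    public String getPartOfSpeech();
  }

  public class PartOfSpeechAttributeImpl extends AttributeImpl
      implements PartOfSpeechAttribute {
    private String pos;
    public void setPartOfSpeech(String pos) { this.pos = pos; }
    public String getPartOfSpeech() { return pos; }
    public void clear() { pos = null; }
    public boolean equals(Object other) {
      if (!(other instanceof PartOfSpeechAttributeImpl)) return false;
      String otherPos = ((PartOfSpeechAttributeImpl) other).pos;
      return pos == null ? otherPos == null : pos.equals(otherPos);
    }
    public int hashCode() { return pos == null ? 0 : pos.hashCode(); }
    public void copyTo(AttributeImpl target) {
      ((PartOfSpeechAttribute) target).setPartOfSpeech(pos);
    }
  }
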
@@ -18,14 +18,17 @@ package org.apache.lucene.util;
 */

import java.util.Iterator;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.IdentityHashMap;
import java.util.LinkedList;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.lucene.analysis.TokenStream;

import org.apache.lucene.analysis.TokenStream; // for javadocs

/**
 * An AttributeSource contains a list of different {@link Attribute}s,
 * An AttributeSource contains a list of different {@link AttributeImpl}s,
 * and methods to add and get them. There can only be a single instance
 * of an attribute in the same AttributeSource instance. This is ensured
 * by passing in the actual type of the Attribute (Class<Attribute>) to

@@ -40,43 +43,147 @@ import org.apache.lucene.analysis.TokenStream;
 */
public class AttributeSource {
  /**
   * An AttributeAcceptor defines only a single method {@link #accept(Class)}.
   * It can be used for e. g. buffering purposes to specify which attributes
   * to buffer.
   * An AttributeFactory creates instances of {@link AttributeImpl}s.
   */
  public static abstract class AttributeAcceptor {
    /** Return true, to accept this attribute; false otherwise */
    public abstract boolean accept(Class attClass);
  public static abstract class AttributeFactory {
    /**
     * returns an {@link AttributeImpl} for the supplied {@link Attribute} interface class.
     */
    public abstract AttributeImpl createAttributeInstance(Class attClass);

    /**
     * This is the default factory that creates {@link AttributeImpl}s using the
     * class name of the supplied {@link Attribute} interface class by appending <code>Impl</code> to it.
     */
    public static final AttributeFactory DEFAULT_ATTRIBUTE_FACTORY = new DefaultAttributeFactory();

    private static final class DefaultAttributeFactory extends AttributeFactory {
      private static final IdentityHashMap/*<Class<? extends Attribute>,Class<? extends AttributeImpl>>*/ attClassImplMap = new IdentityHashMap();

      private DefaultAttributeFactory() {}

      public AttributeImpl createAttributeInstance(Class attClass) {
        try {
          return (AttributeImpl) getClassForInterface(attClass).newInstance();
        } catch (InstantiationException e) {
          throw new IllegalArgumentException("Could not instantiate class " + attClass);
        } catch (IllegalAccessException e) {
          throw new IllegalArgumentException("Could not instantiate class " + attClass);
        }
      }

      private static Class getClassForInterface(Class attClass) {
        synchronized(attClassImplMap) {
          Class clazz = (Class) attClassImplMap.get(attClass);
          if (clazz == null) {
            try {
              attClassImplMap.put(attClass, clazz = Class.forName(attClass.getName() + "Impl"));
            } catch (ClassNotFoundException e) {
              throw new IllegalArgumentException("Could not find implementing class for " + attClass.getName());
            }
          }
          return clazz;
        }
      }
    }
  }
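
When the Impl-suffix convention does not fit, a custom factory can be plugged
in; this sketch special-cases one interface and delegates the rest (the
TermAttribute special case is illustrative only):

  AttributeSource.AttributeFactory factory = new AttributeSource.AttributeFactory() {
    public AttributeImpl createAttributeInstance(Class attClass) {
      if (attClass == TermAttribute.class) {
        return new TermAttributeImpl();   // or any custom implementation
      }
      // fall back to the name-based lookup for everything else
      return DEFAULT_ATTRIBUTE_FACTORY.createAttributeInstance(attClass);
    }
  };
  AttributeSource source = new AttributeSource(factory);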

  // These two maps must always be in sync!!!
  // So they are private, final and read-only from the outside (read-only iterators)
  private final Map/*<Class<Attribute>,AttributeImpl>*/ attributes;
  private final Map/*<Class<AttributeImpl>,AttributeImpl>*/ attributeImpls;

  private AttributeFactory factory;

  /**
   * Default AttributeAcceptor that accepts all attributes.
   * An AttributeSource using the default attribute factory {@link AttributeFactory#DEFAULT_ATTRIBUTE_FACTORY}.
   */
  public static final AttributeAcceptor AllAcceptor = new AttributeAcceptor() {
    public boolean accept(Class attClass) {return true;}
  };

  /**
   * Holds the Class<Attribute> -> Attribute mapping
   */
  protected Map attributes;

  public AttributeSource() {
    this.attributes = new LinkedHashMap();
    this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
  }

  /**
   * An AttributeSource that uses the same attributes as the supplied one.
   */
  public AttributeSource(AttributeSource input) {
    if (input == null) {
      throw new IllegalArgumentException("input AttributeSource must not be null");
    }
    this.attributes = input.attributes;
    this.attributeImpls = input.attributeImpls;
    this.factory = input.factory;
  }

  /** Returns an iterator that iterates the attributes
  /**
   * An AttributeSource using the supplied {@link AttributeFactory} for creating new {@link Attribute} instances.
   */
  public AttributeSource(AttributeFactory factory) {
    this.attributes = new LinkedHashMap();
    this.attributeImpls = new LinkedHashMap();
    this.factory = factory;
  }

  /**
   * Returns the used AttributeFactory.
   */
  public AttributeFactory getAttributeFactory() {
    return this.factory;
  }

  /** Returns a new iterator that iterates the attribute classes
   * in the same order they were added in.
   */
  public Iterator getAttributesIterator() {
    return attributes.values().iterator();
  public Iterator/*<Class<? extends Attribute>>*/ getAttributeClassesIterator() {
    return Collections.unmodifiableSet(attributes.keySet()).iterator();
  }

  /** Returns a new iterator that iterates all unique Attribute implementations.
   * This iterator may contain fewer entries than {@link #getAttributeClassesIterator},
   * if one instance implements more than one Attribute interface.
   */
  public Iterator/*<AttributeImpl>*/ getAttributeImplsIterator() {
    return Collections.unmodifiableCollection(attributeImpls.values()).iterator();
  }

  /** a cache that stores all interfaces for known implementation classes for performance (slow reflection) */
  private static final IdentityHashMap/*<Class<? extends AttributeImpl>,LinkedList<Class<? extends Attribute>>>*/ knownImplClasses = new IdentityHashMap();

  /** Adds a custom AttributeImpl instance with one or more Attribute interfaces. */
  public void addAttributeImpl(final AttributeImpl att) {
    final Class clazz = att.getClass();
    if (attributeImpls.containsKey(clazz)) return;
    LinkedList foundInterfaces;
    synchronized(knownImplClasses) {
      foundInterfaces = (LinkedList) knownImplClasses.get(clazz);
      if (foundInterfaces == null) {
        knownImplClasses.put(clazz, foundInterfaces=new LinkedList());
        // find all interfaces that this attribute instance implements
        // and that extend the Attribute interface
        Class actClazz = clazz;
        do {
          Class[] interfaces = actClazz.getInterfaces();
          for (int i = 0; i < interfaces.length; i++) {
            final Class curInterface = interfaces[i];
            if (Attribute.class.isAssignableFrom(curInterface)) {
              foundInterfaces.add(curInterface);
            }
          }
          actClazz = actClazz.getSuperclass();
        } while (actClazz != null);
      }
    }

    // add all interfaces of this AttributeImpl to the maps
    for (Iterator it = foundInterfaces.iterator(); it.hasNext(); ) {
      final Class curInterface = (Class) it.next();
      // Attribute is a superclass of this interface
      if (!attributes.containsKey(curInterface)) {
        // invalidate state to force recomputation in captureState()
        this.currentState = null;
        attributes.put(curInterface, att);
        attributeImpls.put(clazz, att);
      }
    }
  }

  /**

@@ -85,18 +192,11 @@ public class AttributeSource {
   * already in this AttributeSource and returns it. Otherwise a
   * new instance is created, added to this AttributeSource and returned.
   */
  public Attribute addAttribute(Class attClass) {
    Attribute att = (Attribute) attributes.get(attClass);
  public AttributeImpl addAttribute(Class attClass) {
    AttributeImpl att = (AttributeImpl) attributes.get(attClass);
    if (att == null) {
      try {
        att = (Attribute) attClass.newInstance();
      } catch (InstantiationException e) {
        throw new IllegalArgumentException("Could not instantiate class " + attClass);
      } catch (IllegalAccessException e) {
        throw new IllegalArgumentException("Could not instantiate class " + attClass);
      }

      attributes.put(attClass, att);
      att = this.factory.createAttributeInstance(attClass);
      addAttributeImpl(att);
    }
    return att;
  }

@@ -121,10 +221,10 @@ public class AttributeSource {
   * @throws IllegalArgumentException if this AttributeSource does not contain the
   *         Attribute
   */
  public Attribute getAttribute(Class attClass) {
    Attribute att = (Attribute) this.attributes.get(attClass);
  public AttributeImpl getAttribute(Class attClass) {
    AttributeImpl att = (AttributeImpl) this.attributes.get(attClass);
    if (att == null) {
      throw new IllegalArgumentException("This token does not have the attribute '" + attClass + "'.");
      throw new IllegalArgumentException("This AttributeSource does not have the attribute '" + attClass + "'.");
    }

    return att;

@@ -132,52 +232,72 @@ public class AttributeSource {

  /**
   * Resets all Attributes in this AttributeSource by calling
   * {@link Attribute#clear()} on each Attribute.
   * {@link AttributeImpl#clear()} on each Attribute implementation.
   */
  public void clearAttributes() {
    Iterator it = getAttributesIterator();
    Iterator it = getAttributeImplsIterator();
    while (it.hasNext()) {
      ((Attribute) it.next()).clear();
      ((AttributeImpl) it.next()).clear();
    }
  }

  /**
   * Captures the current state of the passed in TokenStream.
   * <p>
   * This state will contain all of the passed in TokenStream's
   * {@link Attribute}s. If only a subset of the attributes is needed
   * please use {@link #captureState(AttributeAcceptor)}
   * This class holds the state of an AttributeSource.
   * @see #captureState
   * @see #restoreState
   */
  public AttributeSource captureState() {
    return captureState(AllAcceptor);
  }

  /**
   * Captures the current state of the passed in TokenStream.
   * <p>
   * This state will contain all of the passed in TokenStream's
   * {@link Attribute}s which the {@link AttributeAcceptor} accepts.
   */
  public AttributeSource captureState(AttributeAcceptor acceptor) {
    AttributeSource state = new AttributeSource();

    Iterator it = getAttributesIterator();
    while(it.hasNext()) {
      Attribute att = (Attribute) it.next();
      if (acceptor.accept(att.getClass())) {
        Attribute clone = (Attribute) att.clone();
        state.attributes.put(att.getClass(), clone);
      }
    }
  public static final class State implements Cloneable {
    private AttributeImpl attribute;
    private State next;

    return state;
    public Object clone() {
      State clone = new State();
      clone.attribute = (AttributeImpl) attribute.clone();

      if (next != null) {
        clone.next = (State) next.clone();
      }

      return clone;
    }
  }

  private State currentState = null;

  private void computeCurrentState() {
    currentState = new State();
    State c = currentState;
    Iterator it = getAttributeImplsIterator();
    c.attribute = (AttributeImpl) it.next();
    while (it.hasNext()) {
      c.next = new State();
      c = c.next;
      c.attribute = (AttributeImpl) it.next();
    }
  }

  /**
   * Restores this state by copying the values of all attributes
   * that this state contains into the attributes of the targetStream.
   * Captures the state of all Attributes. The return value can be passed to
   * {@link #restoreState} to restore the state of this or another AttributeSource.
   */
  public State captureState() {
    if (!hasAttributes()) {
      return null;
    }

    if (currentState == null) {
      computeCurrentState();
    }
    return (State) this.currentState.clone();
  }

  /**
   * Restores this state by copying the values of all attribute implementations
   * that this state contains into the attribute implementations of the targetStream.
   * The targetStream must contain a corresponding instance for each argument
   * contained in this state.
   * contained in this state (e.g. it is not possible to restore the state of
   * an AttributeSource containing a TermAttribute into an AttributeSource using
   * a Token instance as implementation).
   * <p>
   * Note that this method does not affect attributes of the targetStream
   * that are not contained in this state. In other words, if for example

@@ -186,19 +306,22 @@ public class AttributeSource {
   * reset its value to the default, in which case the caller should first
   * call {@link TokenStream#clearAttributes()} on the targetStream.
   */
  public void restoreState(AttributeSource target) {
    Iterator it = getAttributesIterator();
    while (it.hasNext()) {
      Attribute att = (Attribute) it.next();
      Attribute targetAtt = target.getAttribute(att.getClass());
      att.copyTo(targetAtt);
    }
  public void restoreState(State state) {
    if (state == null) return;

    do {
      AttributeImpl targetImpl = (AttributeImpl) attributeImpls.get(state.attribute.getClass());
      if (targetImpl == null)
        throw new IllegalArgumentException("State contains an AttributeImpl that is not in this AttributeSource");
      state.attribute.copyTo(targetImpl);
      state = state.next;
    } while (state != null);
  }

  public int hashCode() {
    int code = 0;
    if (hasAttributes()) {
      Iterator it = getAttributesIterator();
      Iterator it = getAttributeImplsIterator();
      while (it.hasNext()) {
        code = code * 31 + it.next().hashCode();
      }

@@ -220,16 +343,17 @@ public class AttributeSource {
      return false;
    }

    if (attributes.size() != other.attributes.size()) {
    if (this.attributeImpls.size() != other.attributeImpls.size()) {
      return false;
    }

    Iterator it = getAttributesIterator();
    while (it.hasNext()) {
      Class attName = it.next().getClass();

      Attribute otherAtt = (Attribute) other.attributes.get(attName);
      if (otherAtt == null || !otherAtt.equals(attributes.get(attName))) {
    // it is only equal if all attribute impls are the same in the same order
    Iterator thisIt = this.getAttributeImplsIterator();
    Iterator otherIt = other.getAttributeImplsIterator();
    while (thisIt.hasNext() && otherIt.hasNext()) {
      AttributeImpl thisAtt = (AttributeImpl) thisIt.next();
      AttributeImpl otherAtt = (AttributeImpl) otherIt.next();
      if (otherAtt.getClass() != thisAtt.getClass() || !otherAtt.equals(thisAtt)) {
        return false;
      }
    }

@@ -240,38 +364,48 @@ public class AttributeSource {
    } else
      return false;
  }

  // TODO: Java 1.5
  // private Map<Class<? extends Attribute>, Attribute> attributes;
  // public <T extends Attribute> T addAttribute(Class<T> attClass) {
  //   T att = (T) attributes.get(attClass);
  //   if (att == null) {
  //     try {
  //       att = attClass.newInstance();
  //     } catch (InstantiationException e) {
  //       throw new IllegalArgumentException("Could not instantiate class " + attClass);
  //     } catch (IllegalAccessException e) {
  //       throw new IllegalArgumentException("Could not instantiate class " + attClass);
  //     }
  //
  //     attributes.put(attClass, att);
  //   }
  //   return att;
  // }
  //
  // public boolean hasAttribute(Class<? extends Attribute> attClass) {
  //   return this.attributes.containsKey(attClass);
  // }
  //
  // public <T extends Attribute> T getAttribute(Class<T> attClass) {
  //   Attribute att = this.attributes.get(attClass);
  //   if (att == null) {
  //     throw new IllegalArgumentException("This token does not have the attribute '" + attClass + "'.");
  //   }
  //
  //   return (T) att;
  // }
  //
  public String toString() {
    StringBuffer sb = new StringBuffer();
    sb.append('(');

    if (hasAttributes()) {
      Iterator it = getAttributeImplsIterator();
      if (it.hasNext()) {
        sb.append(it.next().toString());
      }
      while (it.hasNext()) {
        sb.append(',');
        sb.append(it.next().toString());
      }
    }
    sb.append(')');
    return sb.toString();
  }

  /**
   * Performs a clone of all {@link AttributeImpl} instances returned in a new
   * AttributeSource instance. This method can be used to e.g. create another TokenStream
   * with exactly the same attributes (using {@link #AttributeSource(AttributeSource)})
   */
  public AttributeSource cloneAttributes() {
    AttributeSource clone = new AttributeSource(this.factory);

    // first clone the impls
    Iterator/*<AttributeImpl>*/ implIt = getAttributeImplsIterator();
    while (implIt.hasNext()) {
      AttributeImpl impl = (AttributeImpl) implIt.next();
      clone.attributeImpls.put(impl.getClass(), impl.clone());
    }

    // now the interfaces
    Iterator/*<Entry<Class<Attribute>, AttributeImpl>>*/ attIt = this.attributes.entrySet().iterator();
    while (attIt.hasNext()) {
      Entry/*<Class<Attribute>, AttributeImpl>*/ entry = (Entry/*<Class<Attribute>, AttributeImpl>*/) attIt.next();
      clone.attributes.put(entry.getKey(), clone.attributeImpls.get(entry.getValue().getClass()));
    }

    return clone;
  }

}
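
captureState()/restoreState() are the replacement for cloning a whole
AttributeSource per token; a capture is a single clone of the precomputed
State chain. Typical buffering usage inside a filter (sketch; cache is a
placeholder name):

  // Save the full attribute state of every token, then replay it later.
  List cache = new ArrayList();
  while (input.incrementToken()) {
    cache.add(captureState());                        // deep copy of all impls
  }
  for (Iterator it = cache.iterator(); it.hasNext(); ) {
    restoreState((AttributeSource.State) it.next());  // copy values back
    // ...emit the restored token...
  }
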
|
@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import java.io.StringReader;
|
||||
|
@@ -34,84 +35,84 @@ public class TestASCIIFoldingFilter extends LuceneTestCase {
      +" ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl"));
    ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream);

    final Token reusableToken = new Token();
    TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);

    assertEquals("Des", filter.next(reusableToken).term());
    assertEquals("mot", filter.next(reusableToken).term());
    assertEquals("cles", filter.next(reusableToken).term());
    assertEquals("A", filter.next(reusableToken).term());
    assertEquals("LA", filter.next(reusableToken).term());
    assertEquals("CHAINE", filter.next(reusableToken).term());
    assertEquals("A", filter.next(reusableToken).term());
    assertEquals("A", filter.next(reusableToken).term());
    assertEquals("A", filter.next(reusableToken).term());
    assertEquals("A", filter.next(reusableToken).term());
    assertEquals("A", filter.next(reusableToken).term());
    assertEquals("A", filter.next(reusableToken).term());
    assertEquals("AE", filter.next(reusableToken).term());
    assertEquals("C", filter.next(reusableToken).term());
    assertEquals("E", filter.next(reusableToken).term());
    assertEquals("E", filter.next(reusableToken).term());
    assertEquals("E", filter.next(reusableToken).term());
    assertEquals("E", filter.next(reusableToken).term());
    assertEquals("I", filter.next(reusableToken).term());
    assertEquals("I", filter.next(reusableToken).term());
    assertEquals("I", filter.next(reusableToken).term());
    assertEquals("I", filter.next(reusableToken).term());
    assertEquals("IJ", filter.next(reusableToken).term());
    assertEquals("D", filter.next(reusableToken).term());
    assertEquals("N", filter.next(reusableToken).term());
    assertEquals("O", filter.next(reusableToken).term());
    assertEquals("O", filter.next(reusableToken).term());
    assertEquals("O", filter.next(reusableToken).term());
    assertEquals("O", filter.next(reusableToken).term());
    assertEquals("O", filter.next(reusableToken).term());
    assertEquals("O", filter.next(reusableToken).term());
    assertEquals("OE", filter.next(reusableToken).term());
    assertEquals("TH", filter.next(reusableToken).term());
    assertEquals("U", filter.next(reusableToken).term());
    assertEquals("U", filter.next(reusableToken).term());
    assertEquals("U", filter.next(reusableToken).term());
    assertEquals("U", filter.next(reusableToken).term());
    assertEquals("Y", filter.next(reusableToken).term());
    assertEquals("Y", filter.next(reusableToken).term());
    assertEquals("a", filter.next(reusableToken).term());
    assertEquals("a", filter.next(reusableToken).term());
    assertEquals("a", filter.next(reusableToken).term());
    assertEquals("a", filter.next(reusableToken).term());
    assertEquals("a", filter.next(reusableToken).term());
    assertEquals("a", filter.next(reusableToken).term());
    assertEquals("ae", filter.next(reusableToken).term());
    assertEquals("c", filter.next(reusableToken).term());
    assertEquals("e", filter.next(reusableToken).term());
    assertEquals("e", filter.next(reusableToken).term());
    assertEquals("e", filter.next(reusableToken).term());
    assertEquals("e", filter.next(reusableToken).term());
    assertEquals("i", filter.next(reusableToken).term());
    assertEquals("i", filter.next(reusableToken).term());
    assertEquals("i", filter.next(reusableToken).term());
    assertEquals("i", filter.next(reusableToken).term());
    assertEquals("ij", filter.next(reusableToken).term());
    assertEquals("d", filter.next(reusableToken).term());
    assertEquals("n", filter.next(reusableToken).term());
    assertEquals("o", filter.next(reusableToken).term());
    assertEquals("o", filter.next(reusableToken).term());
    assertEquals("o", filter.next(reusableToken).term());
    assertEquals("o", filter.next(reusableToken).term());
    assertEquals("o", filter.next(reusableToken).term());
    assertEquals("o", filter.next(reusableToken).term());
    assertEquals("oe", filter.next(reusableToken).term());
    assertEquals("ss", filter.next(reusableToken).term());
    assertEquals("th", filter.next(reusableToken).term());
    assertEquals("u", filter.next(reusableToken).term());
    assertEquals("u", filter.next(reusableToken).term());
    assertEquals("u", filter.next(reusableToken).term());
    assertEquals("u", filter.next(reusableToken).term());
    assertEquals("y", filter.next(reusableToken).term());
    assertEquals("y", filter.next(reusableToken).term());
    assertEquals("fi", filter.next(reusableToken).term());
    assertEquals("fl", filter.next(reusableToken).term());
    assertNull(filter.next(reusableToken));
    assertTermEquals("Des", filter, termAtt);
    assertTermEquals("mot", filter, termAtt);
    assertTermEquals("cles", filter, termAtt);
    assertTermEquals("A", filter, termAtt);
    assertTermEquals("LA", filter, termAtt);
    assertTermEquals("CHAINE", filter, termAtt);
    assertTermEquals("A", filter, termAtt);
    assertTermEquals("A", filter, termAtt);
    assertTermEquals("A", filter, termAtt);
    assertTermEquals("A", filter, termAtt);
    assertTermEquals("A", filter, termAtt);
    assertTermEquals("A", filter, termAtt);
    assertTermEquals("AE", filter, termAtt);
    assertTermEquals("C", filter, termAtt);
    assertTermEquals("E", filter, termAtt);
    assertTermEquals("E", filter, termAtt);
    assertTermEquals("E", filter, termAtt);
    assertTermEquals("E", filter, termAtt);
    assertTermEquals("I", filter, termAtt);
    assertTermEquals("I", filter, termAtt);
    assertTermEquals("I", filter, termAtt);
    assertTermEquals("I", filter, termAtt);
    assertTermEquals("IJ", filter, termAtt);
    assertTermEquals("D", filter, termAtt);
    assertTermEquals("N", filter, termAtt);
    assertTermEquals("O", filter, termAtt);
    assertTermEquals("O", filter, termAtt);
    assertTermEquals("O", filter, termAtt);
    assertTermEquals("O", filter, termAtt);
    assertTermEquals("O", filter, termAtt);
    assertTermEquals("O", filter, termAtt);
    assertTermEquals("OE", filter, termAtt);
    assertTermEquals("TH", filter, termAtt);
    assertTermEquals("U", filter, termAtt);
    assertTermEquals("U", filter, termAtt);
    assertTermEquals("U", filter, termAtt);
    assertTermEquals("U", filter, termAtt);
    assertTermEquals("Y", filter, termAtt);
    assertTermEquals("Y", filter, termAtt);
    assertTermEquals("a", filter, termAtt);
    assertTermEquals("a", filter, termAtt);
    assertTermEquals("a", filter, termAtt);
    assertTermEquals("a", filter, termAtt);
    assertTermEquals("a", filter, termAtt);
    assertTermEquals("a", filter, termAtt);
    assertTermEquals("ae", filter, termAtt);
    assertTermEquals("c", filter, termAtt);
    assertTermEquals("e", filter, termAtt);
    assertTermEquals("e", filter, termAtt);
    assertTermEquals("e", filter, termAtt);
    assertTermEquals("e", filter, termAtt);
    assertTermEquals("i", filter, termAtt);
    assertTermEquals("i", filter, termAtt);
    assertTermEquals("i", filter, termAtt);
    assertTermEquals("i", filter, termAtt);
    assertTermEquals("ij", filter, termAtt);
    assertTermEquals("d", filter, termAtt);
    assertTermEquals("n", filter, termAtt);
    assertTermEquals("o", filter, termAtt);
    assertTermEquals("o", filter, termAtt);
    assertTermEquals("o", filter, termAtt);
    assertTermEquals("o", filter, termAtt);
    assertTermEquals("o", filter, termAtt);
    assertTermEquals("o", filter, termAtt);
    assertTermEquals("oe", filter, termAtt);
    assertTermEquals("ss", filter, termAtt);
    assertTermEquals("th", filter, termAtt);
    assertTermEquals("u", filter, termAtt);
    assertTermEquals("u", filter, termAtt);
    assertTermEquals("u", filter, termAtt);
    assertTermEquals("u", filter, termAtt);
    assertTermEquals("y", filter, termAtt);
    assertTermEquals("y", filter, termAtt);
    assertTermEquals("fi", filter, termAtt);
    assertTermEquals("fl", filter, termAtt);
    assertFalse(filter.incrementToken());
  }

@@ -1891,11 +1892,16 @@ public class TestASCIIFoldingFilter extends LuceneTestCase {

    TokenStream stream = new WhitespaceTokenizer(new StringReader(inputText.toString()));
    ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream);
    final Token reusableToken = new Token();
    TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
    Iterator expectedIter = expectedOutputTokens.iterator();
    while (expectedIter.hasNext()) {
      assertEquals(expectedIter.next(), filter.next(reusableToken).term());
      assertTermEquals((String)expectedIter.next(), filter, termAtt);
    }
    assertNull(filter.next(reusableToken));
    assertFalse(filter.incrementToken());
  }

  void assertTermEquals(String expected, TokenStream stream, TermAttribute termAtt) throws Exception {
    assertTrue(stream.incrementToken());
    assertEquals(expected, termAtt.term());
  }
}
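For illustration, the migration above boils down to one consumption pattern: instead of pulling a reusable Token out of next(Token), the caller obtains the stream's attribute instances once and lets incrementToken() advance them in place. A minimal sketch against the 2.9-era classes shown above (not part of the commit):

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class NewApiConsumerSketch {
  public static void main(String[] args) throws Exception {
    TokenStream stream = new WhitespaceTokenizer(new StringReader("Des mots clés"));
    // addAttribute() returns the stream's single TermAttribute instance;
    // its contents are overwritten by every successful incrementToken() call.
    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
    while (stream.incrementToken()) {
      System.out.println(termAtt.term());
    }
  }
}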
@@ -27,9 +27,8 @@ public class TestNumericTokenStream extends LuceneTestCase {
  static final long lvalue = 4573245871874382L;
  static final int ivalue = 123456;

  public void testLongStreamNewAPI() throws Exception {
  public void testLongStream() throws Exception {
    final NumericTokenStream stream=new NumericTokenStream().setLongValue(lvalue);
    stream.setUseNewAPI(true);
    // use getAttribute to test if attributes really exist; if not, an IAE will be thrown
    final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
    final TypeAttribute typeAtt = (TypeAttribute) stream.getAttribute(TypeAttribute.class);
@@ -40,22 +39,9 @@ public class TestNumericTokenStream extends LuceneTestCase {
    }
    assertFalse("No more tokens available", stream.incrementToken());
  }

  public void testLongStreamOldAPI() throws Exception {
    final NumericTokenStream stream=new NumericTokenStream().setLongValue(lvalue);
    stream.setUseNewAPI(false);
    Token tok=new Token();
    for (int shift=0; shift<64; shift+=NumericUtils.PRECISION_STEP_DEFAULT) {
      assertNotNull("New token is available", tok=stream.next(tok));
      assertEquals("Term is correctly encoded", NumericUtils.longToPrefixCoded(lvalue, shift), tok.term());
      assertEquals("Type correct", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, tok.type());
    }
    assertNull("No more tokens available", stream.next(tok));
  }

  public void testIntStreamNewAPI() throws Exception {
  public void testIntStream() throws Exception {
    final NumericTokenStream stream=new NumericTokenStream().setIntValue(ivalue);
    stream.setUseNewAPI(true);
    // use getAttribute to test if attributes really exist; if not, an IAE will be thrown
    final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
    final TypeAttribute typeAtt = (TypeAttribute) stream.getAttribute(TypeAttribute.class);
@@ -67,18 +53,6 @@ public class TestNumericTokenStream extends LuceneTestCase {
    assertFalse("No more tokens available", stream.incrementToken());
  }

  public void testIntStreamOldAPI() throws Exception {
    final NumericTokenStream stream=new NumericTokenStream().setIntValue(ivalue);
    stream.setUseNewAPI(false);
    Token tok=new Token();
    for (int shift=0; shift<32; shift+=NumericUtils.PRECISION_STEP_DEFAULT) {
      assertNotNull("New token is available", tok=stream.next(tok));
      assertEquals("Term is correctly encoded", NumericUtils.intToPrefixCoded(ivalue, shift), tok.term());
      assertEquals("Type correct", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, tok.type());
    }
    assertNull("No more tokens available", stream.next(tok));
  }

  public void testNotInitialized() throws Exception {
    final NumericTokenStream stream=new NumericTokenStream();

@@ -89,21 +63,12 @@ public class TestNumericTokenStream extends LuceneTestCase {
      // pass
    }

    stream.setUseNewAPI(true);
    try {
      stream.incrementToken();
      fail("incrementToken() should not succeed.");
    } catch (IllegalStateException e) {
      // pass
    }

    stream.setUseNewAPI(false);
    try {
      stream.next(new Token());
      fail("next() should not succeed.");
    } catch (IllegalStateException e) {
      // pass
    }
  }

}
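The "IAE will be thrown" comment above relies on the difference between the two attribute lookups: addAttribute() creates and registers the attribute on demand, while getAttribute() fails fast with an IllegalArgumentException if the attribute was never registered. A hedged sketch (not part of the commit; the PayloadAttribute lookup is only an example of an attribute this stream presumably does not register):

import org.apache.lucene.analysis.NumericTokenStream;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class AttributeLookupSketch {
  public static void main(String[] args) {
    NumericTokenStream stream = new NumericTokenStream().setLongValue(42L);
    // The stream registers TermAttribute itself, so getAttribute() succeeds:
    TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
    try {
      stream.getAttribute(PayloadAttribute.class); // never added -> IAE
    } catch (IllegalArgumentException expected) {
      // addAttribute(PayloadAttribute.class) would have created it instead
    }
  }
}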
@@ -0,0 +1,267 @@
package org.apache.lucene.analysis;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.English;
import org.apache.lucene.util.LuceneTestCase;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

/**
 * Tests for TeeSinkTokenFilter
 */
public class TestTeeSinkTokenFilter extends LuceneTestCase {
  protected StringBuffer buffer1;
  protected StringBuffer buffer2;
  protected String[] tokens1;
  protected String[] tokens2;

  public TestTeeSinkTokenFilter(String s) {
    super(s);
  }

  protected void setUp() throws Exception {
    super.setUp();
    tokens1 = new String[]{"The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs"};
    tokens2 = new String[]{"The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch"};
    buffer1 = new StringBuffer();

    for (int i = 0; i < tokens1.length; i++) {
      buffer1.append(tokens1[i]).append(' ');
    }
    buffer2 = new StringBuffer();
    for (int i = 0; i < tokens2.length; i++) {
      buffer2.append(tokens2[i]).append(' ');
    }
  }

  static final TeeSinkTokenFilter.SinkFilter theFilter = new TeeSinkTokenFilter.SinkFilter() {
    public boolean accept(AttributeSource a) {
      TermAttribute termAtt = (TermAttribute) a.getAttribute(TermAttribute.class);
      return termAtt.term().equalsIgnoreCase("The");
    }
  };

  static final TeeSinkTokenFilter.SinkFilter dogFilter = new TeeSinkTokenFilter.SinkFilter() {
    public boolean accept(AttributeSource a) {
      TermAttribute termAtt = (TermAttribute) a.getAttribute(TermAttribute.class);
      return termAtt.term().equalsIgnoreCase("Dogs");
    }
  };

  public void testGeneral() throws IOException {
    final TeeSinkTokenFilter source = new TeeSinkTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())));
    final TokenStream sink1 = source.newSinkTokenStream();
    final TokenStream sink2 = source.newSinkTokenStream(theFilter);
    int i = 0;
    TermAttribute termAtt = (TermAttribute) source.getAttribute(TermAttribute.class);
    while (source.incrementToken()) {
      assertEquals(tokens1[i], termAtt.term());
      i++;
    }
    assertEquals(tokens1.length, i);

    i = 0;
    termAtt = (TermAttribute) sink1.getAttribute(TermAttribute.class);
    while (sink1.incrementToken()) {
      assertEquals(tokens1[i], termAtt.term());
      i++;
    }
    assertEquals(tokens1.length, i);

    i = 0;
    termAtt = (TermAttribute) sink2.getAttribute(TermAttribute.class);
    while (sink2.incrementToken()) {
      assertTrue(termAtt.term().equalsIgnoreCase("The"));
      i++;
    }
    assertEquals("there should be two times 'the' in the stream", 2, i);
  }

  public void testMultipleSources() throws Exception {
    final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())));
    final TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
    final TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
    final TokenStream source1 = new CachingTokenFilter(tee1);

    final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new StringReader(buffer2.toString())));
    tee2.addSinkTokenStream(dogDetector);
    tee2.addSinkTokenStream(theDetector);
    final TokenStream source2 = tee2;

    int i = 0;
    TermAttribute termAtt = (TermAttribute) source1.getAttribute(TermAttribute.class);
    while (source1.incrementToken()) {
      assertEquals(tokens1[i], termAtt.term());
      i++;
    }
    assertEquals(tokens1.length, i);
    i = 0;
    termAtt = (TermAttribute) source2.getAttribute(TermAttribute.class);
    while (source2.incrementToken()) {
      assertEquals(tokens2[i], termAtt.term());
      i++;
    }
    assertEquals(tokens2.length, i);
    i = 0;
    termAtt = (TermAttribute) theDetector.getAttribute(TermAttribute.class);
    while (theDetector.incrementToken()) {
      assertTrue("'" + termAtt.term() + "' is not equal to 'The'", termAtt.term().equalsIgnoreCase("The"));
      i++;
    }
    assertEquals("there must be 4 times 'The' in the stream", 4, i);
    i = 0;
    termAtt = (TermAttribute) dogDetector.getAttribute(TermAttribute.class);
    while (dogDetector.incrementToken()) {
      assertTrue("'" + termAtt.term() + "' is not equal to 'Dogs'", termAtt.term().equalsIgnoreCase("Dogs"));
      i++;
    }
    assertEquals("there must be 2 times 'Dogs' in the stream", 2, i);

    source1.reset();
    TokenStream lowerCasing = new LowerCaseFilter(source1);
    i = 0;
    termAtt = (TermAttribute) lowerCasing.getAttribute(TermAttribute.class);
    while (lowerCasing.incrementToken()) {
      assertEquals(tokens1[i].toLowerCase(), termAtt.term());
      i++;
    }
    assertEquals(i, tokens1.length);
  }

  /**
   * Not an explicit test, just useful to print out some info on performance
   *
   * @throws Exception
   */
  public void performance() throws Exception {
    int[] tokCount = {100, 500, 1000, 2000, 5000, 10000};
    int[] modCounts = {1, 2, 5, 10, 20, 50, 100, 200, 500};
    for (int k = 0; k < tokCount.length; k++) {
      StringBuffer buffer = new StringBuffer();
      System.out.println("-----Tokens: " + tokCount[k] + "-----");
      for (int i = 0; i < tokCount[k]; i++) {
        buffer.append(English.intToEnglish(i).toUpperCase()).append(' ');
      }
      //make sure we produce the same tokens
      TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))));
      TokenStream sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(100));
      teeStream.consumeAllTokens();
      TokenStream stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), 100);
      TermAttribute tfTok = (TermAttribute) stream.addAttribute(TermAttribute.class);
      TermAttribute sinkTok = (TermAttribute) sink.addAttribute(TermAttribute.class);
      for (int i=0; stream.incrementToken(); i++) {
        assertTrue(sink.incrementToken());
        assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.equals(sinkTok) == true);
      }

      //simulate two fields, each being analyzed once, for 20 documents
      for (int j = 0; j < modCounts.length; j++) {
        int tfPos = 0;
        long start = System.currentTimeMillis();
        for (int i = 0; i < 20; i++) {
          stream = new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString())));
          PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
          while (stream.incrementToken()) {
            tfPos += posIncrAtt.getPositionIncrement();
          }
          stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), modCounts[j]);
          posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
          while (stream.incrementToken()) {
            tfPos += posIncrAtt.getPositionIncrement();
          }
        }
        long finish = System.currentTimeMillis();
        System.out.println("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
        int sinkPos = 0;
        //simulate one field with one sink
        start = System.currentTimeMillis();
        for (int i = 0; i < 20; i++) {
          teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))));
          sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(modCounts[j]));
          PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) teeStream.getAttribute(PositionIncrementAttribute.class);
          while (teeStream.incrementToken()) {
            sinkPos += posIncrAtt.getPositionIncrement();
          }
          //System.out.println("Modulo--------");
          posIncrAtt = (PositionIncrementAttribute) sink.getAttribute(PositionIncrementAttribute.class);
          while (sink.incrementToken()) {
            sinkPos += posIncrAtt.getPositionIncrement();
          }
        }
        finish = System.currentTimeMillis();
        System.out.println("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
        assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos);

      }
      System.out.println("- End Tokens: " + tokCount[k] + "-----");
    }

  }

  class ModuloTokenFilter extends TokenFilter {

    int modCount;

    ModuloTokenFilter(TokenStream input, int mc) {
      super(input);
      modCount = mc;
    }

    int count = 0;

    //return only every modCount'th token
    public boolean incrementToken() throws IOException {
      boolean hasNext;
      for (hasNext = input.incrementToken();
           hasNext && count % modCount != 0;
           hasNext = input.incrementToken()) {
        count++;
      }
      count++;
      return hasNext;
    }
  }

  class ModuloSinkFilter implements TeeSinkTokenFilter.SinkFilter {
    int count = 0;
    int modCount;

    ModuloSinkFilter(int mc) {
      modCount = mc;
    }

    public boolean accept(AttributeSource a) {
      boolean b = (a != null && count % modCount == 0);
      count++;
      return b;
    }

  }
}
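The new TeeSinkTokenFilter tested above replaces the deprecated Tee-/SinkTokenizer pair: the tee is consumed once, and every sink it created replays the attribute states it accepted. A minimal usage sketch (not part of the commit):

import java.io.StringReader;
import org.apache.lucene.analysis.TeeSinkTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class TeeSinkSketch {
  public static void main(String[] args) throws Exception {
    TeeSinkTokenFilter tee =
        new TeeSinkTokenFilter(new WhitespaceTokenizer(new StringReader("The quick lazy Dogs")));
    TokenStream sink = tee.newSinkTokenStream(); // unfiltered sink: sees every token
    tee.consumeAllTokens();                      // drain the source into all sinks
    TermAttribute termAtt = (TermAttribute) sink.getAttribute(TermAttribute.class);
    while (sink.incrementToken()) {
      System.out.println(termAtt.term());
    }
  }
}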
@@ -18,9 +18,6 @@ package org.apache.lucene.analysis;

import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.English;
import org.apache.lucene.util.LuceneTestCase;

@@ -43,8 +40,7 @@ public class TestTeeTokenFilter extends LuceneTestCase {
    super(s);
  }

  protected void setUp() throws Exception {
    super.setUp();
  protected void setUp() {
    tokens1 = new String[]{"The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs"};
    tokens2 = new String[]{"The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch"};
    buffer1 = new StringBuffer();
@@ -66,29 +62,24 @@ public class TestTeeTokenFilter extends LuceneTestCase {
  public void test() throws IOException {

    SinkTokenizer sink1 = new SinkTokenizer(null) {
      public void add(AttributeSource a) throws IOException {
        TermAttribute termAtt = null;
        if (a.hasAttribute(TermAttribute.class)) {
          termAtt = (TermAttribute) a.getAttribute(TermAttribute.class);
        }
        if (termAtt != null && termAtt.term().equalsIgnoreCase("The")) {
          super.add(a);
      public void add(Token t) {
        if (t != null && t.term().equalsIgnoreCase("The")) {
          super.add(t);
        }
      }
    };
    TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), sink1);
    int i = 0;
    TermAttribute termAtt = (TermAttribute) source.getAttribute(TermAttribute.class);
    while (source.incrementToken()) {
      assertTrue(termAtt.term() + " is not equal to " + tokens1[i], termAtt.term().equals(tokens1[i]) == true);
    final Token reusableToken = new Token();
    for (Token nextToken = source.next(reusableToken); nextToken != null; nextToken = source.next(reusableToken)) {
      assertTrue(nextToken.term() + " is not equal to " + tokens1[i], nextToken.term().equals(tokens1[i]) == true);
      i++;
    }
    assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
    assertTrue("sink1 Size: " + sink1.getTokens().size() + " is not: " + 2, sink1.getTokens().size() == 2);
    i = 0;
    termAtt = (TermAttribute) sink1.getAttribute(TermAttribute.class);
    while (sink1.incrementToken()) {
      assertTrue(termAtt.term() + " is not equal to " + "The", termAtt.term().equalsIgnoreCase("The") == true);
    for (Token token = sink1.next(reusableToken); token != null; token = sink1.next(reusableToken)) {
      assertTrue(token.term() + " is not equal to " + "The", token.term().equalsIgnoreCase("The") == true);
      i++;
    }
    assertTrue(i + " does not equal: " + sink1.getTokens().size(), i == sink1.getTokens().size());
@@ -96,67 +87,55 @@ public class TestTeeTokenFilter extends LuceneTestCase {

  public void testMultipleSources() throws Exception {
    SinkTokenizer theDetector = new SinkTokenizer(null) {
      public void add(AttributeSource a) throws IOException {
        TermAttribute termAtt = null;
        if (a.hasAttribute(TermAttribute.class)) {
          termAtt = (TermAttribute) a.getAttribute(TermAttribute.class);
        }
        if (termAtt != null && termAtt.term().equalsIgnoreCase("The")) {
          super.add(a);
      public void add(Token t) {
        if (t != null && t.term().equalsIgnoreCase("The")) {
          super.add(t);
        }
      }
    };
    SinkTokenizer dogDetector = new SinkTokenizer(null) {
      public void add(AttributeSource a) throws IOException {
        TermAttribute termAtt = null;
        if (a.hasAttribute(TermAttribute.class)) {
          termAtt = (TermAttribute) a.getAttribute(TermAttribute.class);
        }
        if (termAtt != null && termAtt.term().equalsIgnoreCase("Dogs")) {
          super.add(a);
    SinkTokenizer dogDetector = new SinkTokenizer(null) {
      public void add(Token t) {
        if (t != null && t.term().equalsIgnoreCase("Dogs")) {
          super.add(t);
        }
      }
    };
    TokenStream source1 = new CachingTokenFilter(new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), theDetector), dogDetector));
    TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer2.toString())), theDetector), dogDetector);
    int i = 0;
    TermAttribute termAtt = (TermAttribute) source1.getAttribute(TermAttribute.class);
    while (source1.incrementToken()) {
      assertTrue(termAtt.term() + " is not equal to " + tokens1[i], termAtt.term().equals(tokens1[i]) == true);
    final Token reusableToken = new Token();
    for (Token nextToken = source1.next(reusableToken); nextToken != null; nextToken = source1.next(reusableToken)) {
      assertTrue(nextToken.term() + " is not equal to " + tokens1[i], nextToken.term().equals(tokens1[i]) == true);
      i++;
    }
    assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
    assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 2, theDetector.getTokens().size() == 2);
    assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 1, dogDetector.getTokens().size() == 1);
    i = 0;
    termAtt = (TermAttribute) source2.getAttribute(TermAttribute.class);
    while (source2.incrementToken()) {
      assertTrue(termAtt.term() + " is not equal to " + tokens2[i], termAtt.term().equals(tokens2[i]) == true);
    for (Token nextToken = source2.next(reusableToken); nextToken != null; nextToken = source2.next(reusableToken)) {
      assertTrue(nextToken.term() + " is not equal to " + tokens2[i], nextToken.term().equals(tokens2[i]) == true);
      i++;
    }
    assertTrue(i + " does not equal: " + tokens2.length, i == tokens2.length);
    assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 4, theDetector.getTokens().size() == 4);
    assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 2, dogDetector.getTokens().size() == 2);
    i = 0;
    termAtt = (TermAttribute) theDetector.getAttribute(TermAttribute.class);
    while (theDetector.incrementToken()) {
      assertTrue(termAtt.term() + " is not equal to " + "The", termAtt.term().equalsIgnoreCase("The") == true);
    for (Token nextToken = theDetector.next(reusableToken); nextToken != null; nextToken = theDetector.next(reusableToken)) {
      assertTrue(nextToken.term() + " is not equal to " + "The", nextToken.term().equalsIgnoreCase("The") == true);
      i++;
    }
    assertTrue(i + " does not equal: " + theDetector.getTokens().size(), i == theDetector.getTokens().size());
    i = 0;
    termAtt = (TermAttribute) dogDetector.getAttribute(TermAttribute.class);
    while (dogDetector.incrementToken()) {
      assertTrue(termAtt.term() + " is not equal to " + "Dogs", termAtt.term().equalsIgnoreCase("Dogs") == true);
    for (Token nextToken = dogDetector.next(reusableToken); nextToken != null; nextToken = dogDetector.next(reusableToken)) {
      assertTrue(nextToken.term() + " is not equal to " + "Dogs", nextToken.term().equalsIgnoreCase("Dogs") == true);
      i++;
    }
    assertTrue(i + " does not equal: " + dogDetector.getTokens().size(), i == dogDetector.getTokens().size());
    source1.reset();
    TokenStream lowerCasing = new LowerCaseFilter(source1);
    i = 0;
    termAtt = (TermAttribute) lowerCasing.getAttribute(TermAttribute.class);
    while (lowerCasing.incrementToken()) {
      assertTrue(termAtt.term() + " is not equal to " + tokens1[i].toLowerCase(), termAtt.term().equals(tokens1[i].toLowerCase()) == true);
    for (Token nextToken = lowerCasing.next(reusableToken); nextToken != null; nextToken = lowerCasing.next(reusableToken)) {
      assertTrue(nextToken.term() + " is not equal to " + tokens1[i].toLowerCase(), nextToken.term().equals(tokens1[i].toLowerCase()) == true);
      i++;
    }
    assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
@@ -167,7 +146,7 @@ public class TestTeeTokenFilter extends LuceneTestCase {
   *
   * @throws Exception
   */
  public void doTestPerformance() throws Exception {
  public void performance() throws Exception {
    int[] tokCount = {100, 500, 1000, 2000, 5000, 10000};
    int[] modCounts = {1, 2, 5, 10, 20, 50, 100, 200, 500};
    for (int k = 0; k < tokCount.length; k++) {
@@ -178,20 +157,21 @@ public class TestTeeTokenFilter extends LuceneTestCase {
      }
      //make sure we produce the same tokens
      ModuloSinkTokenizer sink = new ModuloSinkTokenizer(tokCount[k], 100);
      final Token reusableToken = new Token();
      TokenStream stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink);
      while (stream.incrementToken()) {
      while (stream.next(reusableToken) != null) {
      }
      stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), 100);
      List tmp = new ArrayList();
      while (stream.incrementToken()) {
        tmp.add(stream.captureState());
      for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
        tmp.add(nextToken.clone());
      }
      List sinkList = sink.getTokens();
      assertTrue("tmp Size: " + tmp.size() + " is not: " + sinkList.size(), tmp.size() == sinkList.size());
      for (int i = 0; i < tmp.size(); i++) {
        AttributeSource tfTok = (AttributeSource) tmp.get(i);
        AttributeSource sinkTok = (AttributeSource) sinkList.get(i);
        assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.equals(sinkTok) == true);
        Token tfTok = (Token) tmp.get(i);
        Token sinkTok = (Token) sinkList.get(i);
        assertTrue(tfTok.term() + " is not equal to " + sinkTok.term() + " at token: " + i, tfTok.term().equals(sinkTok.term()) == true);
      }
      //simulate two fields, each being analyzed once, for 20 documents

@@ -200,14 +180,12 @@ public class TestTeeTokenFilter extends LuceneTestCase {
        long start = System.currentTimeMillis();
        for (int i = 0; i < 20; i++) {
          stream = new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString())));
          PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
          while (stream.incrementToken()) {
            tfPos += posIncrAtt.getPositionIncrement();
          for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
            tfPos += nextToken.getPositionIncrement();
          }
          stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), modCounts[j]);
          posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
          while (stream.incrementToken()) {
            tfPos += posIncrAtt.getPositionIncrement();
          for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
            tfPos += nextToken.getPositionIncrement();
          }
        }
        long finish = System.currentTimeMillis();
@@ -218,15 +196,13 @@ public class TestTeeTokenFilter extends LuceneTestCase {
        for (int i = 0; i < 20; i++) {
          sink = new ModuloSinkTokenizer(tokCount[k], modCounts[j]);
          stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink);
          PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
          while (stream.incrementToken()) {
            sinkPos += posIncrAtt.getPositionIncrement();
          for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
            sinkPos += nextToken.getPositionIncrement();
          }
          //System.out.println("Modulo--------");
          stream = sink;
          posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
          while (stream.incrementToken()) {
            sinkPos += posIncrAtt.getPositionIncrement();
          for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
            sinkPos += nextToken.getPositionIncrement();
          }
        }
        finish = System.currentTimeMillis();
@@ -252,15 +228,15 @@ public class TestTeeTokenFilter extends LuceneTestCase {
    int count = 0;

    //return only every modCount'th token
    public boolean incrementToken() throws IOException {
      boolean hasNext;
      for (hasNext = input.incrementToken();
           hasNext && count % modCount != 0;
           hasNext = input.incrementToken()) {
    public Token next(final Token reusableToken) throws IOException {
      Token nextToken = null;
      for (nextToken = input.next(reusableToken);
           nextToken != null && count % modCount != 0;
           nextToken = input.next(reusableToken)) {
        count++;
      }
      count++;
      return hasNext;
      return nextToken;
    }
  }

@@ -274,9 +250,9 @@ public class TestTeeTokenFilter extends LuceneTestCase {
      lst = new ArrayList(numToks % mc);
    }

    public void add(AttributeSource a) throws IOException {
      if (a != null && count % modCount == 0) {
        super.add(a);
    public void add(Token t) {
      if (t != null && count % modCount == 0) {
        super.add(t);
      }
      count++;
    }
@@ -0,0 +1,311 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.index.Payload;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.analysis.tokenattributes.*;

/** This class tests some special cases of backwards compatibility when using the new TokenStream API with old analyzers */
public class TestTokenStreamBWComp extends LuceneTestCase {

  private final String doc = "This is the new TokenStream api";
  private final String[] stopwords = new String[] {"is", "the", "this"};

  public static class POSToken extends Token {
    public static final int PROPERNOUN = 1;
    public static final int NO_NOUN = 2;

    private int partOfSpeech;

    public void setPartOfSpeech(int pos) {
      partOfSpeech = pos;
    }

    public int getPartOfSpeech() {
      return this.partOfSpeech;
    }
  }

  static class PartOfSpeechTaggingFilter extends TokenFilter {

    protected PartOfSpeechTaggingFilter(TokenStream input) {
      super(input);
    }

    public Token next() throws IOException {
      Token t = input.next();
      if (t == null) return null;

      POSToken pt = new POSToken();
      pt.reinit(t);
      if (pt.termLength() > 0) {
        if (Character.isUpperCase(pt.termBuffer()[0])) {
          pt.setPartOfSpeech(POSToken.PROPERNOUN);
        } else {
          pt.setPartOfSpeech(POSToken.NO_NOUN);
        }
      }
      return pt;
    }

  }

  static class PartOfSpeechAnnotatingFilter extends TokenFilter {
    public final static byte PROPER_NOUN_ANNOTATION = 1;

    protected PartOfSpeechAnnotatingFilter(TokenStream input) {
      super(input);
    }

    public Token next() throws IOException {
      Token t = input.next();
      if (t == null) return null;

      if (t instanceof POSToken) {
        POSToken pt = (POSToken) t;
        if (pt.getPartOfSpeech() == POSToken.PROPERNOUN) {
          pt.setPayload(new Payload(new byte[] {PROPER_NOUN_ANNOTATION}));
        }
        return pt;
      } else {
        return t;
      }
    }

  }

  // test the chain: The one and only term "TokenStream" should be declared as proper noun:

  public void testTeeSinkCustomTokenNewAPI() throws IOException {
    testTeeSinkCustomToken(0);
  }

  public void testTeeSinkCustomTokenOldAPI() throws IOException {
    testTeeSinkCustomToken(1);
  }

  public void testTeeSinkCustomTokenVeryOldAPI() throws IOException {
    testTeeSinkCustomToken(2);
  }

  private void testTeeSinkCustomToken(int api) throws IOException {
    TokenStream stream = new WhitespaceTokenizer(new StringReader(doc));
    stream = new PartOfSpeechTaggingFilter(stream);
    stream = new LowerCaseFilter(stream);
    stream = new StopFilter(stream, stopwords);

    SinkTokenizer sink = new SinkTokenizer();
    TokenStream stream1 = new PartOfSpeechAnnotatingFilter(sink);

    stream = new TeeTokenFilter(stream, sink);
    stream = new PartOfSpeechAnnotatingFilter(stream);

    switch (api) {
      case 0:
        consumeStreamNewAPI(stream);
        consumeStreamNewAPI(stream1);
        break;
      case 1:
        consumeStreamOldAPI(stream);
        consumeStreamOldAPI(stream1);
        break;
      case 2:
        consumeStreamVeryOldAPI(stream);
        consumeStreamVeryOldAPI(stream1);
        break;
    }
  }

  // test that caching the special custom POSToken works in all cases

  public void testCachingCustomTokenNewAPI() throws IOException {
    testCachingCustomToken(0);
  }

  public void testCachingCustomTokenOldAPI() throws IOException {
    testCachingCustomToken(1);
  }

  public void testCachingCustomTokenVeryOldAPI() throws IOException {
    testCachingCustomToken(2);
  }

  public void testCachingCustomTokenMixed() throws IOException {
    testCachingCustomToken(3);
  }

  private void testCachingCustomToken(int api) throws IOException {
    TokenStream stream = new WhitespaceTokenizer(new StringReader(doc));
    stream = new PartOfSpeechTaggingFilter(stream);
    stream = new LowerCaseFilter(stream);
    stream = new StopFilter(stream, stopwords);
    stream = new CachingTokenFilter(stream); // <- the caching is done before the annotating!
    stream = new PartOfSpeechAnnotatingFilter(stream);

    switch (api) {
      case 0:
        consumeStreamNewAPI(stream);
        consumeStreamNewAPI(stream);
        break;
      case 1:
        consumeStreamOldAPI(stream);
        consumeStreamOldAPI(stream);
        break;
      case 2:
        consumeStreamVeryOldAPI(stream);
        consumeStreamVeryOldAPI(stream);
        break;
      case 3:
        consumeStreamNewAPI(stream);
        consumeStreamOldAPI(stream);
        consumeStreamVeryOldAPI(stream);
        consumeStreamNewAPI(stream);
        consumeStreamVeryOldAPI(stream);
        break;
    }
  }

  private static void consumeStreamNewAPI(TokenStream stream) throws IOException {
    stream.reset();
    PayloadAttribute payloadAtt = (PayloadAttribute) stream.addAttribute(PayloadAttribute.class);
    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);

    while (stream.incrementToken()) {
      String term = termAtt.term();
      Payload p = payloadAtt.getPayload();
      if (p != null && p.getData().length == 1 && p.getData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION) {
        assertTrue("only TokenStream is a proper noun", "tokenstream".equals(term));
      } else {
        assertFalse("all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)", "tokenstream".equals(term));
      }
    }
  }

  private static void consumeStreamOldAPI(TokenStream stream) throws IOException {
    stream.reset();
    Token reusableToken = new Token();

    while ((reusableToken = stream.next(reusableToken)) != null) {
      String term = reusableToken.term();
      Payload p = reusableToken.getPayload();
      if (p != null && p.getData().length == 1 && p.getData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION) {
        assertTrue("only TokenStream is a proper noun", "tokenstream".equals(term));
      } else {
        assertFalse("all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)", "tokenstream".equals(term));
      }
    }
  }

  private static void consumeStreamVeryOldAPI(TokenStream stream) throws IOException {
    stream.reset();

    Token token;
    while ((token = stream.next()) != null) {
      String term = token.term();
      Payload p = token.getPayload();
      if (p != null && p.getData().length == 1 && p.getData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION) {
        assertTrue("only TokenStream is a proper noun", "tokenstream".equals(term));
      } else {
        assertFalse("all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)", "tokenstream".equals(term));
      }
    }
  }

  // test that tokenization fails if only the new API is allowed and an old TokenStream is in the chain
  public void testOnlyNewAPI() throws IOException {
    TokenStream.setOnlyUseNewAPI(true);
    try {

      // this should fail with UOE
      try {
        TokenStream stream = new WhitespaceTokenizer(new StringReader(doc));
        stream = new PartOfSpeechTaggingFilter(stream); // <-- this one is evil!
        stream = new LowerCaseFilter(stream);
        stream = new StopFilter(stream, stopwords);
        while (stream.incrementToken());
        fail("If only the new API is allowed, this should fail with an UOE");
      } catch (UnsupportedOperationException uoe) {
        assertTrue((PartOfSpeechTaggingFilter.class.getName()+" does not implement incrementToken() which is needed for onlyUseNewAPI.").equals(uoe.getMessage()));
      }

      // this should pass, as all core token streams support the new API
      TokenStream stream = new WhitespaceTokenizer(new StringReader(doc));
      stream = new LowerCaseFilter(stream);
      stream = new StopFilter(stream, stopwords);
      while (stream.incrementToken());

      // Test that all attributes are implemented by their implementation class, not Token/TokenWrapper
      assertTrue("TermAttribute is implemented by TermAttributeImpl",
        stream.addAttribute(TermAttribute.class) instanceof TermAttributeImpl);
      assertTrue("OffsetAttribute is implemented by OffsetAttributeImpl",
        stream.addAttribute(OffsetAttribute.class) instanceof OffsetAttributeImpl);
      assertTrue("FlagsAttribute is implemented by FlagsAttributeImpl",
        stream.addAttribute(FlagsAttribute.class) instanceof FlagsAttributeImpl);
      assertTrue("PayloadAttribute is implemented by PayloadAttributeImpl",
        stream.addAttribute(PayloadAttribute.class) instanceof PayloadAttributeImpl);
      assertTrue("PositionIncrementAttribute is implemented by PositionIncrementAttributeImpl",
        stream.addAttribute(PositionIncrementAttribute.class) instanceof PositionIncrementAttributeImpl);
      assertTrue("TypeAttribute is implemented by TypeAttributeImpl",
        stream.addAttribute(TypeAttribute.class) instanceof TypeAttributeImpl);

      // Test if the wrapper API (onlyUseNewAPI==false) uses TokenWrapper
      // as attribute instance.
      // TokenWrapper encapsulates a Token instance that can be exchanged
      // by another Token instance without changing the AttributeImpl instance
      // itself.
      TokenStream.setOnlyUseNewAPI(false);
      stream = new WhitespaceTokenizer(new StringReader(doc));
      assertTrue("TermAttribute is implemented by TokenWrapper",
        stream.addAttribute(TermAttribute.class) instanceof TokenWrapper);
      assertTrue("OffsetAttribute is implemented by TokenWrapper",
        stream.addAttribute(OffsetAttribute.class) instanceof TokenWrapper);
      assertTrue("FlagsAttribute is implemented by TokenWrapper",
        stream.addAttribute(FlagsAttribute.class) instanceof TokenWrapper);
      assertTrue("PayloadAttribute is implemented by TokenWrapper",
        stream.addAttribute(PayloadAttribute.class) instanceof TokenWrapper);
      assertTrue("PositionIncrementAttribute is implemented by TokenWrapper",
        stream.addAttribute(PositionIncrementAttribute.class) instanceof TokenWrapper);
      assertTrue("TypeAttribute is implemented by TokenWrapper",
        stream.addAttribute(TypeAttribute.class) instanceof TokenWrapper);

    } finally {
      TokenStream.setOnlyUseNewAPI(false);
    }
  }

  public void testOverridesAny() throws Exception {
    try {
      TokenStream stream = new WhitespaceTokenizer(new StringReader(doc));
      stream = new TokenFilter(stream) {
        // we implement nothing, only un-abstract it
      };
      stream = new LowerCaseFilter(stream);
      stream = new StopFilter(stream, stopwords);
      while (stream.incrementToken());
      fail("One TokenFilter does not override any of the required methods, so it should fail.");
    } catch (UnsupportedOperationException uoe) {
      assertTrue(uoe.getMessage().endsWith("does not implement any of incrementToken(), next(Token), next()."));
    }
  }

}
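The backwards-compatibility cases above all hinge on the same bridge: a filter that only implements the deprecated next(Token)/next() can still be consumed through incrementToken(), because the 2.9 TokenStream base class wraps the calls (unless setOnlyUseNewAPI(true) forbids it). A minimal sketch of such an old-style filter with a new-style consumer (not part of the commit):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class OldApiBridgeSketch {
  // Old-style filter: uppercases terms and only overrides next(Token).
  static class UpperCaseOldApiFilter extends TokenFilter {
    UpperCaseOldApiFilter(TokenStream input) { super(input); }
    public Token next(final Token reusableToken) throws IOException {
      Token t = input.next(reusableToken);
      if (t == null) return null;
      String up = t.term().toUpperCase();
      t.setTermBuffer(up.toCharArray(), 0, up.length());
      return t;
    }
  }

  public static void main(String[] args) throws IOException {
    TokenStream s = new UpperCaseOldApiFilter(
        new WhitespaceTokenizer(new StringReader("mixed Case words")));
    TermAttribute termAtt = (TermAttribute) s.addAttribute(TermAttribute.class);
    while (s.incrementToken()) { // bridged to next(Token) internally
      System.out.println(termAtt.term());
    }
  }
}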
@@ -141,11 +141,11 @@ public class TestDocumentWriter extends LuceneTestCase {
    public TokenStream tokenStream(String fieldName, Reader reader) {
      return new TokenFilter(new WhitespaceTokenizer(reader)) {
        boolean first=true;
        AttributeSource state;
        AttributeSource.State state;

        public boolean incrementToken() throws IOException {
          if (state != null) {
            state.restoreState(this);
            restoreState(state);
            payloadAtt.setPayload(null);
            posIncrAtt.setPositionIncrement(0);
            termAtt.setTermBuffer(new char[]{'b'}, 0, 1);
@@ -28,6 +28,7 @@ import java.util.Iterator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SinkTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
@@ -3521,47 +3522,21 @@ public class TestIndexWriter extends LuceneTestCase
    }
  }

  private static class MyAnalyzer extends Analyzer {

    public TokenStream tokenStream(String fieldName, Reader reader) {
      TokenStream s = new WhitespaceTokenizer(reader);
      s.addAttribute(PositionIncrementAttribute.class);
      return s;
    }

  }

  // LUCENE-1255
  public void testNegativePositions() throws Throwable {
    SinkTokenizer tokens = new SinkTokenizer();
    tokens.addAttribute(TermAttribute.class);
    tokens.addAttribute(PositionIncrementAttribute.class);

    AttributeSource state = new AttributeSource();
    TermAttribute termAtt = (TermAttribute) state.addAttribute(TermAttribute.class);
    PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class);
    termAtt.setTermBuffer("a");
    posIncrAtt.setPositionIncrement(0);
    tokens.add(state);

    state = new AttributeSource();
    termAtt = (TermAttribute) state.addAttribute(TermAttribute.class);
    posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class);

    termAtt.setTermBuffer("b");
    posIncrAtt.setPositionIncrement(1);
    tokens.add(state);

    state = new AttributeSource();
    termAtt = (TermAttribute) state.addAttribute(TermAttribute.class);
    posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class);

    termAtt.setTermBuffer("c");
    posIncrAtt.setPositionIncrement(1);
    tokens.add(state);
    Token t = new Token();
    t.setTermBuffer("a");
    t.setPositionIncrement(0);
    tokens.add(t);
    t.setTermBuffer("b");
    t.setPositionIncrement(1);
    tokens.add(t);
    t.setTermBuffer("c");
    tokens.add(t);

    MockRAMDirectory dir = new MockRAMDirectory();
    IndexWriter w = new IndexWriter(dir, new MyAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
    IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
    Document doc = new Document();
    doc.add(new Field("field", tokens));
    w.addDocument(doc);
@@ -17,6 +17,7 @@ package org.apache.lucene.queryParser;
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;

@@ -317,8 +318,8 @@ public class TestMultiFieldQueryParser extends LuceneTestCase {
  }

  private static class EmptyTokenStream extends TokenStream {
    public Token next(final Token reusableToken) {
      return null;
    public boolean incrementToken() throws IOException {
      return false;
    }
  }
}
@@ -44,7 +44,6 @@ public abstract class LuceneTestCase extends TestCase {

  protected void setUp() throws Exception {
    ConcurrentMergeScheduler.setTestMode();
    TokenStream.setUseNewAPIDefault(true);
  }

  protected void tearDown() throws Exception {
@@ -0,0 +1,122 @@
package org.apache.lucene.util;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.*;

import java.util.Iterator;

public class TestAttributeSource extends LuceneTestCase {

  public void testCaptureState() {
    // init a first instance
    AttributeSource src = new AttributeSource();
    TermAttribute termAtt = (TermAttribute) src.addAttribute(TermAttribute.class);
    TypeAttribute typeAtt = (TypeAttribute) src.addAttribute(TypeAttribute.class);
    termAtt.setTermBuffer("TestTerm");
    typeAtt.setType("TestType");
    final int hashCode = src.hashCode();

    AttributeSource.State state = src.captureState();

    // modify the attributes
    termAtt.setTermBuffer("AnotherTestTerm");
    typeAtt.setType("AnotherTestType");
    assertTrue("Hash code should be different", hashCode != src.hashCode());

    src.restoreState(state);
    assertEquals("TestTerm", termAtt.term());
    assertEquals("TestType", typeAtt.type());
    assertEquals("Hash code should be equal after restore", hashCode, src.hashCode());

    // restore into an exactly configured copy
    AttributeSource copy = new AttributeSource();
    copy.addAttribute(TermAttribute.class);
    copy.addAttribute(TypeAttribute.class);
    copy.restoreState(state);
    assertEquals("Both AttributeSources should have same hashCode after restore", src.hashCode(), copy.hashCode());
    assertEquals("Both AttributeSources should be equal after restore", src, copy);

    // init a second instance (with attributes in different order and one additional attribute)
    AttributeSource src2 = new AttributeSource();
    typeAtt = (TypeAttribute) src2.addAttribute(TypeAttribute.class);
    FlagsAttribute flagsAtt = (FlagsAttribute) src2.addAttribute(FlagsAttribute.class);
    termAtt = (TermAttribute) src2.addAttribute(TermAttribute.class);
    flagsAtt.setFlags(12345);

    src2.restoreState(state);
    assertEquals("TestTerm", termAtt.term());
    assertEquals("TestType", typeAtt.type());
    assertEquals("FlagsAttribute should not be touched", 12345, flagsAtt.getFlags());

    // init a third instance missing one Attribute
    AttributeSource src3 = new AttributeSource();
    termAtt = (TermAttribute) src3.addAttribute(TermAttribute.class);
    try {
      src3.restoreState(state);
      fail("The third instance is missing the TypeAttribute, so restoreState() should throw IllegalArgumentException");
    } catch (IllegalArgumentException iae) {
      // pass
    }
  }

  public void testCloneAttributes() {
    final AttributeSource src = new AttributeSource();
    final TermAttribute termAtt = (TermAttribute) src.addAttribute(TermAttribute.class);
    final TypeAttribute typeAtt = (TypeAttribute) src.addAttribute(TypeAttribute.class);
    termAtt.setTermBuffer("TestTerm");
    typeAtt.setType("TestType");

    final AttributeSource clone = src.cloneAttributes();
    final Iterator it = clone.getAttributeClassesIterator();
    assertEquals("TermAttribute must be the first attribute", TermAttribute.class, it.next());
    assertEquals("TypeAttribute must be the second attribute", TypeAttribute.class, it.next());
    assertFalse("No more attributes", it.hasNext());

    final TermAttribute termAtt2 = (TermAttribute) clone.getAttribute(TermAttribute.class);
    final TypeAttribute typeAtt2 = (TypeAttribute) clone.getAttribute(TypeAttribute.class);
    assertNotSame("TermAttribute of original and clone must be different instances", termAtt2, termAtt);
    assertNotSame("TypeAttribute of original and clone must be different instances", typeAtt2, typeAtt);
    assertEquals("TermAttribute of original and clone must be equal", termAtt2, termAtt);
    assertEquals("TypeAttribute of original and clone must be equal", typeAtt2, typeAtt);
  }

|
||||
public void testToStringAndMultiAttributeImplementations() {
|
||||
AttributeSource src = new AttributeSource();
|
||||
TermAttribute termAtt = (TermAttribute) src.addAttribute(TermAttribute.class);
|
||||
TypeAttribute typeAtt = (TypeAttribute) src.addAttribute(TypeAttribute.class);
|
||||
termAtt.setTermBuffer("TestTerm");
|
||||
typeAtt.setType("TestType");
|
||||
assertEquals("Attributes should appear in original order", "("+termAtt.toString()+","+typeAtt.toString()+")", src.toString());
|
||||
|
||||
src = new AttributeSource();
|
||||
src.addAttributeImpl(new Token());
|
||||
// this should not add a new attribute as Token implements TermAttribute, too
|
||||
termAtt = (TermAttribute) src.addAttribute(TermAttribute.class);
|
||||
assertTrue("TermAttribute should be implemented by Token", termAtt instanceof Token);
|
||||
// get the Token attribute and check, that it is the only one
|
||||
final Iterator it = src.getAttributeImplsIterator();
|
||||
Token tok = (Token) it.next();
|
||||
assertFalse("There should be only one attribute implementation instance", it.hasNext());
|
||||
|
||||
termAtt.setTermBuffer("TestTerm");
|
||||
assertEquals("Token should only printed once", "("+tok.toString()+")", src.toString());
|
||||
}
|
||||
|
||||
}
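captureState()/restoreState(), exercised above, are the building blocks for filters that buffer or replay tokens: capture freezes a copy of all attribute values, restore writes such a copy back into the live attributes. A hedged sketch of a filter using the pair (not part of the commit; a real duplicator would also adjust the position increment of the copy):

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.AttributeSource;

// Emits every input token twice by replaying a captured state once.
class RepeatFilter extends TokenFilter {
  private AttributeSource.State pending;

  RepeatFilter(TokenStream input) { super(input); }

  public boolean incrementToken() throws IOException {
    if (pending != null) {
      restoreState(pending); // replay the saved attribute values
      pending = null;
      return true;
    }
    if (!input.incrementToken()) return false;
    pending = captureState(); // snapshot for the duplicate
    return true;
  }
}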