LUCENE-1422: New TokenStream API that uses a new class called AttributeSource instead of the now deprecated Token class. All attributes that the Token class had have been moved into separate classes: TermAttribute, OffsetAttribute, PositionIncrementAttribute, PayloadAttribute, TypeAttribute and FlagsAttribute.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@718798 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael Busch 2008-11-18 23:41:49 +00:00
parent 72e94add53
commit 898cfe87cd
69 changed files with 3226 additions and 628 deletions

View File

@ -25,6 +25,15 @@ API Changes
and deprecate FSDirectory.getDirectory(). FSDirectory instances
are not required to be singletons per path. (yonik)
4. LUCENE-1422: New TokenStream API that uses a new class called
AttributeSource instead of the now deprecated Token class. All attributes
that the Token class had have been moved into separate classes:
TermAttribute, OffsetAttribute, PositionIncrementAttribute,
PayloadAttribute, TypeAttribute and FlagsAttribute. The new API
is much more flexible; it allows combining the Attributes arbitrarily
and also defining custom Attributes. The new API has the same performance
as the old next(Token) approach. (Michael Busch)
Bug fixes
1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals()

View File

@ -22,6 +22,8 @@ import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.util.AttributeSource;
/**
* This class can be used if the Tokens of a TokenStream
* are intended to be consumed more than once. It caches
@ -40,6 +42,25 @@ public class CachingTokenFilter extends TokenFilter {
super(input);
}
/**
 * Advances this filter to the next cached token state. On the first call the
 * cache is created and filled from the input via fillCache().
 *
 * @return false once the cache is exhausted; true otherwise
 */
public boolean incrementToken() throws IOException {
  if (cache == null) {
    // first call: build the cache lazily and start iterating over it
    cache = new LinkedList();
    fillCache();
    iterator = cache.iterator();
  }

  if (iterator.hasNext()) {
    // reset() rewinds the iterator over the same list, so the cached entry is
    // only restored into this stream and is itself left untouched
    AttributeSource cached = (AttributeSource) iterator.next();
    cached.restoreState(this);
    return true;
  }

  // nothing left in the cache
  return false;
}
/** @deprecated */
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (cache == null) {
@ -60,10 +81,17 @@ public class CachingTokenFilter extends TokenFilter {
public void reset() throws IOException {
if(cache != null) {
iterator = cache.iterator();
iterator = cache.iterator();
}
}
/** Reads the input to its end, adding one captured state per token to the cache. */
private void fillCache() throws IOException {
  for (;;) {
    if (!input.incrementToken()) {
      return;
    }
    cache.add(captureState());
  }
}
/** @deprecated */
private void fillCache(final Token reusableToken) throws IOException {
for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
cache.add(nextToken.clone());

View File

@ -20,10 +20,15 @@ package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/** An abstract base class for simple, character-oriented tokenizers.*/
public abstract class CharTokenizer extends Tokenizer {
public CharTokenizer(Reader input) {
super(input);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
private int offset = 0, bufferIndex = 0, dataLen = 0;
@ -31,6 +36,9 @@ public abstract class CharTokenizer extends Tokenizer {
private static final int IO_BUFFER_SIZE = 4096;
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
private TermAttribute termAtt;
private OffsetAttribute offsetAtt;
/** Returns true iff a character should be included in a token. This
* tokenizer generates as tokens adjacent sequences of characters which
* satisfy this predicate. Characters for which this is false are used to
@ -44,6 +52,50 @@ public abstract class CharTokenizer extends Tokenizer {
return c;
}
/**
 * Advances to the next token: a run of adjacent characters accepted by
 * isTokenChar(char), each stored after normalize(char), ended by the first
 * rejected character, by end of input, or when MAX_WORD_LEN characters
 * have been collected.
 *
 * @return false when the input is exhausted and no characters were collected; true otherwise
 * @throws IOException propagated from reading the input
 */
public final boolean incrementToken() throws IOException {
clearAttributes();
int length = 0;
int start = bufferIndex;
char[] buffer = termAtt.termBuffer();
while (true) {
// ioBuffer is used up: refill it from the reader
if (bufferIndex >= dataLen) {
// offset accumulates the sizes of earlier fills so token positions are absolute
offset += dataLen;
dataLen = input.read(ioBuffer);
if (dataLen == -1) {
// end of input: emit the pending token if there is one, otherwise signal end of stream
// NOTE(review): dataLen stays -1 here, so the next call adds -1 to offset above - confirm this is harmless
if (length > 0)
break;
else
return false;
}
bufferIndex = 0;
}
final char c = ioBuffer[bufferIndex++];
if (isTokenChar(c)) { // if it's a token char
if (length == 0) // start of token
// bufferIndex was already advanced past c, hence the -1
start = offset + bufferIndex - 1;
else if (length == buffer.length)
// term buffer is full: request room for at least one more char
buffer = termAtt.resizeTermBuffer(1+length);
buffer[length++] = normalize(c); // buffer it, normalized
if (length == MAX_WORD_LEN) // buffer overflow!
break;
} else if (length > 0) // at non-Letter w/ chars
break; // return 'em
}
// publish the collected term and its start/end offsets
termAtt.setTermLength(length);
offsetAtt.setStartOffset(start);
offsetAtt.setEndOffset(start+length);
return true;
}
/** @deprecated */
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
reusableToken.clear();

View File

@ -1,5 +1,7 @@
package org.apache.lucene.analysis;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -27,11 +29,33 @@ package org.apache.lucene.analysis;
public class ISOLatin1AccentFilter extends TokenFilter {
public ISOLatin1AccentFilter(TokenStream input) {
super(input);
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
private char[] output = new char[256];
private int outputPos;
private TermAttribute termAtt;
/**
 * Advances the input and, when the term contains at least one character in
 * the range U+00C0 to U+FB06, replaces the term with the contents of the
 * output buffer produced by removeAccents.
 *
 * @return false at end of stream; true otherwise
 */
public final boolean incrementToken() throws java.io.IOException {
  if (!input.incrementToken()) {
    return false;
  }

  final char[] termChars = termAtt.termBuffer();
  final int termLen = termAtt.termLength();

  // Look for the first character that may need rewriting; a term without
  // any such character is passed through unchanged.
  int pos = 0;
  while (pos < termLen) {
    final char ch = termChars[pos];
    if (ch >= '\u00c0' && ch <= '\uFB06') {
      removeAccents(termChars, termLen);
      termAtt.setTermBuffer(output, 0, outputPos);
      break;
    }
    pos++;
  }
  return true;
}
/** @deprecated */
public final Token next(final Token reusableToken) throws java.io.IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
@ -241,7 +265,7 @@ public class ISOLatin1AccentFilter extends TokenFilter {
case '\uFB06': //
output[outputPos++] = 's';
output[outputPos++] = 't';
break;
break;
default :
output[outputPos++] = c;
break;

View File

@ -20,6 +20,9 @@ package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Emits the entire input as a single token.
*/
@ -28,6 +31,8 @@ public class KeywordTokenizer extends Tokenizer {
private static final int DEFAULT_BUFFER_SIZE = 256;
private boolean done;
private TermAttribute termAtt;
private OffsetAttribute offsetAtt;
public KeywordTokenizer(Reader input) {
this(input, DEFAULT_BUFFER_SIZE);
@ -36,8 +41,32 @@ public class KeywordTokenizer extends Tokenizer {
/**
 * Creates a tokenizer that emits the entire input as a single token.
 *
 * @param input the reader to consume
 * @param bufferSize capacity requested up front for the term buffer
 */
public KeywordTokenizer(Reader input, int bufferSize) {
  super(input);
  this.done = false;
  termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  // bufferSize used to be accepted but ignored; request that capacity now so
  // incrementToken() can read into the term buffer without growing it first.
  // NOTE(review): assumes resizeTermBuffer only ever grows the buffer - confirm.
  termAtt.resizeTermBuffer(bufferSize);
}
/**
 * Emits the whole input as one token on the first call and reports end of
 * stream on every later call.
 *
 * @return true for the single token; false afterwards
 */
public boolean incrementToken() throws IOException {
  if (done) {
    return false;
  }
  done = true;

  int filled = 0;
  termAtt.clear();
  char[] chars = termAtt.termBuffer();
  // keep reading into the free tail of the term buffer until the reader reports end of input
  for (int read = input.read(chars, filled, chars.length - filled);
       read != -1;
       read = input.read(chars, filled, chars.length - filled)) {
    filled += read;
    if (filled == chars.length) {
      // buffer is full: ask for more room before the next read
      chars = termAtt.resizeTermBuffer(1 + chars.length);
    }
  }

  termAtt.setTermLength(filled);
  offsetAtt.setStartOffset(0);
  offsetAtt.setEndOffset(filled);
  return true;
}
/** @deprecated */
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (!done) {

View File

@ -19,6 +19,8 @@ package org.apache.lucene.analysis;
import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Removes words that are too long and too short from the stream.
*
@ -30,6 +32,8 @@ public final class LengthFilter extends TokenFilter {
final int min;
final int max;
private TermAttribute termAtt;
/**
* Build a filter that removes words that are too long or too
* short from the text.
@ -39,11 +43,29 @@ public final class LengthFilter extends TokenFilter {
super(in);
this.min = min;
this.max = max;
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
/**
 * Advances to the next input token whose term length lies between min and
 * max, both inclusive.
 *
 * @return false at end of stream; true otherwise
 */
public final boolean incrementToken() throws IOException {
  while (input.incrementToken()) {
    final int termLen = termAtt.termLength();
    if (termLen < min || termLen > max) {
      // note: the token is ignored here; whether its parts should be indexed
      // instead is an open question
      continue;
    }
    return true;
  }
  // input exhausted without another token of acceptable length
  return false;
}
/**
* Returns the next input Token whose term() is the right len
* @deprecated
*/
public final Token next(final Token reusableToken) throws IOException
{
assert reusableToken != null;

View File

@ -19,6 +19,8 @@ package org.apache.lucene.analysis;
import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Normalizes token text to lower case.
*
@ -27,8 +29,25 @@ import java.io.IOException;
public final class LowerCaseFilter extends TokenFilter {
public LowerCaseFilter(TokenStream in) {
super(in);
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
private TermAttribute termAtt;
/**
 * Advances the input and lower-cases the term characters in place using
 * Character.toLowerCase(char).
 *
 * @return false at end of stream; true otherwise
 */
public final boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  final char[] chars = termAtt.termBuffer();
  final int count = termAtt.termLength();
  int i = 0;
  while (i < count) {
    chars[i] = Character.toLowerCase(chars[i]);
    i++;
  }
  return true;
}
/** @deprecated */
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);

View File

@ -19,6 +19,8 @@ package org.apache.lucene.analysis;
import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/** Transforms the token stream as per the Porter stemming algorithm.
Note: the input to the stemming filter must already be in lower case,
so you will need to use LowerCaseFilter or LowerCaseTokenizer farther
@ -39,12 +41,24 @@ import java.io.IOException;
*/
public final class PorterStemFilter extends TokenFilter {
private PorterStemmer stemmer;
private TermAttribute termAtt;
public PorterStemFilter(TokenStream in) {
super(in);
stemmer = new PorterStemmer();
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
/**
 * Advances the input and runs the stemmer over the current term; when
 * stem() returns true the term is replaced with the stemmer's result buffer.
 *
 * @return false at end of stream; true otherwise
 */
public final boolean incrementToken() throws IOException {
  if (input.incrementToken()) {
    final boolean changed = stemmer.stem(termAtt.termBuffer(), 0, termAtt.termLength());
    if (changed) {
      termAtt.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
    }
    return true;
  }
  return false;
}
/** @deprecated */
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);

View File

@ -22,6 +22,8 @@ import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.util.AttributeSource;
/**
* A SinkTokenizer can be used to cache Tokens for use in an Analyzer
@ -61,10 +63,30 @@ public class SinkTokenizer extends Tokenizer {
return lst;
}
/**
 * Advances this stream to the next entry in the list of cached states and
 * restores that entry into this stream.
 *
 * @return false once every cached entry has been returned; true otherwise
 * @throws IOException
 */
public boolean incrementToken() throws IOException {
  if (iter == null) {
    iter = lst.iterator();
  }
  if (!iter.hasNext()) {
    return false;
  }
  // This stream can be reset, so the cached entries themselves must stay
  // unmodified; each one is only restored into this stream.
  AttributeSource cached = (AttributeSource) iter.next();
  cached.restoreState(this);
  return true;
}
/**
 * Appends the given attribute state to this sink's list of cached entries.
 *
 * @param source the state to store
 */
public void add(AttributeSource source) throws IOException {
lst.add(source);
}
/**
* Returns the next token out of the list of cached tokens
* @return The next {@link org.apache.lucene.analysis.Token} in the Sink.
* @throws IOException
* @deprecated
*/
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
@ -77,8 +99,6 @@ public class SinkTokenizer extends Tokenizer {
return null;
}
/**
* Override this method to cache only certain tokens, or new tokens based
* on the old tokens.

View File

@ -21,6 +21,9 @@ import java.io.IOException;
import java.util.Arrays;
import java.util.Set;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Removes stop words from a token stream.
*/
@ -32,6 +35,9 @@ public final class StopFilter extends TokenFilter {
private final CharArraySet stopWords;
private boolean enablePositionIncrements = ENABLE_POSITION_INCREMENTS_DEFAULT;
private TermAttribute termAtt;
private PositionIncrementAttribute posIncrAtt;
/**
* Construct a token stream filtering the given input.
*/
@ -47,6 +53,7 @@ public final class StopFilter extends TokenFilter {
public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) {
super(in);
this.stopWords = (CharArraySet)makeStopSet(stopWords, ignoreCase);
init();
}
@ -74,6 +81,7 @@ public final class StopFilter extends TokenFilter {
this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
this.stopWords.addAll(stopWords);
}
init();
}
/**
@ -86,6 +94,11 @@ public final class StopFilter extends TokenFilter {
this(in, stopWords, false);
}
public void init() {
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
}
/**
* Builds a Set from an array of stop words,
* appropriate for passing into the StopFilter constructor.
@ -113,6 +126,26 @@ public final class StopFilter extends TokenFilter {
/**
 * Advances to the next input token whose term is not a stop word. When
 * position increments are enabled, the increments of the skipped stop words
 * are added to the returned token's position increment.
 *
 * @return false at end of stream; true otherwise
 */
public final boolean incrementToken() throws IOException {
  // sum of the position increments of the stop words dropped during this call
  int skipped = 0;
  while (input.incrementToken()) {
    final boolean isStopWord =
        stopWords.contains(termAtt.termBuffer(), 0, termAtt.termLength());
    if (isStopWord) {
      skipped += posIncrAtt.getPositionIncrement();
      continue;
    }
    if (enablePositionIncrements) {
      posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skipped);
    }
    return true;
  }
  // input exhausted: no further non-stop-word token
  return false;
}
/**
* Returns the next input Token whose term() is not a stop word.
* @deprecated
*/
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
// return the first non-stop word found

View File

@ -18,6 +18,7 @@
package org.apache.lucene.analysis;
import java.io.IOException;
import java.util.Iterator;
/**
@ -60,8 +61,21 @@ public class TeeTokenFilter extends TokenFilter {
/**
 * Wraps the given input and registers on the sink the class of every
 * attribute returned by getAttributesIterator() at construction time.
 *
 * @param input the stream to read tokens from
 * @param sink the sink that receives a captured state for each token
 */
public TeeTokenFilter(TokenStream input, SinkTokenizer sink) {
  super(input);
  this.sink = sink;
  for (Iterator attrs = getAttributesIterator(); attrs.hasNext();) {
    sink.addAttribute(attrs.next().getClass());
  }
}
/**
 * Advances the input and, for every token, adds a captured copy of the
 * current state to the sink.
 *
 * @return false at end of stream; true otherwise
 */
public boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  sink.add(captureState());
  return true;
}
/** @deprecated */
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);

View File

@ -21,7 +21,11 @@ import org.apache.lucene.index.Payload;
import org.apache.lucene.index.TermPositions; // for javadoc
import org.apache.lucene.util.ArrayUtil;
/** A Token is an occurrence of a term from the text of a field. It consists of
/**
This class is now deprecated and a new TokenStream API was introduced with Lucene 2.9.
See Javadocs in {@link TokenStream} for further details.
<p>
A Token is an occurrence of a term from the text of a field. It consists of
a term's text, the start and end offset of the term in the text of the field,
and a type string.
<p>
@ -114,6 +118,8 @@ import org.apache.lucene.util.ArrayUtil;
</p>
@see org.apache.lucene.index.Payload
@deprecated A new TokenStream API was introduced with Lucene 2.9.
See javadocs in {@link TokenStream} for further details.
*/
public class Token implements Cloneable {

View File

@ -22,9 +22,16 @@ import java.io.IOException;
/** A TokenFilter is a TokenStream whose input is another token stream.
<p>
This is an abstract class.
NOTE: subclasses must override {@link #next(Token)}. It's
also OK to instead override {@link #next()} but that
method is now deprecated in favor of {@link #next(Token)}.
NOTE: subclasses must override
{@link #incrementToken()} if the new TokenStream API is used
and {@link #next(Token)} or {@link #next()} if the old
TokenStream API is used.
* <p><font color="#FF0000">
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
* We will make our best efforts to keep the APIs backwards-compatible.</font>
<p>
See {@link TokenStream}
*/
public abstract class TokenFilter extends TokenStream {
/** The source of tokens for this filter. */
@ -32,6 +39,7 @@ public abstract class TokenFilter extends TokenStream {
/** Construct a token stream filtering the given input. */
protected TokenFilter(TokenStream input) {
super(input);
this.input = input;
}
@ -45,4 +53,17 @@ public abstract class TokenFilter extends TokenStream {
super.reset();
input.reset();
}
public boolean useNewAPI() {
return input.useNewAPI();
}
/**
* Sets whether or not to use the new TokenStream API. Setting this
* will apply to this Filter and all TokenStream/Filters upstream.
*/
public void setUseNewAPI(boolean use) {
input.setUseNewAPI(use);
}
}

View File

@ -17,9 +17,12 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
import org.apache.lucene.index.Payload;
import java.io.IOException;
import java.util.Iterator;
import org.apache.lucene.index.Payload;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeSource;
/** A TokenStream enumerates the sequence of tokens, either from
fields of a document or from query text.
@ -31,12 +34,139 @@ import java.io.IOException;
<li>{@link TokenFilter}, a TokenStream
whose input is another TokenStream.
</ul>
NOTE: subclasses must override {@link #next(Token)}. It's
also OK to instead override {@link #next()} but that
method is now deprecated in favor of {@link #next(Token)}.
A new TokenStream API is introduced with Lucene 2.9. Since
2.9 Token is deprecated and the preferred way to store
the information of a token is to use {@link Attribute}s.
<p>
For that reason TokenStream extends {@link AttributeSource}
now. Note that only one instance per {@link Attribute} is
created and reused for every token. This approach reduces
object creations and allows local caching of references to
the {@link Attribute}s. See {@link #incrementToken()} for further details.
<p>
<b>The workflow of the new TokenStream API is as follows:</b>
<ol>
<li>Instantiation of TokenStream/TokenFilters which add/get attributes
to/from the {@link AttributeSource}.
<li>The consumer calls {@link TokenStream#reset()}.
<li>the consumer retrieves attributes from the
stream and stores local references to all attributes it wants to access
<li>The consumer calls {@link #incrementToken()} until it returns false and
consumes the attributes after each call.
</ol>
To make sure that filters and consumers know which attributes are available
the attributes must be added during instantiation. Filters and
consumers are not required to check for availability of attributes in {@link #incrementToken()}.
<p>
Sometimes it is desirable to capture a current state of a
TokenStream, e. g. for buffering purposes (see {@link CachingTokenFilter},
{@link TeeTokenFilter}/{@link SinkTokenizer}). For this usecase
{@link AttributeSource#captureState()} and {@link AttributeSource#restoreState(AttributeSource)} can be used.
<p>
<b>NOTE:</b> In order to enable the new API the method
{@link #setUseNewAPI(boolean)} has to be called with use=true.
Otherwise the deprecated method {@link #next(Token)} will
be used by Lucene consumers (indexer and queryparser) to
consume the tokens. {@link #next(Token)} will be removed
in Lucene 3.0.
<p>
NOTE: To use the old API subclasses must override {@link #next(Token)}.
It's also OK to instead override {@link #next()} but that
method is slower compared to {@link #next(Token)}.
* <p><font color="#FF0000">
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
* We will make our best efforts to keep the APIs backwards-compatible.</font>
*/
public abstract class TokenStream {
public abstract class TokenStream extends AttributeSource {
private static boolean useNewAPIDefault = false;
private boolean useNewAPI = useNewAPIDefault;
protected TokenStream() {
super();
}
protected TokenStream(AttributeSource input) {
super(input);
}
/**
* Returns whether or not the new TokenStream APIs are used
* by default.
* (see {@link #incrementToken()}, {@link AttributeSource}).
*/
public static boolean useNewAPIDefault() {
return useNewAPIDefault;
}
/**
* Use this API to enable or disable the new TokenStream API
* by default. Can be overridden by calling {@link #setUseNewAPI(boolean)}.
* (see {@link #incrementToken()}, {@link AttributeSource}).
* <p>
* If set to true, the indexer will call {@link #incrementToken()}
* to consume Tokens from this stream.
* <p>
* If set to false, the indexer will call {@link #next(Token)}
* instead.
*/
public static void setUseNewAPIDefault(boolean use) {
useNewAPIDefault = use;
}
/**
* Returns whether or not the new TokenStream APIs are used
* for this stream.
* (see {@link #incrementToken()}, {@link AttributeSource}).
*/
public boolean useNewAPI() {
return useNewAPI;
}
/**
* Use this API to enable or disable the new TokenStream API
* for this stream. Overrides {@link #setUseNewAPIDefault(boolean)}.
* (see {@link #incrementToken()}, {@link AttributeSource}).
* <p>
* If set to true, the indexer will call {@link #incrementToken()}
* to consume Tokens from this stream.
* <p>
* If set to false, the indexer will call {@link #next(Token)}
* instead.
* <p>
* <b>NOTE: All streams and filters in one chain must use the
* same API. </b>
*/
public void setUseNewAPI(boolean use) {
useNewAPI = use;
}
/**
* Consumers (e. g. the indexer) use this method to advance the stream
* to the next token. Implementing classes must implement this method
* and update the appropriate {@link Attribute}s with content of the
* next token.
* <p>
* This method is called for every token of a document, so an efficient
* implementation is crucial for good performance. To avoid calls to
* {@link #addAttribute(Class)} and {@link #getAttribute(Class)} and
* downcasts, references to all {@link Attribute}s that this stream uses
* should be retrieved during instantiation.
* <p>
* To make sure that filters and consumers know which attributes are available
* the attributes must be added during instantiation. Filters and
* consumers are not required to check for availability of attributes in {@link #incrementToken()}.
*
* @return false for end of stream; true otherwise
*
* <p>
* <b>Note that this method will be defined abstract in Lucene 3.0.</b>
*/
public boolean incrementToken() throws IOException {
// subclasses must implement this method; will be made abstract in Lucene 3.0
return false;
}
/** Returns the next token in the stream, or null at EOS.
* @deprecated The returned Token is a "full private copy" (not
@ -84,6 +214,8 @@ public abstract class TokenStream {
* is not required to check for null before using it, but it is a
* good idea to assert that it is not null.)
* @return next token in the stream or null if end-of-stream was hit
* @deprecated The new {@link #incrementToken()} and {@link AttributeSource}
* APIs should be used instead. See also {@link #useNewAPI()}.
*/
public Token next(final Token reusableToken) throws IOException {
// We don't actually use inputToken, but still add this assert
@ -107,4 +239,25 @@ public abstract class TokenStream {
/** Releases resources associated with this stream. */
public void close() throws IOException {}
/**
 * Returns the string forms of this stream's attributes, separated by commas
 * and enclosed in parentheses.
 */
public String toString() {
  StringBuffer out = new StringBuffer();
  out.append('(');
  if (hasAttributes()) {
    // TODO Java 1.5: iterate with a typed Iterator<Attribute>
    boolean first = true;
    for (Iterator it = getAttributesIterator(); it.hasNext();) {
      if (!first) {
        out.append(',');
      }
      first = false;
      out.append(it.next().toString());
    }
  }
  out.append(')');
  return out.toString();
}
}

View File

@ -24,12 +24,23 @@ import java.io.IOException;
<p>
This is an abstract class.
<p>
NOTE: subclasses must override {@link #next(Token)}. It's
also OK to instead override {@link #next()} but that
method is now deprecated in favor of {@link #next(Token)}.
<b>NOTE:</b> In order to enable the new API the method
{@link #setUseNewAPI(boolean)} has to be called with use=true.
Otherwise the deprecated method {@link #next(Token)} will
be used by Lucene consumers (indexer and queryparser) to
consume the tokens. {@link #next(Token)} will be removed
in Lucene 3.0.
<p>
NOTE: To use the old API subclasses must override {@link #next(Token)}.
It's also OK to instead override {@link #next()} but that
method is slower compared to {@link #next(Token)}.
<p>
NOTE: subclasses overriding {@link #next(Token)} must
call {@link Token#clear()}.
* <p><font color="#FF0000">
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
* We will make our best efforts to keep the APIs backwards-compatible.</font>
*/
public abstract class Tokenizer extends TokenStream {

View File

@ -35,8 +35,7 @@ application using Lucene to use an appropriate <i>Parser</i> to convert the orig
<h2>Tokenization</h2>
<p>
Plain text passed to Lucene for indexing goes through a process generally called tokenization &ndash; namely breaking of the
input text into small indexing elements &ndash;
{@link org.apache.lucene.analysis.Token Tokens}.
input text into small indexing elements &ndash; tokens.
The way input text is broken into tokens very
much dictates further capabilities of search upon that text.
For instance, sentences beginnings and endings can be identified to provide for more accurate phrase
@ -72,12 +71,13 @@ providing for several functions, including (but not limited to):
<li>{@link org.apache.lucene.analysis.Analyzer} &ndash; An Analyzer is responsible for building a {@link org.apache.lucene.analysis.TokenStream} which can be consumed
by the indexing and searching processes. See below for more information on implementing your own Analyzer.</li>
<li>{@link org.apache.lucene.analysis.Tokenizer} &ndash; A Tokenizer is a {@link org.apache.lucene.analysis.TokenStream} and is responsible for breaking
up incoming text into {@link org.apache.lucene.analysis.Token}s. In most cases, an Analyzer will use a Tokenizer as the first step in
up incoming text into tokens. In most cases, an Analyzer will use a Tokenizer as the first step in
the analysis process.</li>
<li>{@link org.apache.lucene.analysis.TokenFilter} &ndash; A TokenFilter is also a {@link org.apache.lucene.analysis.TokenStream} and is responsible
for modifying {@link org.apache.lucene.analysis.Token}s that have been created by the Tokenizer. Common modifications performed by a
for modifying tokens that have been created by the Tokenizer. Common modifications performed by a
TokenFilter are: deletion, stemming, synonym injection, and down casing. Not all Analyzers require TokenFilters</li>
</ul>
<b>Since Lucene 2.9 the TokenStream API was changed. Please see section "New TokenStream API" below for details.</b>
</p>
<h2>Hints, Tips and Traps</h2>
<p>
@ -140,9 +140,8 @@ providing for several functions, including (but not limited to):
<PRE>
Analyzer analyzer = new StandardAnalyzer(); // or any other analyzer
TokenStream ts = analyzer.tokenStream("myfield",new StringReader("some text goes here"));
Token t = ts.next();
while (t!=null) {
System.out.println("token: "+t));
while (ts.incrementToken()) {
System.out.println("token: "+ts);
t = ts.next();
}
</PRE>
@ -179,7 +178,7 @@ the source code of any one of the many samples located in this package.
<p>
The following sections discuss some aspects of implementing your own analyzer.
</p>
<h3>Field Section Boundaries</h2>
<h3>Field Section Boundaries</h3>
<p>
When {@link org.apache.lucene.document.Document#add(org.apache.lucene.document.Fieldable) document.add(field)}
is called multiple times for the same field name, we could say that each such call creates a new
@ -208,10 +207,10 @@ the source code of any one of the many samples located in this package.
};
</PRE>
</p>
<h3>Token Position Increments</h2>
<h3>Token Position Increments</h3>
<p>
By default, all tokens created by Analyzers and Tokenizers have a
{@link org.apache.lucene.analysis.Token#getPositionIncrement() position increment} of one.
{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#getPositionIncrement() position increment} of one.
This means that the position stored for that token in the index would be one more than
that of the previous token.
Recall that phrase and proximity searches rely on position info.
@ -227,26 +226,29 @@ the source code of any one of the many samples located in this package.
If this behavior does not fit the application needs,
a modified analyzer can be used, that would increment further the positions of
tokens following a removed stop word, using
{@link org.apache.lucene.analysis.Token#setPositionIncrement(int)}.
{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#setPositionIncrement(int)}.
This can be done with something like:
<PRE>
public TokenStream tokenStream(final String fieldName, Reader reader) {
final TokenStream ts = someAnalyzer.tokenStream(fieldName, reader);
TokenStream res = new TokenStream() {
public Token next() throws IOException {
TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
public boolean incrementToken() throws IOException {
int extraIncrement = 0;
while (true) {
Token t = ts.next();
if (t!=null) {
if (stopWords.contains(t.termText())) {
boolean hasNext = ts.incrementToken();
if (hasNext) {
if (stopWords.contains(termAtt.term())) {
extraIncrement++; // filter this word
continue;
}
if (extraIncrement>0) {
t.setPositionIncrement(t.getPositionIncrement()+extraIncrement);
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+extraIncrement);
}
}
return t;
return hasNext;
}
}
};
@ -268,5 +270,336 @@ the source code of any one of the many samples located in this package.
same position as that token, and so would they be seen by phrase and proximity searches.</li>
</ol>
</p>
<h2>New TokenStream API</h2>
<p>
With Lucene 2.9 we introduce a new TokenStream API. The old API used to produce Tokens. A Token
has getter and setter methods for different properties like positionIncrement and termText.
While this approach was sufficient for the default indexing format, it is not versatile enough for
Flexible Indexing, a term which summarizes the effort of making the Lucene indexer pluggable and extensible for custom
index formats.
</p>
<p>
A fully customizable indexer means that users will be able to store custom data structures on disk. Therefore an API
is necessary that can transport custom types of data from the documents to the indexer.
</p>
<h3>Attribute and AttributeSource</h3>
Lucene 2.9 therefore introduces a new pair of classes called {@link org.apache.lucene.util.Attribute} and
{@link org.apache.lucene.util.AttributeSource}. An Attribute serves as a
particular piece of information about a text token. For example, {@link org.apache.lucene.analysis.tokenattributes.TermAttribute}
contains the term text of a token, and {@link org.apache.lucene.analysis.tokenattributes.OffsetAttribute} contains the start and end character offsets of a token.
An AttributeSource is a collection of Attributes with a restriction: there may be only one instance of each attribute type. TokenStream now extends AttributeSource, which
means that one can add Attributes to a TokenStream. Since TokenFilter extends TokenStream, all filters are also
AttributeSources.
<p>
Lucene now provides six Attributes out of the box, which replace the variables the Token class has:
<ul>
<li>{@link org.apache.lucene.analysis.tokenattributes.TermAttribute}<p>The term text of a token.</p></li>
<li>{@link org.apache.lucene.analysis.tokenattributes.OffsetAttribute}<p>The start and end offset of token in characters.</p></li>
<li>{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute}<p>See above for detailed information about position increment.</p></li>
<li>{@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute}<p>The payload that a Token can optionally have.</p></li>
<li>{@link org.apache.lucene.analysis.tokenattributes.TypeAttribute}<p>The type of the token. Default is 'word'.</p></li>
<li>{@link org.apache.lucene.analysis.tokenattributes.FlagsAttribute}<p>Optional flags a token can have.</p></li>
</ul>
</p>
<h3>Using the new TokenStream API</h3>
There are a few important things to know in order to use the new API efficiently which are summarized here. You may want
to walk through the example below first and come back to this section afterwards.
<ol><li>
Please keep in mind that an AttributeSource can only have one instance of a particular Attribute. Furthermore, if
a chain of a TokenStream and multiple TokenFilters is used, then all TokenFilters in that chain share the Attributes
with the TokenStream.
</li>
<br>
<li>
Attribute instances are reused for all tokens of a document. Thus, a TokenStream/-Filter needs to update
the appropriate Attribute(s) in incrementToken(). The consumer, commonly the Lucene indexer, consumes the data in the
Attributes and then calls incrementToken() again until it returns false, which indicates that the end of the stream
was reached. This means that in each call of incrementToken() a TokenStream/-Filter can safely overwrite the data in
the Attribute instances.
</li>
<br>
<li>
For performance reasons a TokenStream/-Filter should add/get Attributes during instantiation; i.e., create an attribute in the
constructor and store references to it in an instance variable. Using an instance variable instead of calling addAttribute()/getAttribute()
in incrementToken() will avoid expensive casting and attribute lookups for every token in the document.
</li>
<br>
<li>
All methods in AttributeSource are idempotent, which means calling them multiple times always yields the same
result. This is especially important to know for addAttribute(). The method takes the <b>type</b> (<code>Class</code>)
of an Attribute as an argument and returns an <b>instance</b>. If an Attribute of the same type was previously added, then
the already existing instance is returned, otherwise a new instance is created and returned. Therefore TokenStreams/-Filters
can safely call addAttribute() with the same Attribute type multiple times.
</li></ol>
<h3>Example</h3>
In this example we will create a WhitespaceTokenizer and use a LengthFilter to suppress all words that
have two or fewer characters. The LengthFilter is part of the Lucene core and its implementation will be explained
here to illustrate the usage of the new TokenStream API.<br>
Then we will develop a custom Attribute, a PartOfSpeechAttribute, and add another filter to the chain which
utilizes the new custom attribute, and call it PartOfSpeechTaggingFilter.
<h4>Whitespace tokenization</h4>
<pre>
public class MyAnalyzer extends Analyzer {
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream stream = new WhitespaceTokenizer(reader);
return stream;
}
public static void main(String[] args) throws IOException {
// text to tokenize
final String text = "This is a demo of the new TokenStream API";
MyAnalyzer analyzer = new MyAnalyzer();
TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
// get the TermAttribute from the TokenStream
TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
// print all tokens until stream is exhausted
while (stream.incrementToken()) {
System.out.println(termAtt.term());
}
}
}
</pre>
In this easy example a simple white space tokenization is performed. In main() a loop consumes the stream and
prints the term text of the tokens by accessing the TermAttribute that the WhitespaceTokenizer provides.
Here is the output:
<pre>
This
is
a
demo
of
the
new
TokenStream
API
</pre>
<h4>Adding a LengthFilter</h4>
We want to suppress all tokens that have 2 or fewer characters. We can do that easily by adding a LengthFilter
to the chain. Only the tokenStream() method in our analyzer needs to be changed:
<pre>
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream stream = new WhitespaceTokenizer(reader);
stream = new LengthFilter(stream, 3, Integer.MAX_VALUE);
return stream;
}
</pre>
Note how now only words with 3 or more characters are contained in the output:
<pre>
This
demo
the
new
TokenStream
API
</pre>
Now let's take a look how the LengthFilter is implemented (it is part of Lucene's core):
<pre>
public final class LengthFilter extends TokenFilter {
final int min;
final int max;
private TermAttribute termAtt;
/**
* Build a filter that removes words that are too long or too
* short from the text.
*/
public LengthFilter(TokenStream in, int min, int max)
{
super(in);
this.min = min;
this.max = max;
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
/**
* Returns the next input Token whose term() is the right len
*/
public final boolean incrementToken() throws IOException
{
assert termAtt != null;
// return the first token whose length is within [min, max]
while (input.incrementToken()) {
int len = termAtt.termLength();
if (len >= min && len <= max) {
return true;
}
// note: else we ignore it but should we index each part of it?
}
// reached EOS -- return false
return false;
}
}
</pre>
The TermAttribute is added in the constructor and stored in the instance variable <code>termAtt</code>.
Remember that there can only be a single instance of TermAttribute in the chain, so in our example the
<code>addAttribute()</code> call in LengthFilter returns the TermAttribute that the WhitespaceTokenizer already added. The tokens
are retrieved from the input stream in the <code>incrementToken()</code> method. By looking at the term text
in the TermAttribute the length of the term can be determined and too short or too long tokens are skipped.
Note how <code>incrementToken()</code> can efficiently access the instance variable; no attribute lookup or downcasting
is necessary. The same is true for the consumer, which can simply use local references to the Attributes.
<h4>Adding a custom Attribute</h4>
Now we're going to implement our own custom Attribute for part-of-speech tagging and call it consequently
<code>PartOfSpeechAttribute</code>:
<pre>
public static enum PartOfSpeech {
Noun, Verb, Adjective, Adverb, Pronoun, Preposition, Conjunction, Article, Unknown
}
public static final class PartOfSpeechAttribute extends Attribute {
private PartOfSpeech pos = PartOfSpeech.Unknown;
public void setPartOfSpeech(PartOfSpeech pos) {
this.pos = pos;
}
public PartOfSpeech getPartOfSpeech() {
return pos;
}
public void clear() {
pos = PartOfSpeech.Unknown;
}
public void copyTo(Attribute target) {
((PartOfSpeechAttribute) target).pos = pos;
}
public boolean equals(Object other) {
if (other == this) {
return true;
}
if (other instanceof PartOfSpeechAttribute) {
return pos == ((PartOfSpeechAttribute) other).pos;
}
return false;
}
public int hashCode() {
return pos.ordinal();
}
public String toString() {
return "PartOfSpeech=" + pos;
}
}
</pre>
This is a simple Attribute that has only a single variable that stores the part-of-speech of a token. It extends the
new <code>Attribute</code> class and therefore implements its abstract methods <code>clear(), copyTo(), equals(), hashCode(), toString()</code>.
Now we need a TokenFilter that can set this new PartOfSpeechAttribute for each token. In this example we show a very naive filter
that tags every word with a leading upper-case letter as a 'Noun' and all other words as 'Unknown'.
<pre>
public static class PartOfSpeechTaggingFilter extends TokenFilter {
PartOfSpeechAttribute posAtt;
TermAttribute termAtt;
protected PartOfSpeechTaggingFilter(TokenStream input) {
super(input);
posAtt = (PartOfSpeechAttribute) addAttribute(PartOfSpeechAttribute.class);
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
public boolean incrementToken() throws IOException {
if (!input.incrementToken()) {return false;}
posAtt.setPartOfSpeech(determinePOS(termAtt.termBuffer(), 0, termAtt.termLength()));
return true;
}
// determine the part of speech for the given term
protected PartOfSpeech determinePOS(char[] term, int offset, int length) {
// naive implementation that tags every uppercased word as noun
if (length > 0 && Character.isUpperCase(term[0])) {
return PartOfSpeech.Noun;
}
return PartOfSpeech.Unknown;
}
}
</pre>
Just like the LengthFilter, this new filter accesses the attributes it needs in the constructor and
stores references in instance variables. Now we need to add the filter to the chain:
<pre>
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream stream = new WhitespaceTokenizer(reader);
stream = new LengthFilter(stream, 3, Integer.MAX_VALUE);
stream = new PartOfSpeechTaggingFilter(stream);
return stream;
}
</pre>
Now let's look at the output:
<pre>
This
demo
the
new
TokenStream
API
</pre>
Apparently it hasn't changed, which shows that adding a custom attribute to a TokenStream/Filter chain does not
affect any existing consumers, simply because they don't know the new Attribute. Now let's change the consumer
to make use of the new PartOfSpeechAttribute and print it out:
<pre>
public static void main(String[] args) throws IOException {
// text to tokenize
final String text = "This is a demo of the new TokenStream API";
MyAnalyzer analyzer = new MyAnalyzer();
TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
// get the TermAttribute from the TokenStream
TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
// get the PartOfSpeechAttribute from the TokenStream
PartOfSpeechAttribute posAtt = (PartOfSpeechAttribute) stream.getAttribute(PartOfSpeechAttribute.class);
// print all tokens until stream is exhausted
while (stream.incrementToken()) {
System.out.println(termAtt.term() + ": " + posAtt.getPartOfSpeech());
}
}
</pre>
The change that was made is to get the PartOfSpeechAttribute from the TokenStream and print out its contents in
the while loop that consumes the stream. Here is the new output:
<pre>
This: Noun
demo: Unknown
the: Unknown
new: Unknown
TokenStream: Noun
API: Noun
</pre>
Each word is now followed by its assigned PartOfSpeech tag. Of course this is a naive
part-of-speech tagging. The word 'This' should not even be tagged as noun; it is only spelled capitalized because it
is the first word of a sentence. Actually this is a good opportunity for an exercise. To practice the usage of the new
API the reader could now write an Attribute and TokenFilter that can specify for each word if it was the first token
of a sentence or not. Then the PartOfSpeechTaggingFilter can make use of this knowledge and only tag capitalized words
as nouns if not the first word of a sentence (we know, this is still not a correct behavior, but hey, it's a good exercise).
As a small hint, this is how the new Attribute class could begin:
<pre>
public class FirstTokenOfSentenceAttribute extends Attribute {
private boolean firstToken;
public void setFirstToken(boolean firstToken) {
this.firstToken = firstToken;
}
public boolean getFirstToken() {
return firstToken;
}
public void clear() {
firstToken = false;
}
...
</pre>
</body>
</html>

View File

@ -17,9 +17,11 @@ package org.apache.lucene.analysis.standard;
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/** Normalizes tokens extracted with {@link StandardTokenizer}. */
@ -29,15 +31,54 @@ public final class StandardFilter extends TokenFilter {
/** Construct filtering <i>in</i>. */
public StandardFilter(TokenStream in) {
super(in);
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
}
private static final String APOSTROPHE_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.APOSTROPHE];
private static final String ACRONYM_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
// this filter uses the type and term attributes
private TypeAttribute typeAtt;
private TermAttribute termAtt;
/**
 * Advances the input stream by one token and normalizes that token in place.
 * <p>Removes <tt>'s</tt> from the end of words.
 * <p>Removes dots from acronyms.
 *
 * @return <code>false</code> if the input stream has no more tokens,
 *         <code>true</code> otherwise
 */
public final boolean incrementToken() throws java.io.IOException {
  if (!input.incrementToken()) {
    return false;
  }

  final char[] chars = termAtt.termBuffer();
  final int length = termAtt.termLength();
  final String tokenType = typeAtt.type();

  // Token types are compared by reference against the constants of this class.
  boolean possessive = false;
  if (tokenType == APOSTROPHE_TYPE && length >= 2 && chars[length - 2] == '\'') {
    final char last = chars[length - 1];
    possessive = (last == 's' || last == 'S');
  }

  if (possessive) {
    // drop the trailing 's / 'S
    termAtt.setTermLength(length - 2);
  } else if (tokenType == ACRONYM_TYPE) {
    // compact the buffer in place, skipping every dot
    int kept = 0;
    for (int i = 0; i < length; i++) {
      final char c = chars[i];
      if (c != '.') {
        chars[kept] = c;
        kept++;
      }
    }
    termAtt.setTermLength(kept);
  }

  return true;
}
/** Returns the next token in the stream, or null at EOS.
* <p>Removes <tt>'s</tt> from the end of words.
* <p>Removes dots from acronyms.
* @deprecated
*/
public final Token next(final Token reusableToken) throws java.io.IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);

View File

@ -22,6 +22,10 @@ import java.io.Reader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/** A grammar-based tokenizer constructed with JFlex
*
@ -84,7 +88,7 @@ public class StandardTokenizer extends Tokenizer {
*
* @deprecated this should be removed in the next release (3.0).
*/
private boolean replaceInvalidAcronym = false;
private boolean replaceInvalidAcronym;
void setInput(Reader reader) {
this.input = reader;
@ -103,14 +107,13 @@ public class StandardTokenizer extends Tokenizer {
return maxTokenLength;
}
/**
* Creates a new instance of the {@link StandardTokenizer}. Attaches the
* <code>input</code> to a newly created JFlex scanner.
*/
public StandardTokenizer(Reader input) {
this.input = input;
this.scanner = new StandardTokenizerImpl(input);
}
/**
* Creates a new instance of the {@link StandardTokenizer}. Attaches the
* <code>input</code> to a newly created JFlex scanner.
*/
public StandardTokenizer(Reader input) {
this(input, false);
}
/**
* Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}. Attaches
@ -125,6 +128,60 @@ public class StandardTokenizer extends Tokenizer {
this.replaceInvalidAcronym = replaceInvalidAcronym;
this.input = input;
this.scanner = new StandardTokenizerImpl(input);
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
}
// this tokenizer generates four attributes:
// term, offset, positionIncrement and type
private TermAttribute termAtt;
private OffsetAttribute offsetAtt;
private PositionIncrementAttribute posIncrAtt;
private TypeAttribute typeAtt;
/**
 * Reads tokens from the scanner until one is found whose length does not
 * exceed <code>maxTokenLength</code>, then fills the term, position increment,
 * offset and type attributes with it. Every over-long token that is skipped
 * adds one to the position increment of the token that is finally returned.
 *
 * @return <code>false</code> once the scanner reports end of input,
 *         <code>true</code> if the attributes now describe a token
 * @see org.apache.lucene.analysis.TokenStream#incrementToken()
 */
public boolean incrementToken() throws IOException {
  int positionIncrement = 1;

  for (int tokenType = scanner.getNextToken();
       tokenType != StandardTokenizerImpl.YYEOF;
       tokenType = scanner.getNextToken()) {

    if (scanner.yylength() > maxTokenLength) {
      // When we skip a too-long term, we still increment the
      // position increment
      positionIncrement++;
      continue;
    }

    termAtt.clear();
    posIncrAtt.setPositionIncrement(positionIncrement);
    scanner.getText(termAtt);

    final int startOffset = scanner.yychar();
    offsetAtt.setStartOffset(startOffset);
    offsetAtt.setEndOffset(startOffset + termAtt.termLength());

    if (tokenType != StandardTokenizerImpl.ACRONYM_DEP) {
      typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
    } else if (replaceInvalidAcronym) {
      // The ACRONYM_DEP handling should be removed in the next release. For
      // now it converts invalid acronyms to HOST; once removed, only the
      // plain TOKEN_TYPES[tokenType] branch above should remain.
      typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
      termAtt.setTermLength(termAtt.termLength() - 1); // remove extra '.'
    } else {
      typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
    }
    return true;
  }

  // scanner returned YYEOF
  return false;
}
/*
@ -132,6 +189,7 @@ public class StandardTokenizer extends Tokenizer {
*
* @see org.apache.lucene.analysis.TokenStream#next()
*/
/** @deprecated */
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
int posIncr = 1;

View File

@ -30,6 +30,7 @@ NOTE: if you change this file and need to regenerate the tokenizer,
*/
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
@ -368,6 +369,13 @@ final void getText(Token t) {
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
/**
 * Copies the text of the current match (the buffer region from
 * <code>zzStartRead</code> up to <code>zzMarkedPos</code>) into the
 * given TermAttribute.
 */
final void getText(TermAttribute t) {
  final int matchLength = zzMarkedPos - zzStartRead;
  t.setTermBuffer(zzBuffer, zzStartRead, matchLength);
}
/**
* Creates a new scanner

View File

@ -29,6 +29,7 @@ NOTE: if you change StandardTokenizerImpl.jflex and need to regenerate
*/
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
%%
@ -69,6 +70,14 @@ public final int yychar()
final void getText(Token t) {
  // the current match spans zzStartRead (inclusive) to zzMarkedPos (exclusive)
  final int matchLength = zzMarkedPos - zzStartRead;
  t.setTermBuffer(zzBuffer, zzStartRead, matchLength);
}
/**
 * Copies the text of the current match (the buffer region from
 * <code>zzStartRead</code> up to <code>zzMarkedPos</code>) into the
 * given TermAttribute.
 */
final void getText(TermAttribute t) {
  final int matchLength = zzMarkedPos - zzStartRead;
  t.setTermBuffer(zzBuffer, zzStartRead, matchLength);
}
%}
THAI = [\u0E00-\u0E59]

View File

@ -0,0 +1,86 @@
package org.apache.lucene.analysis.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Serializable;
import org.apache.lucene.util.Attribute;
/**
* This attribute can be used to pass different flags down the tokenizer chain,
* e. g. from one TokenFilter to another one.
*
* <p><font color="#FF0000">
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
* We will make our best efforts to keep the APIs backwards-compatible.</font>
*/
public class FlagsAttribute extends Attribute implements Cloneable, Serializable {

  /** The flag bits; zero means that no bit is set. */
  private int flags = 0;

  /**
   * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
   * <p/>
   *
   * Returns the bits that have been set. This is completely distinct from
   * {@link TypeAttribute#type()}, although they do share similar purposes.
   * The flags can be used to encode information about the token for use by
   * other {@link org.apache.lucene.analysis.TokenFilter}s.
   *
   * @return The bits
   */
  public int getFlags() {
    return flags;
  }

  /**
   * Replaces the stored bits with the given value.
   *
   * @see #getFlags()
   */
  public void setFlags(int flags) {
    this.flags = flags;
  }

  /** Resets the flags to zero. */
  public void clear() {
    flags = 0;
  }

  /** Returns <code>flags=</code> followed by the numeric value of the flags. */
  public String toString() {
    return "flags=" + flags;
  }

  /** Two FlagsAttributes are equal if they hold the same flag bits. */
  public boolean equals(Object other) {
    if (this == other) {
      return true;
    }
    if (!(other instanceof FlagsAttribute)) {
      return false;
    }
    final FlagsAttribute that = (FlagsAttribute) other;
    return that.flags == flags;
  }

  /** The hash code is the flags value itself. */
  public int hashCode() {
    return flags;
  }

  /**
   * Copies the flags of this attribute into <code>target</code>, which must
   * be a FlagsAttribute.
   */
  public void copyTo(Attribute target) {
    final FlagsAttribute destination = (FlagsAttribute) target;
    destination.setFlags(flags);
  }
}

View File

@ -0,0 +1,98 @@
package org.apache.lucene.analysis.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Serializable;
import org.apache.lucene.util.Attribute;
/**
* The start and end character offset of a Token.
*
* <p><font color="#FF0000">
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
* We will make our best efforts to keep the APIs backwards-compatible.</font>
*/
public class OffsetAttribute extends Attribute implements Cloneable, Serializable {

  private int startOffset;
  private int endOffset;

  /** Returns this Token's starting offset, the position of the first character
      corresponding to this token in the source text.

      Note that the difference between endOffset() and startOffset() may not be
      equal to termText.length(), as the term text may have been altered by a
      stemmer or some other filter. */
  public int startOffset() {
    return startOffset;
  }

  /** Set the starting offset.
      @see #startOffset() */
  public void setStartOffset(int offset) {
    startOffset = offset;
  }

  /** Returns this Token's ending offset, one greater than the position of the
      last character corresponding to this token in the source text. The length
      of the token in the source text is (endOffset - startOffset). */
  public int endOffset() {
    return endOffset;
  }

  /** Set the ending offset.
      @see #endOffset() */
  public void setEndOffset(int offset) {
    endOffset = offset;
  }

  /** Resets both offsets to zero. */
  public void clear() {
    startOffset = 0;
    endOffset = 0;
  }

  /** Returns the offsets in the form <code>start=S,end=E</code>. */
  public String toString() {
    return "start=" + startOffset + ",end=" + endOffset;
  }

  /** Two OffsetAttributes are equal if both their start and end offsets match. */
  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }
    if (!(other instanceof OffsetAttribute)) {
      return false;
    }
    final OffsetAttribute that = (OffsetAttribute) other;
    return that.startOffset == startOffset && that.endOffset == endOffset;
  }

  /** Combines start and end offset into a single hash value. */
  public int hashCode() {
    return startOffset * 31 + endOffset;
  }

  /**
   * Copies start and end offset into <code>target</code>, which must be an
   * OffsetAttribute.
   */
  public void copyTo(Attribute target) {
    final OffsetAttribute destination = (OffsetAttribute) target;
    destination.setStartOffset(startOffset);
    destination.setEndOffset(endOffset);
  }
}

View File

@ -0,0 +1,109 @@
package org.apache.lucene.analysis.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Serializable;
import org.apache.lucene.index.Payload;
import org.apache.lucene.util.Attribute;
/**
* The payload of a Token. See also {@link Payload}.
*
* <p><font color="#FF0000">
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
* We will make our best efforts to keep the APIs backwards-compatible.</font>
*/
public class PayloadAttribute extends Attribute implements Cloneable, Serializable {

  /** The current payload; <code>null</code> when no payload is set. */
  private Payload payload;

  /**
   * Initialize this attribute with no payload.
   */
  public PayloadAttribute() {}

  /**
   * Initialize this attribute with the given payload.
   */
  public PayloadAttribute(Payload payload) {
    this.payload = payload;
  }

  /**
   * Returns this Token's payload.
   */
  public Payload getPayload() {
    return payload;
  }

  /**
   * Sets this Token's payload.
   */
  public void setPayload(Payload payload) {
    this.payload = payload;
  }

  /** Removes the payload by setting it to <code>null</code>. */
  public void clear() {
    payload = null;
  }

  /** Returns <code>payload=</code> followed by the payload's string form, or by <code>null</code>. */
  public String toString() {
    return (payload == null) ? "payload=null" : "payload=" + payload.toString();
  }

  /**
   * Returns a copy obtained from <code>super.clone()</code>; a non-null
   * payload is cloned as well, so the copy does not share it with this instance.
   */
  public Object clone() {
    final PayloadAttribute copy = (PayloadAttribute) super.clone();
    if (payload != null) {
      copy.payload = (Payload) payload.clone();
    }
    return copy;
  }

  /**
   * Two PayloadAttributes are equal if both have no payload or if their
   * payloads are equal.
   */
  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }
    if (!(other instanceof PayloadAttribute)) {
      return false;
    }
    final Payload theirs = ((PayloadAttribute) other).payload;
    if (theirs == null || payload == null) {
      // equal only when both sides are null
      return theirs == payload;
    }
    return theirs.equals(payload);
  }

  /** Returns zero when there is no payload, otherwise the payload's hash code. */
  public int hashCode() {
    if (payload == null) {
      return 0;
    }
    return payload.hashCode();
  }

  /**
   * Hands a clone of the payload (or <code>null</code>) to
   * <code>target</code>, which must be a PayloadAttribute.
   */
  public void copyTo(Attribute target) {
    final PayloadAttribute destination = (PayloadAttribute) target;
    final Payload copy = (payload == null) ? null : (Payload) payload.clone();
    destination.setPayload(copy);
  }
}

View File

@ -0,0 +1,106 @@
package org.apache.lucene.analysis.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Serializable;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Attribute;
/** The positionIncrement determines the position of this token
* relative to the previous Token in a {@link TokenStream}, used in phrase
* searching.
*
* <p>The default value is one.
*
* <p>Some common uses for this are:<ul>
*
* <li>Set it to zero to put multiple terms in the same position. This is
* useful if, e.g., a word has multiple stems. Searches for phrases
* including either stem will match. In this case, all but the first stem's
* increment should be set to zero: the increment of the first instance
* should be one. Repeating a token with an increment of zero can also be
* used to boost the scores of matches on that token.
*
* <li>Set it to values greater than one to inhibit exact phrase matches.
* If, for example, one does not want phrases to match across removed stop
* words, then one could build a stop word filter that removes stop words and
* also sets the increment to the number of stop words removed before each
* non-stop word. Then exact phrase queries will only match when the terms
* occur with no intervening stop words.
*
* </ul>
*
* <p><font color="#FF0000">
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
* We will make our best efforts to keep the APIs backwards-compatible.</font>
*
* @see org.apache.lucene.index.TermPositions
*/
public class PositionIncrementAttribute extends Attribute implements Cloneable, Serializable {

  /** Distance from the prior term; initialized to one. */
  private int positionIncrement = 1;

  /** Set the position increment. The default value is one.
   *
   * @param positionIncrement the distance from the prior term
   * @throws IllegalArgumentException if <code>positionIncrement</code> is negative
   */
  public void setPositionIncrement(int positionIncrement) {
    final boolean negative = positionIncrement < 0;
    if (negative) {
      throw new IllegalArgumentException(
          "Increment must be zero or greater: " + positionIncrement);
    }
    this.positionIncrement = positionIncrement;
  }

  /** Returns the position increment of this Token.
   * @see #setPositionIncrement
   */
  public int getPositionIncrement() {
    return positionIncrement;
  }

  /** Resets the position increment to one. */
  public void clear() {
    positionIncrement = 1;
  }

  /** Returns <code>positionIncrement=</code> followed by the current value. */
  public String toString() {
    return "positionIncrement=" + positionIncrement;
  }

  /** Two PositionIncrementAttributes are equal if their increments match. */
  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }
    if (!(other instanceof PositionIncrementAttribute)) {
      return false;
    }
    final PositionIncrementAttribute that = (PositionIncrementAttribute) other;
    return positionIncrement == that.positionIncrement;
  }

  /** The hash code is the position increment itself. */
  public int hashCode() {
    return positionIncrement;
  }

  /**
   * Copies the position increment into <code>target</code>, which must be a
   * PositionIncrementAttribute.
   */
  public void copyTo(Attribute target) {
    final PositionIncrementAttribute destination = (PositionIncrementAttribute) target;
    destination.setPositionIncrement(positionIncrement);
  }
}

View File

@ -0,0 +1,242 @@
package org.apache.lucene.analysis.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Serializable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Attribute;
/**
 * The term text of a Token.
 *
 * <p>The text is kept in a reusable <code>char[]</code> buffer together with the
 * number of valid characters in it. The buffer is allocated lazily.
 *
 * <p><font color="#FF0000">
 * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
 * The APIs introduced in these classes with Lucene 2.9 might change in the future.
 * We will make our best efforts to keep the APIs backwards-compatible.</font>
 */
public class TermAttribute extends Attribute implements Cloneable, Serializable {
  /** Smallest capacity ever allocated for the term buffer. */
  private static final int MIN_BUFFER_SIZE = 10;

  /** Backing storage for the term; null until first needed. */
  private char[] termBuffer;
  /** Number of valid characters at the start of {@link #termBuffer}. */
  private int termLength;

  /** Returns the Token's term text.
   *
   * This method has a performance penalty
   * because the text is stored internally in a char[].  If
   * possible, use {@link #termBuffer()} and {@link
   * #termLength()} directly instead.  If you really need a
   * String, use this method, which is nothing more than
   * a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b>
   */
  public String term() {
    initTermBuffer();
    return new String(termBuffer, 0, termLength);
  }

  /** Copies the contents of buffer, starting at offset for
   *  length characters, into the termBuffer array.
   *  @param buffer the buffer to copy
   *  @param offset the index in the buffer of the first character to copy
   *  @param length the number of characters to copy
   */
  public void setTermBuffer(char[] buffer, int offset, int length) {
    char[] newCharBuffer = growTermBuffer(length);
    if (newCharBuffer != null) {
      // old content is about to be overwritten, so it is not preserved
      termBuffer = newCharBuffer;
    }
    System.arraycopy(buffer, offset, termBuffer, 0, length);
    termLength = length;
  }

  /** Copies the contents of buffer into the termBuffer array.
   *  @param buffer the buffer to copy
   */
  public void setTermBuffer(String buffer) {
    int length = buffer.length();
    char[] newCharBuffer = growTermBuffer(length);
    if (newCharBuffer != null) {
      termBuffer = newCharBuffer;
    }
    buffer.getChars(0, length, termBuffer, 0);
    termLength = length;
  }

  /** Copies the contents of buffer, starting at offset and continuing
   *  for length characters, into the termBuffer array.
   *  @param buffer the buffer to copy
   *  @param offset the index in the buffer of the first character to copy
   *  @param length the number of characters to copy
   */
  public void setTermBuffer(String buffer, int offset, int length) {
    assert offset <= buffer.length();
    assert offset + length <= buffer.length();
    char[] newCharBuffer = growTermBuffer(length);
    if (newCharBuffer != null) {
      termBuffer = newCharBuffer;
    }
    buffer.getChars(offset, offset + length, termBuffer, 0);
    termLength = length;
  }

  /** Returns the internal termBuffer character array which
   *  you can then directly alter.  If the array is too
   *  small for your token, use {@link
   *  #resizeTermBuffer(int)} to increase it.  After
   *  altering the buffer be sure to call {@link
   *  #setTermLength} to record the number of valid
   *  characters that were placed into the termBuffer. */
  public char[] termBuffer() {
    initTermBuffer();
    return termBuffer;
  }

  /** Grows the termBuffer to at least size newSize, preserving the
   *  existing content. Note: If the next operation is to change
   *  the contents of the term buffer use
   *  {@link #setTermBuffer(char[], int, int)},
   *  {@link #setTermBuffer(String)}, or
   *  {@link #setTermBuffer(String, int, int)}
   *  to optimally combine the resize with the setting of the termBuffer.
   *  @param newSize minimum size of the new termBuffer
   *  @return the termBuffer, whose length is now >= newSize
   */
  public char[] resizeTermBuffer(int newSize) {
    char[] newCharBuffer = growTermBuffer(newSize);
    if (termBuffer == null) {
      // Nothing to preserve yet.
      // Note that if termBuffer is null then newCharBuffer cannot be null.
      assert newCharBuffer != null;
      termBuffer = newCharBuffer;
    } else if (newCharBuffer != null) {
      // newCharBuffer != null means the existing buffer was too small:
      // carry the old content over into the larger array.
      System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
      termBuffer = newCharBuffer;
    }
    return termBuffer;
  }

  /** Allocates a buffer char[] of at least newSize
   *  @param newSize minimum size of the buffer
   *  @return newly created buffer with length >= newSize or null if the current termBuffer is big enough
   */
  private char[] growTermBuffer(int newSize) {
    if (termBuffer != null) {
      if (termBuffer.length >= newSize) {
        // Already big enough
        return null;
      }
      // Not big enough; create a new array with slight
      // over allocation:
      return new char[ArrayUtil.getNextSize(newSize)];
    }
    // No buffer yet: allocate exactly what was asked for,
    // but never less than MIN_BUFFER_SIZE.
    if (newSize < MIN_BUFFER_SIZE) {
      newSize = MIN_BUFFER_SIZE;
    }
    return new char[newSize];
  }

  /** Lazily allocates an empty term buffer of MIN_BUFFER_SIZE chars if none exists. */
  private void initTermBuffer() {
    if (termBuffer == null) {
      termBuffer = new char[MIN_BUFFER_SIZE];
      termLength = 0;
    }
  }

  /** Return number of valid characters (length of the term)
   *  in the termBuffer array. */
  public int termLength() {
    initTermBuffer();
    return termLength;
  }

  /** Set number of valid characters (length of the term) in
   *  the termBuffer array. Use this to truncate the termBuffer
   *  or to synchronize with external manipulation of the termBuffer.
   *  Note: to grow the size of the array,
   *  use {@link #resizeTermBuffer(int)} first.
   *  @param length the truncated length
   *  @throws IllegalArgumentException if length exceeds the size of the termBuffer
   */
  public void setTermLength(int length) {
    initTermBuffer();
    if (length > termBuffer.length) {
      throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")");
    }
    termLength = length;
  }

  /** Hash over the term length and the valid characters only. */
  public int hashCode() {
    initTermBuffer();
    int code = termLength;
    code = code * 31 + ArrayUtil.hashCode(termBuffer, 0, termLength);
    return code;
  }

  /** Empties the term; the buffer itself is kept for reuse. */
  public void clear() {
    termLength = 0;
  }

  /** Returns a deep copy: the clone gets its own copy of the term buffer. */
  public Object clone() {
    TermAttribute t = (TermAttribute) super.clone();
    // Do a deep clone
    if (termBuffer != null) {
      t.termBuffer = (char[]) termBuffer.clone();
    }
    return t;
  }

  /**
   * Two TermAttributes are equal when their terms have the same length and
   * the same characters; unused buffer capacity is ignored.
   */
  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }
    if (other instanceof TermAttribute) {
      initTermBuffer();
      TermAttribute o = (TermAttribute) other;
      o.initTermBuffer();

      // Lengths must match first: otherwise a term would equal any longer
      // term it is a prefix of, and the loop below could read stale
      // characters (or run past the end) of the other buffer.
      if (termLength != o.termLength) {
        return false;
      }
      for (int i = 0; i < termLength; i++) {
        if (termBuffer[i] != o.termBuffer[i]) {
          return false;
        }
      }
      return true;
    }
    return false;
  }

  public String toString() {
    initTermBuffer();
    return "term=" + new String(termBuffer, 0, termLength);
  }

  /**
   * Copies this term into <code>target</code>, which must be a TermAttribute
   * (it is cast unconditionally).
   */
  public void copyTo(Attribute target) {
    // Make sure there is a buffer to copy from; without this a
    // never-initialised attribute would hand a null array to arraycopy.
    initTermBuffer();
    TermAttribute t = (TermAttribute) target;
    t.setTermBuffer(termBuffer, 0, termLength);
  }
}

View File

@ -0,0 +1,83 @@
package org.apache.lucene.analysis.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Serializable;
import org.apache.lucene.util.Attribute;
/**
 * A Token's lexical type. The Default value is "word".
 *
 * <p><font color="#FF0000">
 * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
 * The APIs introduced in these classes with Lucene 2.9 might change in the future.
 * We will make our best efforts to keep the APIs backwards-compatible.</font>
 */
public class TypeAttribute extends Attribute implements Cloneable, Serializable {
  private String type;
  public static final String DEFAULT_TYPE = "word";

  /** Creates an attribute with the default type {@link #DEFAULT_TYPE}. */
  public TypeAttribute() {
    this(DEFAULT_TYPE);
  }

  /** Creates an attribute with the given lexical type. */
  public TypeAttribute(String type) {
    this.type = type;
  }

  /** Returns this Token's lexical type.  Defaults to "word". */
  public String type() {
    return type;
  }

  /** Set the lexical type.
      @see #type() */
  public void setType(String type) {
    this.type = type;
  }

  /** Resets the type to {@link #DEFAULT_TYPE}. */
  public void clear() {
    type = DEFAULT_TYPE;
  }

  public String toString() {
    return "type=" + type;
  }

  /**
   * Two TypeAttributes are equal when their types are equal. A null type
   * (which the setter and constructor do not forbid) only equals another null type.
   */
  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }
    if (other instanceof TypeAttribute) {
      final String otherType = ((TypeAttribute) other).type;
      return type == null ? otherType == null : type.equals(otherType);
    }
    return false;
  }

  /** Hash of the type string; zero when the type is null. */
  public int hashCode() {
    return type == null ? 0 : type.hashCode();
  }

  /**
   * Copies the type into <code>target</code>, which must be a TypeAttribute
   * (it is cast unconditionally).
   */
  public void copyTo(Attribute target) {
    TypeAttribute t = (TypeAttribute) target;
    // Strings are immutable, so the reference can be shared directly.
    t.setType(type);
  }
}

View File

@ -17,12 +17,14 @@ package org.apache.lucene.index;
* limitations under the License.
*/
import java.util.Map;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Collection;
import java.util.Iterator;
import java.io.IOException;
import java.util.Map;
import org.apache.lucene.util.AttributeSource;
/** This is a DocFieldConsumer that inverts each field,
* separately, from a Document, and accepts a

View File

@ -22,6 +22,8 @@ import java.io.Reader;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
* Holds state for inverting all occurrences of a single
@ -79,10 +81,14 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
if (!field.isTokenized()) { // un-tokenized field
String stringValue = field.stringValue();
final int valueLength = stringValue.length();
Token token = perThread.localToken.reinit(stringValue, 0, valueLength);
perThread.singleTokenTokenStream.reinit(stringValue, 0, valueLength);
fieldState.attributeSource = perThread.singleTokenTokenStream;
perThread.localTokenStream.reset();
consumer.start(field);
boolean success = false;
try {
consumer.add(token);
consumer.add();
success = true;
} finally {
if (!success)
@ -122,7 +128,22 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
try {
int offsetEnd = fieldState.offset-1;
final Token localToken = perThread.localToken;
boolean useNewTokenStreamAPI = stream.useNewAPI();
Token localToken = null;
if (useNewTokenStreamAPI) {
fieldState.attributeSource = stream;
} else {
fieldState.attributeSource = perThread.localTokenStream;
localToken = perThread.localToken;
}
consumer.start(field);
OffsetAttribute offsetAttribute = (OffsetAttribute) fieldState.attributeSource.addAttribute(OffsetAttribute.class);
PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);
for(;;) {
// If we hit an exception in stream.next below
@ -131,10 +152,16 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
// non-aborting and (above) this one document
// will be marked as deleted, but still
// consume a docID
Token token = stream.next(localToken);
Token token = null;
if (useNewTokenStreamAPI) {
if (!stream.incrementToken()) break;
} else {
token = stream.next(localToken);
if (token == null) break;
perThread.localTokenStream.set(token);
}
if (token == null) break;
final int posIncr = token.getPositionIncrement();
final int posIncr = posIncrAttribute.getPositionIncrement();
fieldState.position += posIncr - 1;
if (posIncr == 0)
fieldState.numOverlap++;
@ -147,14 +174,14 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
// internal state of the consumer is now
// corrupt and should not be flushed to a
// new segment:
consumer.add(token);
consumer.add();
success = true;
} finally {
if (!success)
docState.docWriter.setAborting();
}
fieldState.position++;
offsetEnd = fieldState.offset + token.endOffset();
offsetEnd = fieldState.offset + offsetAttribute.endOffset();
if (++fieldState.length >= maxFieldLength) {
if (docState.infoStream != null)
docState.infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens");

View File

@ -20,6 +20,14 @@ package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Attribute;
/** This is a DocFieldConsumer that inverts each field,
* separately, from a Document, and accepts a
@ -30,6 +38,94 @@ final class DocInverterPerThread extends DocFieldConsumerPerThread {
final InvertedDocConsumerPerThread consumer;
final InvertedDocEndConsumerPerThread endConsumer;
final Token localToken = new Token();
//TODO: change to SingleTokenTokenStream after Token was removed
final SingleTokenTokenStream singleTokenTokenStream = new SingleTokenTokenStream();
final BackwardsCompatibilityStream localTokenStream = new BackwardsCompatibilityStream();
/**
 * Stream holding a term attribute and an offset attribute that are
 * (re)filled through {@link #reinit(String, int, int)}.
 */
static class SingleTokenTokenStream extends TokenStream {
  TermAttribute termAttribute;
  OffsetAttribute offsetAttribute;

  /** Registers the term and offset attributes and keeps references to them. */
  SingleTokenTokenStream() {
    this.termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
    this.offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  }

  /**
   * Overwrites the attributes with a new term and its offsets.
   *
   * @param stringValue the complete term text
   * @param startOffset start offset to record
   * @param endOffset end offset to record
   */
  public void reinit(String stringValue, int startOffset, int endOffset) {
    this.termAttribute.setTermBuffer(stringValue);
    this.offsetAttribute.setStartOffset(startOffset);
    this.offsetAttribute.setEndOffset(endOffset);
  }
}
/**
 * This stream wrapper is only used to maintain backwards compatibility with the
 * old TokenStream API and can be removed in Lucene 3.0.
 *
 * <p>Each attribute registered here is an anonymous subclass whose getters read
 * from the {@link Token} most recently passed to {@link #set(Token)}. Only the
 * getters shown below are overridden.
 *
 * @deprecated
 */
static class BackwardsCompatibilityStream extends TokenStream {
  // The old-API token whose values the attribute getters below expose.
  private Token token;

  TermAttribute termAttribute = new TermAttribute() {
    public String term() {
      return token.term();
    }

    public char[] termBuffer() {
      return token.termBuffer();
    }

    public int termLength() {
      return token.termLength();
    }
  };

  OffsetAttribute offsetAttribute = new OffsetAttribute() {
    public int startOffset() {
      return token.startOffset();
    }

    public int endOffset() {
      return token.endOffset();
    }
  };

  PositionIncrementAttribute positionIncrementAttribute = new PositionIncrementAttribute() {
    public int getPositionIncrement() {
      return token.getPositionIncrement();
    }
  };

  FlagsAttribute flagsAttribute = new FlagsAttribute() {
    public int getFlags() {
      return token.getFlags();
    }
  };

  PayloadAttribute payloadAttribute = new PayloadAttribute() {
    public Payload getPayload() {
      return token.getPayload();
    }
  };

  TypeAttribute typeAttribute = new TypeAttribute() {
    public String type() {
      return token.type();
    }
  };

  /** Registers every delegating attribute under its attribute class. */
  BackwardsCompatibilityStream() {
    attributes.put(TermAttribute.class, termAttribute);
    attributes.put(OffsetAttribute.class, offsetAttribute);
    attributes.put(PositionIncrementAttribute.class, positionIncrementAttribute);
    attributes.put(FlagsAttribute.class, flagsAttribute);
    attributes.put(PayloadAttribute.class, payloadAttribute);
    attributes.put(TypeAttribute.class, typeAttribute);
  }

  /** Selects the token that the attribute getters read from. */
  public void set(Token token) {
    this.token = token;
  }
}
final DocumentsWriter.DocState docState;
final FieldInvertState fieldState = new FieldInvertState();

View File

@ -17,6 +17,7 @@
package org.apache.lucene.index;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.util.AttributeSource;
/**
* This class tracks the number and position / offset parameters of terms
@ -32,6 +33,7 @@ public final class FieldInvertState {
int numOverlap;
int offset;
float boost;
AttributeSource attributeSource;
public FieldInvertState() {
}
@ -54,6 +56,7 @@ public final class FieldInvertState {
numOverlap = 0;
offset = 0;
boost = docBoost;
attributeSource = null;
}
/**
@ -97,4 +100,8 @@ public final class FieldInvertState {
public float getBoost() {
return boost;
}
/**
 * Returns the attribute source currently assigned to this state;
 * null when none is set (the field is cleared to null on reset).
 */
public AttributeSource getAttributeSource() {
  return this.attributeSource;
}
}

View File

@ -19,7 +19,7 @@ package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
// TODO: break into separate freq and prox writers as
// codecs; make separate container (tii/tis/skip/*) that can
@ -32,6 +32,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
final DocumentsWriter.DocState docState;
final FieldInvertState fieldState;
boolean omitTf;
PayloadAttribute payloadAttribute;
public FreqProxTermsWriterPerField(TermsHashPerField termsHashPerField, FreqProxTermsWriterPerThread perThread, FieldInfo fieldInfo) {
this.termsHashPerField = termsHashPerField;
@ -53,7 +54,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
boolean hasPayloads;
void skippingLongTerm(Token t) throws IOException {}
void skippingLongTerm() throws IOException {}
public int compareTo(Object other0) {
FreqProxTermsWriterPerField other = (FreqProxTermsWriterPerField) other0;
@ -64,6 +65,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
// Record, up front, whether our in-RAM format will be
// with or without term freqs:
omitTf = fieldInfo.omitTf;
payloadAttribute = null;
}
boolean start(Fieldable[] fields, int count) {
@ -73,8 +75,22 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
return false;
}
final void writeProx(Token t, FreqProxTermsWriter.PostingList p, int proxCode) {
final Payload payload = t.getPayload();
/**
 * Looks up the payload attribute for the field instance about to be
 * processed. payloadAttribute is left null when the current attribute
 * source does not have one.
 */
void start(Fieldable f) {
  payloadAttribute = fieldState.attributeSource.hasAttribute(PayloadAttribute.class)
      ? (PayloadAttribute) fieldState.attributeSource.getAttribute(PayloadAttribute.class)
      : null;
}
final void writeProx(FreqProxTermsWriter.PostingList p, int proxCode) {
final Payload payload;
if (payloadAttribute == null) {
payload = null;
} else {
payload = payloadAttribute.getPayload();
}
if (payload != null && payload.length > 0) {
termsHashPerField.writeVInt(1, (proxCode<<1)|1);
termsHashPerField.writeVInt(1, payload.length);
@ -85,7 +101,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
p.lastPosition = fieldState.position;
}
final void newTerm(Token t, RawPostingList p0) {
final void newTerm(RawPostingList p0) {
// First time we're seeing this term since the last
// flush
assert docState.testPoint("FreqProxTermsWriterPerField.newTerm start");
@ -96,11 +112,11 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
} else {
p.lastDocCode = docState.docID << 1;
p.docFreq = 1;
writeProx(t, p, fieldState.position);
writeProx(p, fieldState.position);
}
}
final void addTerm(Token t, RawPostingList p0) {
final void addTerm(RawPostingList p0) {
assert docState.testPoint("FreqProxTermsWriterPerField.addTerm start");
@ -132,10 +148,10 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
p.docFreq = 1;
p.lastDocCode = (docState.docID - p.lastDocID) << 1;
p.lastDocID = docState.docID;
writeProx(t, p, fieldState.position);
writeProx(p, fieldState.position);
} else {
p.docFreq++;
writeProx(t, p, fieldState.position-p.lastPosition);
writeProx(p, fieldState.position-p.lastPosition);
}
}
}

View File

@ -17,10 +17,10 @@ package org.apache.lucene.index;
* limitations under the License.
*/
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.analysis.Token;
import java.io.IOException;
import org.apache.lucene.document.Fieldable;
abstract class InvertedDocConsumerPerField {
// Called once per field, and is given all Fieldable
@ -29,8 +29,11 @@ abstract class InvertedDocConsumerPerField {
// fields:
abstract boolean start(Fieldable[] fields, int count) throws IOException;
// Called before a field instance is being processed
abstract void start(Fieldable field);
// Called once per inverted token
abstract void add(Token token) throws IOException;
abstract void add() throws IOException;
// Called once per field per document, after all Fieldable
// occurrences are inverted

View File

@ -19,7 +19,6 @@ package org.apache.lucene.index;
import java.io.Serializable;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.ArrayUtil;
@ -29,7 +28,7 @@ import org.apache.lucene.util.ArrayUtil;
* specific term.
* <p>
* To store payloads in the index a {@link TokenStream} has to be used that
* produces {@link Token}s containing payload data.
* produces payload data.
* <p>
* Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)}
* to retrieve the payloads from the index.<br>

View File

@ -18,10 +18,11 @@ package org.apache.lucene.index;
*/
import java.io.IOException;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.UnicodeUtil;
final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
@ -37,6 +38,7 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
boolean doVectorOffsets;
int maxNumPostings;
OffsetAttribute offsetAttribute = null;
public TermVectorsTermsWriterPerField(TermsHashPerField termsHashPerField, TermVectorsTermsWriterPerThread perThread, FieldInfo fieldInfo) {
this.termsHashPerField = termsHashPerField;
@ -192,7 +194,15 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
maxNumPostings = 0;
}
void newTerm(Token t, RawPostingList p0) {
/**
 * Prepares offset access for the field instance about to be processed.
 *
 * When offsets are stored in the term vector, the OffsetAttribute is obtained
 * with addAttribute, so the attribute is created if the source does not have
 * it yet. A has/get lookup is not enough here: the inverter calls this method
 * before it adds the OffsetAttribute to the source, so the lookup could leave
 * offsetAttribute null while newTerm/addTerm dereference it whenever
 * doVectorOffsets is set.
 */
void start(Fieldable f) {
  if (doVectorOffsets) {
    offsetAttribute = (OffsetAttribute) fieldState.attributeSource.addAttribute(OffsetAttribute.class);
  } else {
    offsetAttribute = null;
  }
}
void newTerm(RawPostingList p0) {
assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start");
@ -201,8 +211,9 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
p.freq = 1;
if (doVectorOffsets) {
final int startOffset = fieldState.offset + t.startOffset();
final int endOffset = fieldState.offset + t.endOffset();
int startOffset = fieldState.offset + offsetAttribute.startOffset();;
int endOffset = fieldState.offset + offsetAttribute.endOffset();
termsHashPerField.writeVInt(1, startOffset);
termsHashPerField.writeVInt(1, endOffset - startOffset);
p.lastOffset = endOffset;
@ -214,7 +225,7 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
}
}
void addTerm(Token t, RawPostingList p0) {
void addTerm(RawPostingList p0) {
assert docState.testPoint("TermVectorsTermsWriterPerField.addTerm start");
@ -222,8 +233,9 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
p.freq++;
if (doVectorOffsets) {
final int startOffset = fieldState.offset + t.startOffset();
final int endOffset = fieldState.offset + t.endOffset();
int startOffset = fieldState.offset + offsetAttribute.startOffset();;
int endOffset = fieldState.offset + offsetAttribute.endOffset();
termsHashPerField.writeVInt(1, startOffset - p.lastOffset);
termsHashPerField.writeVInt(1, endOffset - startOffset);
p.lastOffset = endOffset;
@ -235,5 +247,5 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
}
}
void skippingLongTerm(Token t) {}
void skippingLongTerm() {}
}

View File

@ -23,14 +23,15 @@ package org.apache.lucene.index;
* multiple streams for each unique Token. */
import java.io.IOException;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.analysis.Token;
abstract class TermsHashConsumerPerField {
abstract boolean start(Fieldable[] fields, int count) throws IOException;
abstract void finish() throws IOException;
abstract void skippingLongTerm(Token t) throws IOException;
abstract void newTerm(Token t, RawPostingList p) throws IOException;
abstract void addTerm(Token t, RawPostingList p) throws IOException;
abstract void skippingLongTerm() throws IOException;
abstract void start(Fieldable field);
abstract void newTerm(RawPostingList p) throws IOException;
abstract void addTerm(RawPostingList p) throws IOException;
abstract int getStreamCount();
}

View File

@ -20,8 +20,8 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.util.UnicodeUtil;
final class TermsHashPerField extends InvertedDocConsumerPerField {
@ -31,6 +31,7 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
final TermsHashPerThread perThread;
final DocumentsWriter.DocState docState;
final FieldInvertState fieldState;
TermAttribute termAtt;
// Copied from our perThread
final CharBlockPool charPool;
@ -247,6 +248,14 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
private boolean doCall;
private boolean doNextCall;
/**
 * Called before a field instance is processed: fetches the term attribute
 * from the current attribute source, then forwards the call to the consumer
 * and, if there is one, to the next per-field hash in the chain.
 */
void start(Fieldable f) {
  termAtt = (TermAttribute) fieldState.attributeSource.getAttribute(TermAttribute.class);
  consumer.start(f);
  if (nextPerField == null) {
    return;
  }
  nextPerField.start(f);
}
boolean start(Fieldable[] fields, int count) throws IOException {
doCall = consumer.start(fields, count);
if (nextPerField != null)
@ -257,7 +266,7 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
// Secondary entry point (for 2nd & subsequent TermsHash),
// because token text has already been "interned" into
// textStart, so we hash by textStart
public void add(Token token, int textStart) throws IOException {
public void add(int textStart) throws IOException {
int code = textStart;
@ -320,17 +329,17 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
}
p.byteStart = intUptos[intUptoStart];
consumer.newTerm(token, p);
consumer.newTerm(p);
} else {
intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
consumer.addTerm(token, p);
consumer.addTerm(p);
}
}
// Primary entry point (for first TermsHash)
void add(Token token) throws IOException {
void add() throws IOException {
assert !postingsCompacted;
@ -338,8 +347,8 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
// term text into textStart address
// Get the text of this term.
final char[] tokenText = token.termBuffer();
final int tokenTextLen = token.termLength();
final char[] tokenText = termAtt.termBuffer();;
final int tokenTextLen = termAtt.termLength();
// Compute hashcode & replace any invalid UTF16 sequences
int downto = tokenTextLen;
@ -403,7 +412,7 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
if (docState.maxTermPrefix == null)
docState.maxTermPrefix = new String(tokenText, 0, 30);
consumer.skippingLongTerm(token);
consumer.skippingLongTerm();
return;
}
charPool.nextBuffer();
@ -450,16 +459,16 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
}
p.byteStart = intUptos[intUptoStart];
consumer.newTerm(token, p);
consumer.newTerm(p);
} else {
intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
consumer.addTerm(token, p);
consumer.addTerm(p);
}
if (doNextCall)
nextPerField.add(token, p.textStart);
nextPerField.add(p.textStart);
}
int[] intUptos;

View File

@ -3,8 +3,8 @@ package org.apache.lucene.queryParser;
import java.io.IOException;
import java.io.StringReader;
import java.text.DateFormat;
import java.text.Collator;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
@ -15,7 +15,10 @@ import java.util.Map;
import java.util.Vector;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.Term;
@ -518,48 +521,126 @@ public class QueryParser implements QueryParserConstants {
// PhraseQuery, or nothing based on the term count
TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
List list = new ArrayList();
final org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token();
org.apache.lucene.analysis.Token nextToken;
CachingTokenFilter buffer = new CachingTokenFilter(source);
TermAttribute termAtt = null;
PositionIncrementAttribute posIncrAtt = null;
int numTokens = 0;
org.apache.lucene.analysis.Token reusableToken = null;
org.apache.lucene.analysis.Token nextToken = null;
boolean useNewAPI = TokenStream.useNewAPIDefault();
if (useNewAPI) {
boolean success = false;
try {
buffer.reset();
success = true;
} catch (IOException e) {
// success==false if we hit an exception
}
if (success) {
if (buffer.hasAttribute(TermAttribute.class)) {
termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class);
}
if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class);
}
}
} else {
reusableToken = new org.apache.lucene.analysis.Token();
}
int positionCount = 0;
boolean severalTokensAtSamePosition = false;
while (true) {
try {
nextToken = source.next(reusableToken);
if (useNewAPI) {
if (termAtt != null) {
try {
while (buffer.incrementToken()) {
numTokens++;
int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
if (positionIncrement != 0) {
positionCount += positionIncrement;
} else {
severalTokensAtSamePosition = true;
}
}
} catch (IOException e) {
// ignore
}
}
catch (IOException e) {
nextToken = null;
} else {
while (true) {
try {
nextToken = buffer.next(reusableToken);
}
catch (IOException e) {
nextToken = null;
}
if (nextToken == null)
break;
numTokens++;
if (nextToken.getPositionIncrement() != 0)
positionCount += nextToken.getPositionIncrement();
else
severalTokensAtSamePosition = true;
}
if (nextToken == null)
break;
list.add(nextToken.clone());
if (nextToken.getPositionIncrement() != 0)
positionCount += nextToken.getPositionIncrement();
else
severalTokensAtSamePosition = true;
}
try {
// rewind the buffer stream
buffer.reset();
// close original stream - all tokens buffered
source.close();
}
catch (IOException e) {
// ignore
}
if (list.size() == 0)
if (numTokens == 0)
return null;
else if (list.size() == 1) {
nextToken = (org.apache.lucene.analysis.Token) list.get(0);
return newTermQuery(new Term(field, nextToken.term()));
else if (numTokens == 1) {
String term = null;
try {
if (useNewAPI) {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.term();
} else {
nextToken = buffer.next(reusableToken);
assert nextToken != null;
term = nextToken.term();
}
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
return newTermQuery(new Term(field, term));
} else {
if (severalTokensAtSamePosition) {
if (positionCount == 1) {
// no phrase query:
BooleanQuery q = newBooleanQuery(true);
for (int i = 0; i < list.size(); i++) {
nextToken = (org.apache.lucene.analysis.Token) list.get(i);
for (int i = 0; i < numTokens; i++) {
String term = null;
try {
if (useNewAPI) {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.term();
} else {
nextToken = buffer.next(reusableToken);
assert nextToken != null;
term = nextToken.term();
}
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
Query currentQuery = newTermQuery(
new Term(field, nextToken.term()));
new Term(field, term));
q.add(currentQuery, BooleanClause.Occur.SHOULD);
}
return q;
@ -570,9 +651,28 @@ public class QueryParser implements QueryParserConstants {
mpq.setSlop(phraseSlop);
List multiTerms = new ArrayList();
int position = -1;
for (int i = 0; i < list.size(); i++) {
nextToken = (org.apache.lucene.analysis.Token) list.get(i);
if (nextToken.getPositionIncrement() > 0 && multiTerms.size() > 0) {
for (int i = 0; i < numTokens; i++) {
String term = null;
int positionIncrement = 1;
try {
if (useNewAPI) {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.term();
if (posIncrAtt != null) {
positionIncrement = posIncrAtt.getPositionIncrement();
}
} else {
nextToken = buffer.next(reusableToken);
assert nextToken != null;
term = nextToken.term();
positionIncrement = nextToken.getPositionIncrement();
}
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
if (positionIncrement > 0 && multiTerms.size() > 0) {
if (enablePositionIncrements) {
mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
} else {
@ -580,8 +680,8 @@ public class QueryParser implements QueryParserConstants {
}
multiTerms.clear();
}
position += nextToken.getPositionIncrement();
multiTerms.add(new Term(field, nextToken.term()));
position += positionIncrement;
multiTerms.add(new Term(field, term));
}
if (enablePositionIncrements) {
mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
@ -595,13 +695,36 @@ public class QueryParser implements QueryParserConstants {
PhraseQuery pq = newPhraseQuery();
pq.setSlop(phraseSlop);
int position = -1;
for (int i = 0; i < list.size(); i++) {
nextToken = (org.apache.lucene.analysis.Token) list.get(i);
for (int i = 0; i < numTokens; i++) {
String term = null;
int positionIncrement = 1;
try {
if (useNewAPI) {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.term();
if (posIncrAtt != null) {
positionIncrement = posIncrAtt.getPositionIncrement();
}
} else {
nextToken = buffer.next(reusableToken);
assert nextToken != null;
term = nextToken.term();
positionIncrement = nextToken.getPositionIncrement();
}
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
if (enablePositionIncrements) {
position += nextToken.getPositionIncrement();
pq.add(new Term(field, nextToken.term()),position);
position += positionIncrement;
pq.add(new Term(field, term),position);
} else {
pq.add(new Term(field, nextToken.term()));
pq.add(new Term(field, term));
}
}
return pq;
@ -610,6 +733,7 @@ public class QueryParser implements QueryParserConstants {
}
/**
* Base implementation delegates to {@link #getFieldQuery(String,String)}.
* This method may be overridden, for example, to return
@ -1503,12 +1627,6 @@ public class QueryParser implements QueryParserConstants {
finally { jj_save(0, xla); }
}
private boolean jj_3R_3() {
if (jj_scan_token(STAR)) return true;
if (jj_scan_token(COLON)) return true;
return false;
}
private boolean jj_3R_2() {
if (jj_scan_token(TERM)) return true;
if (jj_scan_token(COLON)) return true;
@ -1525,6 +1643,12 @@ public class QueryParser implements QueryParserConstants {
return false;
}
private boolean jj_3R_3() {
if (jj_scan_token(STAR)) return true;
if (jj_scan_token(COLON)) return true;
return false;
}
/** Generated Token Manager. */
public QueryParserTokenManager token_source;
/** Current token. */

View File

@ -27,8 +27,8 @@ package org.apache.lucene.queryParser;
import java.io.IOException;
import java.io.StringReader;
import java.text.DateFormat;
import java.text.Collator;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
@ -39,7 +39,10 @@ import java.util.Map;
import java.util.Vector;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.Term;
@ -542,48 +545,126 @@ public class QueryParser {
// PhraseQuery, or nothing based on the term count
TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
List list = new ArrayList();
final org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token();
org.apache.lucene.analysis.Token nextToken;
CachingTokenFilter buffer = new CachingTokenFilter(source);
TermAttribute termAtt = null;
PositionIncrementAttribute posIncrAtt = null;
int numTokens = 0;
org.apache.lucene.analysis.Token reusableToken = null;
org.apache.lucene.analysis.Token nextToken = null;
boolean useNewAPI = TokenStream.useNewAPI();
if (useNewAPI) {
boolean success = false;
try {
buffer.start();
success = true;
} catch (IOException e) {
// success==false if we hit an exception
}
if (success) {
if (buffer.hasAttribute(TermAttribute.class)) {
termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class);
}
if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class);
}
}
} else {
reusableToken = new org.apache.lucene.analysis.Token();
}
int positionCount = 0;
boolean severalTokensAtSamePosition = false;
while (true) {
try {
nextToken = source.next(reusableToken);
if (useNewAPI) {
if (termAtt != null) {
try {
while (buffer.incrementToken()) {
numTokens++;
int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
if (positionIncrement != 0) {
positionCount += positionIncrement;
} else {
severalTokensAtSamePosition = true;
}
}
} catch (IOException e) {
// ignore
}
}
catch (IOException e) {
nextToken = null;
} else {
while (true) {
try {
nextToken = buffer.next(reusableToken);
}
catch (IOException e) {
nextToken = null;
}
if (nextToken == null)
break;
numTokens++;
if (nextToken.getPositionIncrement() != 0)
positionCount += nextToken.getPositionIncrement();
else
severalTokensAtSamePosition = true;
}
if (nextToken == null)
break;
list.add(nextToken.clone());
if (nextToken.getPositionIncrement() != 0)
positionCount += nextToken.getPositionIncrement();
else
severalTokensAtSamePosition = true;
}
try {
// rewind the buffer stream
buffer.reset();
// close original stream - all tokens buffered
source.close();
}
catch (IOException e) {
// ignore
}
if (list.size() == 0)
if (numTokens == 0)
return null;
else if (list.size() == 1) {
nextToken = (org.apache.lucene.analysis.Token) list.get(0);
return newTermQuery(new Term(field, nextToken.term()));
else if (numTokens == 1) {
String term = null;
try {
if (useNewAPI) {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.term();
} else {
nextToken = buffer.next(reusableToken);
assert nextToken != null;
term = nextToken.term();
}
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
return newTermQuery(new Term(field, term));
} else {
if (severalTokensAtSamePosition) {
if (positionCount == 1) {
// no phrase query:
BooleanQuery q = newBooleanQuery(true);
for (int i = 0; i < list.size(); i++) {
nextToken = (org.apache.lucene.analysis.Token) list.get(i);
for (int i = 0; i < numTokens; i++) {
String term = null;
try {
if (useNewAPI) {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.term();
} else {
nextToken = buffer.next(reusableToken);
assert nextToken != null;
term = nextToken.term();
}
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
Query currentQuery = newTermQuery(
new Term(field, nextToken.term()));
new Term(field, term));
q.add(currentQuery, BooleanClause.Occur.SHOULD);
}
return q;
@ -594,9 +675,28 @@ public class QueryParser {
mpq.setSlop(phraseSlop);
List multiTerms = new ArrayList();
int position = -1;
for (int i = 0; i < list.size(); i++) {
nextToken = (org.apache.lucene.analysis.Token) list.get(i);
if (nextToken.getPositionIncrement() > 0 && multiTerms.size() > 0) {
for (int i = 0; i < numTokens; i++) {
String term = null;
int positionIncrement = 1;
try {
if (useNewAPI) {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.term();
if (posIncrAtt != null) {
positionIncrement = posIncrAtt.getPositionIncrement();
}
} else {
nextToken = buffer.next(reusableToken);
assert nextToken != null;
term = nextToken.term();
positionIncrement = nextToken.getPositionIncrement();
}
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
if (positionIncrement > 0 && multiTerms.size() > 0) {
if (enablePositionIncrements) {
mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
} else {
@ -604,8 +704,8 @@ public class QueryParser {
}
multiTerms.clear();
}
position += nextToken.getPositionIncrement();
multiTerms.add(new Term(field, nextToken.term()));
position += positionIncrement;
multiTerms.add(new Term(field, term));
}
if (enablePositionIncrements) {
mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
@ -619,13 +719,36 @@ public class QueryParser {
PhraseQuery pq = newPhraseQuery();
pq.setSlop(phraseSlop);
int position = -1;
for (int i = 0; i < list.size(); i++) {
nextToken = (org.apache.lucene.analysis.Token) list.get(i);
for (int i = 0; i < numTokens; i++) {
String term = null;
int positionIncrement = 1;
try {
if (useNewAPI) {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.term();
if (posIncrAtt != null) {
positionIncrement = posIncrAtt.getPositionIncrement();
}
} else {
nextToken = buffer.next(reusableToken);
assert nextToken != null;
term = nextToken.term();
positionIncrement = nextToken.getPositionIncrement();
}
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
if (enablePositionIncrements) {
position += nextToken.getPositionIncrement();
pq.add(new Term(field, nextToken.term()),position);
position += positionIncrement;
pq.add(new Term(field, term),position);
} else {
pq.add(new Term(field, nextToken.term()));
pq.add(new Term(field, term));
}
}
return pq;
@ -634,6 +757,7 @@ public class QueryParser {
}
/**
* Base implementation delegates to {@link #getFieldQuery(String,String)}.
* This method may be overridden, for example, to return

View File

@ -2,8 +2,8 @@
package org.apache.lucene.queryParser;
import java.io.IOException;
import java.io.StringReader;
import java.text.DateFormat;
import java.text.Collator;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
@ -13,7 +13,10 @@ import java.util.Locale;
import java.util.Map;
import java.util.Vector;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.Term;

View File

@ -29,6 +29,7 @@ import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.TermFreqVector;
/**
@ -58,9 +59,17 @@ public class QueryTermVector implements TermFreqVector {
{
List terms = new ArrayList();
try {
final Token reusableToken = new Token();
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
terms.add(nextToken.term());
if (stream.useNewAPI()) {
stream.reset();
TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
while (stream.incrementToken()) {
terms.add(termAtt.term());
}
} else {
final Token reusableToken = new Token();
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
terms.add(nextToken.term());
}
}
processTerms((String[])terms.toArray(new String[terms.size()]));
} catch (IOException e) {

View File

@ -0,0 +1,95 @@
package org.apache.lucene.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Serializable;
/**
* Base class for Attributes that can be added to a
* {@link org.apache.lucene.util.AttributeSource}.
* <p>
* Attributes are used to add data in a dynamic, yet type-safe way to a source
* of usually streamed objects, e. g. a {@link org.apache.lucene.analysis.TokenStream}.
* <p><font color="#FF0000">
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
* We will make our best efforts to keep the APIs backwards-compatible.</font>
*/
public abstract class Attribute implements Cloneable, Serializable {

  /**
   * Resets every value held by this Attribute back to its default.
   */
  public abstract void clear();

  /**
   * Returns a textual representation of this Attribute's values.
   * Subclasses must implement this method and should use a format
   * along these lines:
   *
   * <pre>
   *   public String toString() {
   *     return "start=" + startOffset + ",end=" + endOffset;
   *   }
   * </pre>
   */
  public abstract String toString();

  /**
   * Subclasses must implement this method and should derive the
   * hash code from their values, for example:
   *
   * <pre>
   *   public int hashCode() {
   *     int code = startOffset;
   *     code = code * 31 + endOffset;
   *     return code;
   *   }
   * </pre>
   *
   * see also {@link #equals(Object)}
   */
  public abstract int hashCode();

  /**
   * Every value that contributes to {@link #hashCode()} should be
   * compared for equality here.
   *
   * see also {@link Object#equals(Object)}
   */
  public abstract boolean equals(Object other);

  /**
   * Copies the values of this Attribute into the given target
   * attribute. The target's type must match the type of this attribute.
   */
  public abstract void copyTo(Attribute target);

  /**
   * Shallow clone. Subclasses must override this method if any of
   * their members need to be cloned deeply.
   */
  public Object clone() {
    try {
      return super.clone();
    } catch (CloneNotSupportedException e) {
      // shouldn't happen: this class implements Cloneable
      throw new RuntimeException(e);
    }
  }
}

View File

@ -0,0 +1,274 @@
package org.apache.lucene.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
/**
* An AttributeSource contains a list of different {@link Attribute}s,
* and methods to add and get them. There can only be a single instance
* of an attribute in the same AttributeSource instance. This is ensured
* by passing in the actual type of the Attribute (Class&lt;Attribute&gt;) to
* the {@link #addAttribute(Class)}, which then checks if an instance of
* that type is already present. If yes, it returns the instance, otherwise
* it creates a new instance and returns it.
*
* <p><font color="#FF0000">
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
* We will make our best efforts to keep the APIs backwards-compatible.</font>
*/
public class AttributeSource {

  /**
   * An AttributeAcceptor defines only a single method {@link #accept(Class)}.
   * It can be used for e. g. buffering purposes to specify which attributes
   * to buffer.
   */
  public static abstract class AttributeAcceptor {
    /** Return true, to accept this attribute; false otherwise */
    public abstract boolean accept(Class attClass);
  }

  /**
   * Default AttributeAcceptor that accepts all attributes.
   */
  public static final AttributeAcceptor AllAcceptor = new AttributeAcceptor() {
    public boolean accept(Class attClass) {return true;}
  };

  /**
   * Holds the Class&lt;Attribute&gt; -> Attribute mapping.
   * A LinkedHashMap is used by the default constructor so that iteration
   * follows the order in which the attributes were added.
   */
  protected Map attributes;

  /** Creates an AttributeSource that does not contain any attributes yet. */
  public AttributeSource() {
    this.attributes = new LinkedHashMap();
  }

  /**
   * Creates an AttributeSource that uses the very same attribute map
   * as the passed-in one. The map is shared, not copied, so attributes added
   * to either instance are visible through both.
   */
  public AttributeSource(AttributeSource input) {
    this.attributes = input.attributes;
  }

  /** Returns an iterator that iterates the attributes
   * in the same order they were added in.
   */
  public Iterator getAttributesIterator() {
    return attributes.values().iterator();
  }

  /**
   * The caller must pass in a Class&lt;? extends Attribute&gt; value.
   * This method first checks if an instance of that class is
   * already in this AttributeSource and returns it. Otherwise a
   * new instance is created, added to this AttributeSource and returned.
   *
   * @throws IllegalArgumentException if the class cannot be instantiated
   *         through its no-argument constructor; the underlying exception
   *         is available as the cause
   */
  public Attribute addAttribute(Class attClass) {
    Attribute att = (Attribute) attributes.get(attClass);
    if (att == null) {
      try {
        att = (Attribute) attClass.newInstance();
      } catch (InstantiationException e) {
        throw couldNotInstantiate(attClass, e);
      } catch (IllegalAccessException e) {
        throw couldNotInstantiate(attClass, e);
      }

      attributes.put(attClass, att);
    }
    return att;
  }

  /**
   * Builds the exception thrown when an attribute class cannot be instantiated,
   * keeping the original failure as its cause.
   */
  private static IllegalArgumentException couldNotInstantiate(Class attClass, Exception cause) {
    IllegalArgumentException failure =
        new IllegalArgumentException("Could not instantiate class " + attClass);
    // initCause instead of the (String, Throwable) constructor, which needs Java 1.5
    failure.initCause(cause);
    return failure;
  }

  /** Returns true, iff this AttributeSource has any attributes */
  public boolean hasAttributes() {
    return !this.attributes.isEmpty();
  }

  /**
   * The caller must pass in a Class&lt;? extends Attribute&gt; value.
   * Returns true, iff this AttributeSource contains the passed-in Attribute.
   */
  public boolean hasAttribute(Class attClass) {
    return this.attributes.containsKey(attClass);
  }

  /**
   * The caller must pass in a Class&lt;? extends Attribute&gt; value.
   * Returns the instance of the passed in Attribute contained in this AttributeSource
   *
   * @throws IllegalArgumentException if this AttributeSource does not contain the
   *         Attribute
   */
  public Attribute getAttribute(Class attClass) {
    Attribute att = (Attribute) this.attributes.get(attClass);
    if (att == null) {
      throw new IllegalArgumentException("This token does not have the attribute '" + attClass + "'.");
    }

    return att;
  }

  /**
   * Resets all Attributes in this AttributeSource by calling
   * {@link Attribute#clear()} on each Attribute.
   */
  public void clearAttributes() {
    Iterator it = getAttributesIterator();
    while (it.hasNext()) {
      ((Attribute) it.next()).clear();
    }
  }

  /**
   * Captures the current state of this AttributeSource.
   * <p>
   * The returned state contains clones of all of this AttributeSource's
   * {@link Attribute}s. If only a subset of the attributes is needed
   * please use {@link #captureState(AttributeAcceptor)}
   */
  public AttributeSource captureState() {
    return captureState(AllAcceptor);
  }

  /**
   * Captures the current state of this AttributeSource.
   * <p>
   * The returned state contains clones of those {@link Attribute}s of this
   * AttributeSource which the {@link AttributeAcceptor} accepts.
   */
  public AttributeSource captureState(AttributeAcceptor acceptor) {
    AttributeSource state = new AttributeSource();

    Iterator it = getAttributesIterator();
    while(it.hasNext()) {
      Attribute att = (Attribute) it.next();
      if (acceptor.accept(att.getClass())) {
        // clone, so later changes to the live attribute do not alter the captured state
        Attribute clone = (Attribute) att.clone();
        state.attributes.put(att.getClass(), clone);
      }
    }

    return state;
  }

  /**
   * Restores this state by copying the values of all attributes
   * that this state contains into the attributes of the target.
   * The target must contain a corresponding instance for each attribute
   * contained in this state.
   * <p>
   * Note that this method does not affect attributes of the target
   * that are not contained in this state. In other words, if for example
   * the target contains an OffsetAttribute, but this state doesn't, then
   * the value of the OffsetAttribute remains unchanged. It might be desirable to
   * reset its value to the default, in which case the caller should first
   * call {@link #clearAttributes()} on the target.
   *
   * @throws IllegalArgumentException if the target lacks one of the attributes
   *         contained in this state
   */
  public void restoreState(AttributeSource target) {
    Iterator it = getAttributesIterator();
    while (it.hasNext()) {
      Attribute att = (Attribute) it.next();
      Attribute targetAtt = target.getAttribute(att.getClass());
      att.copyTo(targetAtt);
    }
  }

  /**
   * Returns the sum of the hash codes of all contained attributes.
   * The sum is deliberately independent of the order in which the attributes
   * were added, because {@link #equals(Object)} ignores that order as well;
   * equal instances therefore always produce equal hash codes.
   */
  public int hashCode() {
    int code = 0;
    Iterator it = getAttributesIterator();
    while (it.hasNext()) {
      code += it.next().hashCode();
    }
    return code;
  }

  /**
   * Two AttributeSources are equal if they contain the same attribute
   * classes and the attributes registered for each class are equal.
   * The order in which the attributes were added is not significant.
   */
  public boolean equals(Object obj) {
    if (obj == this) {
      return true;
    }
    if (!(obj instanceof AttributeSource)) {
      return false;
    }

    AttributeSource other = (AttributeSource) obj;
    // also covers the cases where exactly one of the two sources is empty
    if (attributes.size() != other.attributes.size()) {
      return false;
    }

    Iterator it = attributes.entrySet().iterator();
    while (it.hasNext()) {
      Map.Entry entry = (Map.Entry) it.next();
      Object otherAtt = other.attributes.get(entry.getKey());
      if (otherAtt == null || !otherAtt.equals(entry.getValue())) {
        return false;
      }
    }
    return true;
  }

// TODO: Java 1.5
//  private Map<Class<? extends Attribute>, Attribute> attributes;
//  public <T extends Attribute> T addAttribute(Class<T> attClass) {
//    T att = (T) attributes.get(attClass);
//    if (att == null) {
//      try {
//        att = attClass.newInstance();
//      } catch (InstantiationException e) {
//        throw new IllegalArgumentException("Could not instantiate class " + attClass);
//      } catch (IllegalAccessException e) {
//        throw new IllegalArgumentException("Could not instantiate class " + attClass);
//      }
//
//      attributes.put(attClass, att);
//    }
//    return att;
//  }
//
//  public boolean hasAttribute(Class<? extends Attribute> attClass) {
//    return this.attributes.containsKey(attClass);
//  }
//
//  public <T extends Attribute> T getAttribute(Class<T> attClass) {
//    Attribute att = this.attributes.get(attClass);
//    if (att == null) {
//      throw new IllegalArgumentException("This token does not have the attribute '" + attClass + "'.");
//    }
//
//    return (T) att;
//  }
//
}

View File

@ -17,19 +17,20 @@ package org.apache.lucene;
* limitations under the License.
*/
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import java.io.Reader;
import java.io.StringReader;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.Date;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
class AnalysisTest {
static File tmpFile;
public static void main(String[] args) {
@ -70,12 +71,15 @@ class AnalysisTest {
Date start = new Date();
int count = 0;
final Token reusableToken = new Token();
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
stream.reset();
TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class);
while (stream.incrementToken()) {
if (verbose) {
System.out.println("Text=" + nextToken.term()
+ " start=" + nextToken.startOffset()
+ " end=" + nextToken.endOffset());
System.out.println("Text=" + termAtt.term()
+ " start=" + offsetAtt.startOffset()
+ " end=" + offsetAtt.endOffset());
}
count++;
}

View File

@ -18,6 +18,9 @@ package org.apache.lucene.analysis;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.English;
import org.apache.lucene.util.LuceneTestCase;
@ -40,7 +43,8 @@ public class TeeSinkTokenTest extends LuceneTestCase {
super(s);
}
protected void setUp() {
protected void setUp() throws Exception {
super.setUp();
tokens1 = new String[]{"The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs"};
tokens2 = new String[]{"The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch"};
buffer1 = new StringBuffer();
@ -62,24 +66,29 @@ public class TeeSinkTokenTest extends LuceneTestCase {
public void test() throws IOException {
SinkTokenizer sink1 = new SinkTokenizer(null) {
public void add(Token t) {
if (t != null && t.term().equalsIgnoreCase("The")) {
super.add(t);
public void add(AttributeSource a) throws IOException {
TermAttribute termAtt = null;
if (a.hasAttribute(TermAttribute.class)) {
termAtt = (TermAttribute) a.getAttribute(TermAttribute.class);
}
if (termAtt != null && termAtt.term().equalsIgnoreCase("The")) {
super.add(a);
}
}
};
TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), sink1);
int i = 0;
final Token reusableToken = new Token();
for (Token nextToken = source.next(reusableToken); nextToken != null; nextToken = source.next(reusableToken)) {
assertTrue(nextToken.term() + " is not equal to " + tokens1[i], nextToken.term().equals(tokens1[i]) == true);
TermAttribute termAtt = (TermAttribute) source.getAttribute(TermAttribute.class);
while (source.incrementToken()) {
assertTrue(termAtt.term() + " is not equal to " + tokens1[i], termAtt.term().equals(tokens1[i]) == true);
i++;
}
assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
assertTrue("sink1 Size: " + sink1.getTokens().size() + " is not: " + 2, sink1.getTokens().size() == 2);
i = 0;
for (Token token = sink1.next(reusableToken); token != null; token = sink1.next(reusableToken)) {
assertTrue(token.term() + " is not equal to " + "The", token.term().equalsIgnoreCase("The") == true);
termAtt = (TermAttribute) sink1.getAttribute(TermAttribute.class);
while (sink1.incrementToken()) {
assertTrue(termAtt.term() + " is not equal to " + "The", termAtt.term().equalsIgnoreCase("The") == true);
i++;
}
assertTrue(i + " does not equal: " + sink1.getTokens().size(), i == sink1.getTokens().size());
@ -87,55 +96,67 @@ public class TeeSinkTokenTest extends LuceneTestCase {
public void testMultipleSources() throws Exception {
SinkTokenizer theDetector = new SinkTokenizer(null) {
public void add(Token t) {
if (t != null && t.term().equalsIgnoreCase("The")) {
super.add(t);
public void add(AttributeSource a) throws IOException {
TermAttribute termAtt = null;
if (a.hasAttribute(TermAttribute.class)) {
termAtt = (TermAttribute) a.getAttribute(TermAttribute.class);
}
if (termAtt != null && termAtt.term().equalsIgnoreCase("The")) {
super.add(a);
}
}
};
SinkTokenizer dogDetector = new SinkTokenizer(null) {
public void add(Token t) {
if (t != null && t.term().equalsIgnoreCase("Dogs")) {
super.add(t);
public void add(AttributeSource a) throws IOException {
TermAttribute termAtt = null;
if (a.hasAttribute(TermAttribute.class)) {
termAtt = (TermAttribute) a.getAttribute(TermAttribute.class);
}
if (termAtt != null && termAtt.term().equalsIgnoreCase("Dogs")) {
super.add(a);
}
}
};
TokenStream source1 = new CachingTokenFilter(new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), theDetector), dogDetector));
TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer2.toString())), theDetector), dogDetector);
int i = 0;
final Token reusableToken = new Token();
for (Token nextToken = source1.next(reusableToken); nextToken != null; nextToken = source1.next(reusableToken)) {
assertTrue(nextToken.term() + " is not equal to " + tokens1[i], nextToken.term().equals(tokens1[i]) == true);
TermAttribute termAtt = (TermAttribute) source1.getAttribute(TermAttribute.class);
while (source1.incrementToken()) {
assertTrue(termAtt.term() + " is not equal to " + tokens1[i], termAtt.term().equals(tokens1[i]) == true);
i++;
}
assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 2, theDetector.getTokens().size() == 2);
assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 1, dogDetector.getTokens().size() == 1);
i = 0;
for (Token nextToken = source2.next(reusableToken); nextToken != null; nextToken = source2.next(reusableToken)) {
assertTrue(nextToken.term() + " is not equal to " + tokens2[i], nextToken.term().equals(tokens2[i]) == true);
termAtt = (TermAttribute) source2.getAttribute(TermAttribute.class);
while (source2.incrementToken()) {
assertTrue(termAtt.term() + " is not equal to " + tokens2[i], termAtt.term().equals(tokens2[i]) == true);
i++;
}
assertTrue(i + " does not equal: " + tokens2.length, i == tokens2.length);
assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 4, theDetector.getTokens().size() == 4);
assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 2, dogDetector.getTokens().size() == 2);
i = 0;
for (Token nextToken = theDetector.next(reusableToken); nextToken != null; nextToken = theDetector.next(reusableToken)) {
assertTrue(nextToken.term() + " is not equal to " + "The", nextToken.term().equalsIgnoreCase("The") == true);
termAtt = (TermAttribute) theDetector.getAttribute(TermAttribute.class);
while (theDetector.incrementToken()) {
assertTrue(termAtt.term() + " is not equal to " + "The", termAtt.term().equalsIgnoreCase("The") == true);
i++;
}
assertTrue(i + " does not equal: " + theDetector.getTokens().size(), i == theDetector.getTokens().size());
i = 0;
for (Token nextToken = dogDetector.next(reusableToken); nextToken != null; nextToken = dogDetector.next(reusableToken)) {
assertTrue(nextToken.term() + " is not equal to " + "Dogs", nextToken.term().equalsIgnoreCase("Dogs") == true);
termAtt = (TermAttribute) dogDetector.getAttribute(TermAttribute.class);
while (dogDetector.incrementToken()) {
assertTrue(termAtt.term() + " is not equal to " + "Dogs", termAtt.term().equalsIgnoreCase("Dogs") == true);
i++;
}
assertTrue(i + " does not equal: " + dogDetector.getTokens().size(), i == dogDetector.getTokens().size());
source1.reset();
TokenStream lowerCasing = new LowerCaseFilter(source1);
i = 0;
for (Token nextToken = lowerCasing.next(reusableToken); nextToken != null; nextToken = lowerCasing.next(reusableToken)) {
assertTrue(nextToken.term() + " is not equal to " + tokens1[i].toLowerCase(), nextToken.term().equals(tokens1[i].toLowerCase()) == true);
termAtt = (TermAttribute) lowerCasing.getAttribute(TermAttribute.class);
while (lowerCasing.incrementToken()) {
assertTrue(termAtt.term() + " is not equal to " + tokens1[i].toLowerCase(), termAtt.term().equals(tokens1[i].toLowerCase()) == true);
i++;
}
assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
@ -157,21 +178,20 @@ public class TeeSinkTokenTest extends LuceneTestCase {
}
//make sure we produce the same tokens
ModuloSinkTokenizer sink = new ModuloSinkTokenizer(tokCount[k], 100);
final Token reusableToken = new Token();
TokenStream stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink);
while (stream.next(reusableToken) != null) {
while (stream.incrementToken()) {
}
stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), 100);
List tmp = new ArrayList();
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
tmp.add(nextToken.clone());
while (stream.incrementToken()) {
tmp.add(stream.captureState());
}
List sinkList = sink.getTokens();
assertTrue("tmp Size: " + tmp.size() + " is not: " + sinkList.size(), tmp.size() == sinkList.size());
for (int i = 0; i < tmp.size(); i++) {
Token tfTok = (Token) tmp.get(i);
Token sinkTok = (Token) sinkList.get(i);
assertTrue(tfTok.term() + " is not equal to " + sinkTok.term() + " at token: " + i, tfTok.term().equals(sinkTok.term()) == true);
AttributeSource tfTok = (AttributeSource) tmp.get(i);
AttributeSource sinkTok = (AttributeSource) sinkList.get(i);
assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.equals(sinkTok) == true);
}
//simulate two fields, each being analyzed once, for 20 documents
@ -180,12 +200,14 @@ public class TeeSinkTokenTest extends LuceneTestCase {
long start = System.currentTimeMillis();
for (int i = 0; i < 20; i++) {
stream = new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString())));
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
tfPos += nextToken.getPositionIncrement();
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
while (stream.incrementToken()) {
tfPos += posIncrAtt.getPositionIncrement();
}
stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), modCounts[j]);
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
tfPos += nextToken.getPositionIncrement();
posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
while (stream.incrementToken()) {
tfPos += posIncrAtt.getPositionIncrement();
}
}
long finish = System.currentTimeMillis();
@ -196,13 +218,15 @@ public class TeeSinkTokenTest extends LuceneTestCase {
for (int i = 0; i < 20; i++) {
sink = new ModuloSinkTokenizer(tokCount[k], modCounts[j]);
stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink);
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
sinkPos += nextToken.getPositionIncrement();
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
while (stream.incrementToken()) {
sinkPos += posIncrAtt.getPositionIncrement();
}
//System.out.println("Modulo--------");
stream = sink;
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
sinkPos += nextToken.getPositionIncrement();
posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
while (stream.incrementToken()) {
sinkPos += posIncrAtt.getPositionIncrement();
}
}
finish = System.currentTimeMillis();
@ -228,15 +252,15 @@ public class TeeSinkTokenTest extends LuceneTestCase {
int count = 0;
//return every 100 tokens
public Token next(final Token reusableToken) throws IOException {
Token nextToken = null;
for (nextToken = input.next(reusableToken);
nextToken != null && count % modCount != 0;
nextToken = input.next(reusableToken)) {
public boolean incrementToken() throws IOException {
boolean hasNext;
for (hasNext = input.incrementToken();
hasNext && count % modCount != 0;
hasNext = input.incrementToken()) {
count++;
}
count++;
return nextToken;
return hasNext;
}
}
@ -250,9 +274,9 @@ public class TeeSinkTokenTest extends LuceneTestCase {
lst = new ArrayList(numToks % mc);
}
public void add(Token t) {
if (t != null && count % modCount == 0) {
super.add(t);
public void add(AttributeSource a) throws IOException {
if (a != null && count % modCount == 0) {
super.add(a);
}
count++;
}

View File

@ -19,10 +19,10 @@ package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.StringReader;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.Payload;
import org.apache.lucene.util.LuceneTestCase;
@ -36,13 +36,12 @@ public class TestAnalyzers extends LuceneTestCase {
String input,
String[] output) throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
final Token reusableToken = new Token();
TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
for (int i=0; i<output.length; i++) {
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(nextToken.term(), output[i]);
assertTrue(ts.incrementToken());
assertEquals(termAtt.term(), output[i]);
}
assertNull(ts.next(reusableToken));
assertFalse(ts.incrementToken());
ts.close();
}
@ -95,14 +94,13 @@ public class TestAnalyzers extends LuceneTestCase {
}
void verifyPayload(TokenStream ts) throws IOException {
final Token reusableToken = new Token();
PayloadAttribute payloadAtt = (PayloadAttribute) ts.getAttribute(PayloadAttribute.class);
for(byte b=1;;b++) {
reusableToken.clear();
Token nextToken = ts.next(reusableToken);
if (nextToken==null) break;
boolean hasNext = ts.incrementToken();
if (!hasNext) break;
// System.out.println("id="+System.identityHashCode(nextToken) + " " + t);
// System.out.println("payload=" + (int)nextToken.getPayload().toByteArray()[0]);
assertEquals(b, nextToken.getPayload().toByteArray()[0]);
assertEquals(b, payloadAtt.getPayload().toByteArray()[0]);
}
}
@ -111,13 +109,11 @@ public class TestAnalyzers extends LuceneTestCase {
String s = "how now brown cow";
TokenStream ts;
ts = new WhitespaceTokenizer(new StringReader(s));
ts = new BuffTokenFilter(ts);
ts = new PayloadSetter(ts);
verifyPayload(ts);
ts = new WhitespaceTokenizer(new StringReader(s));
ts = new PayloadSetter(ts);
ts = new BuffTokenFilter(ts);
verifyPayload(ts);
}
@ -136,38 +132,21 @@ public class TestAnalyzers extends LuceneTestCase {
}
}
class BuffTokenFilter extends TokenFilter {
List lst;
public BuffTokenFilter(TokenStream input) {
super(input);
}
public Token next(final Token reusableToken) throws IOException {
if (lst == null) {
lst = new LinkedList();
for(Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
lst.add(nextToken.clone());
}
}
return lst.size()==0 ? null : (Token)lst.remove(0);
}
}
class PayloadSetter extends TokenFilter {
PayloadAttribute payloadAtt;
public PayloadSetter(TokenStream input) {
super(input);
payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
}
byte[] data = new byte[1];
Payload p = new Payload(data,0,1);
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (nextToken==null) return null;
nextToken.setPayload(p); // reuse the payload / byte[]
public boolean incrementToken() throws IOException {
boolean hasNext = input.incrementToken();
if (!hasNext) return false;
payloadAtt.setPayload(p); // reuse the payload / byte[]
data[0]++;
return nextToken;
return true;
}
}

View File

@ -22,6 +22,8 @@ import java.io.IOException;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.TermVector;
@ -41,13 +43,17 @@ public class TestCachingTokenFilter extends LuceneTestCase {
Document doc = new Document();
TokenStream stream = new TokenStream() {
private int index = 0;
private TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
private OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
public boolean incrementToken() throws IOException {
if (index == tokens.length) {
return null;
return false;
} else {
return reusableToken.reinit(tokens[index++], 0, 0);
termAtt.setTermBuffer(tokens[index++]);
offsetAtt.setStartOffset(0);
offsetAtt.setEndOffset(0);
return true;
}
}
@ -92,10 +98,12 @@ public class TestCachingTokenFilter extends LuceneTestCase {
private void checkTokens(TokenStream stream) throws IOException {
int count = 0;
final Token reusableToken = new Token();
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
assertNotNull(termAtt);
while (stream.incrementToken()) {
assertTrue(count < tokens.length);
assertEquals(tokens[count], nextToken.term());
assertEquals(tokens[count], termAtt.term());
count++;
}

View File

@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.LuceneTestCase;
import java.io.StringReader;
@ -25,82 +26,87 @@ public class TestISOLatin1AccentFilter extends LuceneTestCase {
public void testU() throws Exception {
TokenStream stream = new WhitespaceTokenizer(new StringReader("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï IJ Ð Ñ Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ij ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl"));
ISOLatin1AccentFilter filter = new ISOLatin1AccentFilter(stream);
final Token reusableToken = new Token();
assertEquals("Des", filter.next(reusableToken).term());
assertEquals("mot", filter.next(reusableToken).term());
assertEquals("cles", filter.next(reusableToken).term());
assertEquals("A", filter.next(reusableToken).term());
assertEquals("LA", filter.next(reusableToken).term());
assertEquals("CHAINE", filter.next(reusableToken).term());
assertEquals("A", filter.next(reusableToken).term());
assertEquals("A", filter.next(reusableToken).term());
assertEquals("A", filter.next(reusableToken).term());
assertEquals("A", filter.next(reusableToken).term());
assertEquals("A", filter.next(reusableToken).term());
assertEquals("A", filter.next(reusableToken).term());
assertEquals("AE", filter.next(reusableToken).term());
assertEquals("C", filter.next(reusableToken).term());
assertEquals("E", filter.next(reusableToken).term());
assertEquals("E", filter.next(reusableToken).term());
assertEquals("E", filter.next(reusableToken).term());
assertEquals("E", filter.next(reusableToken).term());
assertEquals("I", filter.next(reusableToken).term());
assertEquals("I", filter.next(reusableToken).term());
assertEquals("I", filter.next(reusableToken).term());
assertEquals("I", filter.next(reusableToken).term());
assertEquals("IJ", filter.next(reusableToken).term());
assertEquals("D", filter.next(reusableToken).term());
assertEquals("N", filter.next(reusableToken).term());
assertEquals("O", filter.next(reusableToken).term());
assertEquals("O", filter.next(reusableToken).term());
assertEquals("O", filter.next(reusableToken).term());
assertEquals("O", filter.next(reusableToken).term());
assertEquals("O", filter.next(reusableToken).term());
assertEquals("O", filter.next(reusableToken).term());
assertEquals("OE", filter.next(reusableToken).term());
assertEquals("TH", filter.next(reusableToken).term());
assertEquals("U", filter.next(reusableToken).term());
assertEquals("U", filter.next(reusableToken).term());
assertEquals("U", filter.next(reusableToken).term());
assertEquals("U", filter.next(reusableToken).term());
assertEquals("Y", filter.next(reusableToken).term());
assertEquals("Y", filter.next(reusableToken).term());
assertEquals("a", filter.next(reusableToken).term());
assertEquals("a", filter.next(reusableToken).term());
assertEquals("a", filter.next(reusableToken).term());
assertEquals("a", filter.next(reusableToken).term());
assertEquals("a", filter.next(reusableToken).term());
assertEquals("a", filter.next(reusableToken).term());
assertEquals("ae", filter.next(reusableToken).term());
assertEquals("c", filter.next(reusableToken).term());
assertEquals("e", filter.next(reusableToken).term());
assertEquals("e", filter.next(reusableToken).term());
assertEquals("e", filter.next(reusableToken).term());
assertEquals("e", filter.next(reusableToken).term());
assertEquals("i", filter.next(reusableToken).term());
assertEquals("i", filter.next(reusableToken).term());
assertEquals("i", filter.next(reusableToken).term());
assertEquals("i", filter.next(reusableToken).term());
assertEquals("ij", filter.next(reusableToken).term());
assertEquals("d", filter.next(reusableToken).term());
assertEquals("n", filter.next(reusableToken).term());
assertEquals("o", filter.next(reusableToken).term());
assertEquals("o", filter.next(reusableToken).term());
assertEquals("o", filter.next(reusableToken).term());
assertEquals("o", filter.next(reusableToken).term());
assertEquals("o", filter.next(reusableToken).term());
assertEquals("o", filter.next(reusableToken).term());
assertEquals("oe", filter.next(reusableToken).term());
assertEquals("ss", filter.next(reusableToken).term());
assertEquals("th", filter.next(reusableToken).term());
assertEquals("u", filter.next(reusableToken).term());
assertEquals("u", filter.next(reusableToken).term());
assertEquals("u", filter.next(reusableToken).term());
assertEquals("u", filter.next(reusableToken).term());
assertEquals("y", filter.next(reusableToken).term());
assertEquals("y", filter.next(reusableToken).term());
assertEquals("fi", filter.next(reusableToken).term());
assertEquals("fl", filter.next(reusableToken).term());
assertNull(filter.next(reusableToken));
TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
assertTermEquals("Des", filter, termAtt);
assertTermEquals("mot", filter, termAtt);
assertTermEquals("cles", filter, termAtt);
assertTermEquals("A", filter, termAtt);
assertTermEquals("LA", filter, termAtt);
assertTermEquals("CHAINE", filter, termAtt);
assertTermEquals("A", filter, termAtt);
assertTermEquals("A", filter, termAtt);
assertTermEquals("A", filter, termAtt);
assertTermEquals("A", filter, termAtt);
assertTermEquals("A", filter, termAtt);
assertTermEquals("A", filter, termAtt);
assertTermEquals("AE", filter, termAtt);
assertTermEquals("C", filter, termAtt);
assertTermEquals("E", filter, termAtt);
assertTermEquals("E", filter, termAtt);
assertTermEquals("E", filter, termAtt);
assertTermEquals("E", filter, termAtt);
assertTermEquals("I", filter, termAtt);
assertTermEquals("I", filter, termAtt);
assertTermEquals("I", filter, termAtt);
assertTermEquals("I", filter, termAtt);
assertTermEquals("IJ", filter, termAtt);
assertTermEquals("D", filter, termAtt);
assertTermEquals("N", filter, termAtt);
assertTermEquals("O", filter, termAtt);
assertTermEquals("O", filter, termAtt);
assertTermEquals("O", filter, termAtt);
assertTermEquals("O", filter, termAtt);
assertTermEquals("O", filter, termAtt);
assertTermEquals("O", filter, termAtt);
assertTermEquals("OE", filter, termAtt);
assertTermEquals("TH", filter, termAtt);
assertTermEquals("U", filter, termAtt);
assertTermEquals("U", filter, termAtt);
assertTermEquals("U", filter, termAtt);
assertTermEquals("U", filter, termAtt);
assertTermEquals("Y", filter, termAtt);
assertTermEquals("Y", filter, termAtt);
assertTermEquals("a", filter, termAtt);
assertTermEquals("a", filter, termAtt);
assertTermEquals("a", filter, termAtt);
assertTermEquals("a", filter, termAtt);
assertTermEquals("a", filter, termAtt);
assertTermEquals("a", filter, termAtt);
assertTermEquals("ae", filter, termAtt);
assertTermEquals("c", filter, termAtt);
assertTermEquals("e", filter, termAtt);
assertTermEquals("e", filter, termAtt);
assertTermEquals("e", filter, termAtt);
assertTermEquals("e", filter, termAtt);
assertTermEquals("i", filter, termAtt);
assertTermEquals("i", filter, termAtt);
assertTermEquals("i", filter, termAtt);
assertTermEquals("i", filter, termAtt);
assertTermEquals("ij", filter, termAtt);
assertTermEquals("d", filter, termAtt);
assertTermEquals("n", filter, termAtt);
assertTermEquals("o", filter, termAtt);
assertTermEquals("o", filter, termAtt);
assertTermEquals("o", filter, termAtt);
assertTermEquals("o", filter, termAtt);
assertTermEquals("o", filter, termAtt);
assertTermEquals("o", filter, termAtt);
assertTermEquals("oe", filter, termAtt);
assertTermEquals("ss", filter, termAtt);
assertTermEquals("th", filter, termAtt);
assertTermEquals("u", filter, termAtt);
assertTermEquals("u", filter, termAtt);
assertTermEquals("u", filter, termAtt);
assertTermEquals("u", filter, termAtt);
assertTermEquals("y", filter, termAtt);
assertTermEquals("y", filter, termAtt);
assertTermEquals("fi", filter, termAtt);
assertTermEquals("fl", filter, termAtt);
assertFalse(filter.incrementToken());
}
/**
 * Advances {@code stream} by one token and checks that token's term text.
 *
 * Fails if the stream has no further token, or if the term exposed through
 * {@code termAtt} after the advance differs from {@code expected}.
 *
 * @param expected term text the next token must carry
 * @param stream   token stream to advance by one token
 * @param termAtt  term attribute read after advancing; the caller in this
 *                 test obtains it from the same stream
 * @throws Exception if advancing the stream fails
 */
void assertTermEquals(String expected, TokenStream stream, TermAttribute termAtt) throws Exception {
  // Advance first: the attribute only reflects the new token after incrementToken().
  boolean advanced = stream.incrementToken();
  assertTrue(advanced);
  String actual = termAtt.term();
  assertEquals(expected, actual);
}
}

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis;
import java.io.StringReader;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
@ -88,9 +89,9 @@ public class TestKeywordAnalyzer extends LuceneTestCase {
// LUCENE-1441
public void testOffsets() throws Exception {
TokenStream stream = new KeywordAnalyzer().tokenStream("field", new StringReader("abcd"));
Token token = new Token();
assertTrue(stream.next(token) != null);
assertEquals(0, token.startOffset);
assertEquals(4, token.endOffset);
OffsetAttribute offsetAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);
assertTrue(stream.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(4, offsetAtt.endOffset());
}
}

View File

@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.LuceneTestCase;
import java.io.StringReader;
@ -27,11 +28,15 @@ public class TestLengthFilter extends LuceneTestCase {
TokenStream stream = new WhitespaceTokenizer(
new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
LengthFilter filter = new LengthFilter(stream, 2, 6);
final Token reusableToken = new Token();
assertEquals("short", filter.next(reusableToken).term());
assertEquals("ab", filter.next(reusableToken).term());
assertEquals("foo", filter.next(reusableToken).term());
assertNull(filter.next(reusableToken));
TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
assertTrue(filter.incrementToken());
assertEquals("short", termAtt.term());
assertTrue(filter.incrementToken());
assertEquals("ab", termAtt.term());
assertTrue(filter.incrementToken());
assertEquals("foo", termAtt.term());
assertFalse(filter.incrementToken());
}
}

View File

@ -1,8 +1,10 @@
package org.apache.lucene.analysis;
import org.apache.lucene.util.LuceneTestCase;
import java.io.StringReader;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.LuceneTestCase;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -29,17 +31,19 @@ public class TestPerFieldAnalzyerWrapper extends LuceneTestCase {
TokenStream tokenStream = analyzer.tokenStream("field",
new StringReader(text));
final Token reusableToken = new Token();
Token nextToken = tokenStream.next(reusableToken);
TermAttribute termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
assertTrue(tokenStream.incrementToken());
assertEquals("WhitespaceAnalyzer does not lowercase",
"Qwerty",
nextToken.term());
termAtt.term());
tokenStream = analyzer.tokenStream("special",
new StringReader(text));
nextToken = tokenStream.next(reusableToken);
termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
assertTrue(tokenStream.incrementToken());
assertEquals("SimpleAnalyzer lowercases",
"qwerty",
nextToken.term());
termAtt.term());
}
}

View File

@ -1,6 +1,10 @@
package org.apache.lucene.analysis;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.LuceneTestCase;
import java.io.StringReader;
@ -35,19 +39,25 @@ public class TestStandardAnalyzer extends LuceneTestCase {
public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes, int[] expectedPosIncrs) throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
final Token reusableToken = new Token();
// TODO Java 1.5
//final TypeAttribute typeAtt = reusableToken.getAttribute(TypeAttribute.class);
//final PositionIncrementAttribute posIncrAtt = reusableToken.getAttribute(PositionIncrementAttribute.class);
final TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
final TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
final PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) ts.getAttribute(PositionIncrementAttribute.class);
for (int i = 0; i < expectedImages.length; i++) {
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(expectedImages[i], nextToken.term());
assertTrue(ts.incrementToken());
assertEquals(expectedImages[i], new String(termAtt.termBuffer(), 0, termAtt.termLength()));
if (expectedTypes != null) {
assertEquals(expectedTypes[i], nextToken.type());
assertEquals(expectedTypes[i], typeAtt.type());
}
if (expectedPosIncrs != null) {
assertEquals(expectedPosIncrs[i], nextToken.getPositionIncrement());
assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
}
}
assertNull(ts.next(reusableToken));
assertFalse(ts.incrementToken());
ts.close();
}

View File

@ -17,6 +17,8 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.LuceneTestCase;
import java.io.StringReader;
@ -45,9 +47,10 @@ public class TestStopAnalyzer extends LuceneTestCase {
StringReader reader = new StringReader("This is a test of the english stop analyzer");
TokenStream stream = stop.tokenStream("test", reader);
assertTrue(stream != null);
final Token reusableToken = new Token();
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
assertFalse(inValidTokens.contains(nextToken.term()));
TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
while (stream.incrementToken()) {
assertFalse(inValidTokens.contains(termAtt.term()));
}
}
@ -60,11 +63,13 @@ public class TestStopAnalyzer extends LuceneTestCase {
StringReader reader = new StringReader("This is a good test of the english stop analyzer");
TokenStream stream = newStop.tokenStream("test", reader);
assertNotNull(stream);
final Token reusableToken = new Token();
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
String text = nextToken.term();
TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);
while (stream.incrementToken()) {
String text = termAtt.term();
assertFalse(stopWordsSet.contains(text));
assertEquals(1,nextToken.getPositionIncrement()); // by default stop tokenizer does not apply increments.
assertEquals(1,posIncrAtt.getPositionIncrement()); // by default stop tokenizer does not apply increments.
}
}
@ -82,11 +87,13 @@ public class TestStopAnalyzer extends LuceneTestCase {
TokenStream stream = newStop.tokenStream("test", reader);
assertNotNull(stream);
int i = 0;
final Token reusableToken = new Token();
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
String text = nextToken.term();
TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);
while (stream.incrementToken()) {
String text = termAtt.term();
assertFalse(stopWordsSet.contains(text));
assertEquals(expectedIncr[i++],nextToken.getPositionIncrement());
assertEquals(expectedIncr[i++],posIncrAtt.getPositionIncrement());
}
} finally {
StopFilter.setEnablePositionIncrementsDefault(defaultEnable);

View File

@ -16,6 +16,8 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.English;
import org.apache.lucene.util.LuceneTestCase;
@ -35,19 +37,22 @@ public class TestStopFilter extends LuceneTestCase {
StringReader reader = new StringReader("Now is The Time");
String[] stopWords = new String[] { "is", "the", "Time" };
TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopWords);
final Token reusableToken = new Token();
assertEquals("Now", stream.next(reusableToken).term());
assertEquals("The", stream.next(reusableToken).term());
assertEquals(null, stream.next(reusableToken));
final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
assertTrue(stream.incrementToken());
assertEquals("Now", termAtt.term());
assertTrue(stream.incrementToken());
assertEquals("The", termAtt.term());
assertFalse(stream.incrementToken());
}
public void testIgnoreCase() throws IOException {
StringReader reader = new StringReader("Now is The Time");
String[] stopWords = new String[] { "is", "the", "Time" };
TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopWords, true);
final Token reusableToken = new Token();
assertEquals("Now", stream.next(reusableToken).term());
assertEquals(null,stream.next(reusableToken));
final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
assertTrue(stream.incrementToken());
assertEquals("Now", termAtt.term());
assertFalse(stream.incrementToken());
}
public void testStopFilt() throws IOException {
@ -55,10 +60,12 @@ public class TestStopFilter extends LuceneTestCase {
String[] stopWords = new String[] { "is", "the", "Time" };
Set stopSet = StopFilter.makeStopSet(stopWords);
TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopSet);
final Token reusableToken = new Token();
assertEquals("Now", stream.next(reusableToken).term());
assertEquals("The", stream.next(reusableToken).term());
assertEquals(null, stream.next(reusableToken));
final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
assertTrue(stream.incrementToken());
assertEquals("Now", termAtt.term());
assertTrue(stream.incrementToken());
assertEquals("The", termAtt.term());
assertFalse(stream.incrementToken());
}
/**
@ -110,15 +117,16 @@ public class TestStopFilter extends LuceneTestCase {
private void doTestStopPositons(StopFilter stpf, boolean enableIcrements) throws IOException {
log("---> test with enable-increments-"+(enableIcrements?"enabled":"disabled"));
stpf.setEnablePositionIncrements(enableIcrements);
final Token reusableToken = new Token();
TermAttribute termAtt = (TermAttribute) stpf.getAttribute(TermAttribute.class);
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stpf.getAttribute(PositionIncrementAttribute.class);
for (int i=0; i<20; i+=3) {
Token nextToken = stpf.next(reusableToken);
log("Token "+i+": "+nextToken);
assertTrue(stpf.incrementToken());
log("Token "+i+": "+stpf);
String w = English.intToEnglish(i).trim();
assertEquals("expecting token "+i+" to be "+w,w,nextToken.term());
assertEquals("all but first token must have position increment of 3",enableIcrements?(i==0?1:3):1,nextToken.getPositionIncrement());
assertEquals("expecting token "+i+" to be "+w,w,termAtt.term());
assertEquals("all but first token must have position increment of 3",enableIcrements?(i==0?1:3):1,posIncrAtt.getPositionIncrement());
}
assertNull(stpf.next(reusableToken));
assertFalse(stpf.incrementToken());
}
// print debug info depending on VERBOSE

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis;
import org.apache.lucene.util.LuceneTestCase;
/** @deprecated */
public class TestToken extends LuceneTestCase {
public TestToken(String name) {

View File

@ -22,12 +22,14 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
@ -35,6 +37,7 @@ import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
@ -138,33 +141,38 @@ public class TestDocumentWriter extends LuceneTestCase {
// Builds the analysis chain for this test: a WhitespaceTokenizer wrapped in an
// anonymous TokenFilter that follows every input token with an extra "b" token
// at position increment 0.
// NOTE: this span is a flattened diff hunk. The removed next(Token)
// implementation and the added incrementToken() implementation are
// interleaved below, so it does not compile as written.
public TokenStream tokenStream(String fieldName, Reader reader) {
return new TokenFilter(new WhitespaceTokenizer(reader)) {
boolean first=true;
Token buffered;
AttributeSource state;
// Removed side: Token-returning API; a buffered clone is handed out first.
public Token next(final Token reusableToken) throws IOException {
if (buffered != null) {
Token nextToken = buffered;
buffered=null;
return nextToken;
// Added side: attribute-based API. A non-null state means the previous call
// captured a token whose "b" synonym still has to be emitted.
public boolean incrementToken() throws IOException {
if (state != null) {
state.restoreState(this);
payloadAtt.setPayload(null);
posIncrAtt.setPositionIncrement(0);
termAtt.setTermBuffer(new char[]{'b'}, 0, 1);
state = null;
return true;
}
Token nextToken = input.next(reusableToken);
if (nextToken==null) return null;
if (Character.isDigit(nextToken.termBuffer()[0])) {
nextToken.setPositionIncrement(nextToken.termBuffer()[0] - '0');
boolean hasNext = input.incrementToken();
if (!hasNext) return false;
// A term starting with a digit uses that digit's value as its position increment.
if (Character.isDigit(termAtt.termBuffer()[0])) {
posIncrAtt.setPositionIncrement(termAtt.termBuffer()[0] - '0');
}
if (first) {
// set payload on first position only
nextToken.setPayload(new Payload(new byte[]{100}));
payloadAtt.setPayload(new Payload(new byte[]{100}));
first = false;
}
// index a "synonym" for every token
buffered = (Token)nextToken.clone();
buffered.setPayload(null);
buffered.setPositionIncrement(0);
buffered.setTermBuffer(new char[]{'b'}, 0, 1);
// Added side: snapshot the current attributes; the next call replays them as "b".
state = captureState();
return true;
return nextToken;
}
// Attribute handles registered on this filter (added side).
TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
};
}
};
@ -201,12 +209,14 @@ public class TestDocumentWriter extends LuceneTestCase {
private String[] tokens = new String[] {"term1", "term2", "term3", "term2"};
private int index = 0;
// Emits the entries of the tokens array one per call, then signals end of stream.
// NOTE: flattened diff hunk -- the removed next(Token) lines and the added
// incrementToken() lines (plus the added termAtt field) are interleaved.
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
private TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
public boolean incrementToken() throws IOException {
if (index == tokens.length) {
return null;
return false;
} else {
return reusableToken.reinit(tokens[index++], 0, 0);
// Added side: only the term text is set; no offsets are written here.
termAtt.setTermBuffer(tokens[index++]);
return true;
}
}

View File

@ -17,48 +17,48 @@ package org.apache.lucene.index;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.File;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Arrays;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SinkTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.store.MockRAMDirectory;
import org.apache.lucene.store.LockFactory;
import org.apache.lucene.store.Lock;
import org.apache.lucene.store.LockFactory;
import org.apache.lucene.store.MockRAMDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.SingleInstanceLockFactory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
/**
*
@ -1793,11 +1793,11 @@ public class TestIndexWriter extends LuceneTestCase
return new TokenFilter(new StandardTokenizer(reader)) {
private int count = 0;
// Delegates to the wrapped stream, but throws an IOException on the sixth call
// (count is post-incremented and compared against 5).
// NOTE: flattened diff hunk -- the first signature and first return are the
// removed next(Token) version, the second of each is the added version.
public Token next(final Token reusableToken) throws IOException {
public boolean incrementToken() throws IOException {
if (count++ == 5) {
throw new IOException();
}
return input.next(reusableToken);
return input.incrementToken();
}
};
}
@ -1916,10 +1916,10 @@ public class TestIndexWriter extends LuceneTestCase
this.fieldName = fieldName;
}
// Delegates to the wrapped stream unless the field is "crash" and count has
// already reached 4, in which case it throws. count only advances for the
// "crash" field because of the short-circuiting &&.
// NOTE: flattened diff hunk -- removed next(Token) lines and added
// incrementToken() lines are interleaved.
public Token next(final Token reusableToken) throws IOException {
public boolean incrementToken() throws IOException {
if (this.fieldName.equals("crash") && count++ >= 4)
throw new IOException("I'm experiencing problems");
return input.next(reusableToken);
return input.incrementToken();
}
public void reset() throws IOException {
@ -3577,21 +3577,47 @@ public class TestIndexWriter extends LuceneTestCase
}
}
/**
 * Analyzer handed to the IndexWriter in testNegativePositions: whitespace
 * tokenization, with a PositionIncrementAttribute registered on the stream
 * before it is returned.
 */
private static class MyAnalyzer extends Analyzer {
  public TokenStream tokenStream(String fieldName, Reader reader) {
    final TokenStream whitespaceStream = new WhitespaceTokenizer(reader);
    // Register the attribute up front so it is present on the returned stream.
    whitespaceStream.addAttribute(PositionIncrementAttribute.class);
    return whitespaceStream;
  }
}
// LUCENE-1255
public void testNegativePositions() throws Throwable {
SinkTokenizer tokens = new SinkTokenizer();
Token t = new Token();
t.setTermBuffer("a");
t.setPositionIncrement(0);
tokens.add(t);
t.setTermBuffer("b");
t.setPositionIncrement(1);
tokens.add(t);
t.setTermBuffer("c");
tokens.add(t);
tokens.addAttribute(TermAttribute.class);
tokens.addAttribute(PositionIncrementAttribute.class);
AttributeSource state = new AttributeSource();
TermAttribute termAtt = (TermAttribute) state.addAttribute(TermAttribute.class);
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class);
termAtt.setTermBuffer("a");
posIncrAtt.setPositionIncrement(0);
tokens.add(state);
state = new AttributeSource();
termAtt = (TermAttribute) state.addAttribute(TermAttribute.class);
posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class);
termAtt.setTermBuffer("b");
posIncrAtt.setPositionIncrement(1);
tokens.add(state);
state = new AttributeSource();
termAtt = (TermAttribute) state.addAttribute(TermAttribute.class);
posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class);
termAtt.setTermBuffer("c");
posIncrAtt.setPositionIncrement(1);
tokens.add(state);
MockRAMDirectory dir = new MockRAMDirectory();
IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
IndexWriter w = new IndexWriter(dir, new MyAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
Document doc = new Document();
doc.add(new Field("field", tokens));
w.addDocument(doc);

View File

@ -20,19 +20,18 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
/**
* This testcase tests whether multi-level skipping is being used
@ -99,17 +98,19 @@ public class TestMultiLevelSkipList extends LuceneTestCase {
// Filter that attaches a one-byte payload to every token; the byte is a
// class-wide counter narrowed to byte, so consecutive tokens get consecutive values.
// NOTE: flattened diff hunk -- the removed next(Token) method and the added
// incrementToken() method are interleaved below.
private static class PayloadFilter extends TokenFilter {
// Shared across all instances (static) and never reset in the visible code.
static int count = 0;
PayloadAttribute payloadAtt;
protected PayloadFilter(TokenStream input) {
super(input);
payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
}
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (nextToken != null) {
nextToken.setPayload(new Payload(new byte[] { (byte) count++ }));
// Added side: returns whatever the wrapped stream returned, after setting the payload.
public boolean incrementToken() throws IOException {
boolean hasNext = input.incrementToken();
if (hasNext) {
payloadAtt.setPayload(new Payload(new byte[] { (byte) count++ }));
}
return nextToken;
return hasNext;
}
}

View File

@ -27,20 +27,20 @@ import java.util.List;
import java.util.Map;
import java.util.Random;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
public class TestPayloads extends LuceneTestCase {
@ -442,32 +442,33 @@ public class TestPayloads extends LuceneTestCase {
private int length;
private int offset;
Payload payload = new Payload();
PayloadAttribute payloadAtt;
// Wraps the given stream and remembers the byte array plus the offset/length
// window used for payloads; also registers the PayloadAttribute that
// incrementToken() writes to.
public PayloadFilter(TokenStream in, byte[] data, int offset, int length) {
super(in);
this.data = data;
this.length = length;
this.offset = offset;
payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
}
// For each token from the wrapped stream, attaches the next length-byte slice
// of data as payload and advances offset; once the slice would run past the
// end of data the payload is cleared instead.
// NOTE: flattened diff hunk -- removed next(Token) lines and added
// incrementToken() lines are interleaved.
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (nextToken != null) {
public boolean incrementToken() throws IOException {
boolean hasNext = input.incrementToken();
if (hasNext) {
if (offset + length <= data.length) {
// p is always null at this point, so a fresh Payload is created for every token.
Payload p = null;
if (p == null) {
p = new Payload();
nextToken.setPayload(p);
payloadAtt.setPayload(p);
}
p.setData(data, offset, length);
offset += length;
} else {
nextToken.setPayload(null);
payloadAtt.setPayload(null);
}
}
return nextToken;
return hasNext;
}
}
@ -529,19 +530,25 @@ public class TestPayloads extends LuceneTestCase {
private boolean first;
private ByteArrayPool pool;
private String term;
TermAttribute termAtt;
PayloadAttribute payloadAtt;
// Takes a byte array from the pool, fills it via generateRandomData, and
// derives the term text from those same bytes, so term and payload correspond.
// Also registers the payload and term attributes used by incrementToken().
PoolingPayloadTokenStream(ByteArrayPool pool) {
this.pool = pool;
payload = pool.get();
generateRandomData(payload);
term = pool.bytesToString(payload);
first = true;
payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
public Token next(final Token reusableToken) throws IOException {
if (!first) return null;
reusableToken.reinit(term, 0, 0);
reusableToken.setPayload(new Payload(payload));
return reusableToken;
/**
 * Emits a single token whose term text and payload both come from the pooled
 * byte array, then reports end of stream.
 *
 * @return true on the first call, false on every later call
 * @throws IOException declared for the TokenStream contract; not raised here
 */
public boolean incrementToken() throws IOException {
  if (!first) return false;
  // Clear the flag before emitting: it was previously never reset, so every
  // call produced another token and the stream never signalled its end.
  first = false;
  termAtt.setTermBuffer(term);
  payloadAtt.setPayload(new Payload(payload));
  return true;
}
public void close() throws IOException {

View File

@ -17,14 +17,6 @@ package org.apache.lucene.index;
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.MockRAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
@ -32,6 +24,16 @@ import java.util.Iterator;
import java.util.Map;
import java.util.SortedSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.MockRAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
public class TestTermVectorsReader extends LuceneTestCase {
//Must be lexicographically sorted, will do in setup, versus trying to maintain here
private String[] testFields = {"f1", "f2", "f3", "f4"};
@ -118,17 +120,31 @@ public class TestTermVectorsReader extends LuceneTestCase {
// Token stream that replays the enclosing test's tokens array, setting term
// text, start/end offsets and a position increment for each entry.
// NOTE: flattened diff hunk -- the removed next(Token) method and the added
// fields, constructor and incrementToken() method are interleaved below.
private class MyTokenStream extends TokenStream {
// Index of the next entry of tokens to emit.
int tokenUpto;
public Token next(final Token reusableToken) {
TermAttribute termAtt;
PositionIncrementAttribute posIncrAtt;
OffsetAttribute offsetAtt;
public MyTokenStream() {
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
public boolean incrementToken() {
if (tokenUpto >= tokens.length)
return null;
return false;
else {
final TestToken testToken = tokens[tokenUpto++];
reusableToken.reinit(testToken.text, testToken.startOffset, testToken.endOffset);
if (tokenUpto > 1)
reusableToken.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos);
else
reusableToken.setPositionIncrement(testToken.pos+1);
return reusableToken;
termAtt.setTermBuffer(testToken.text);
offsetAtt.setStartOffset(testToken.startOffset);
offsetAtt.setEndOffset(testToken.endOffset);
// tokenUpto was already advanced, so tokens[tokenUpto-2] is the previous
// token: the increment is the distance from it. The first token uses pos+1.
if (tokenUpto > 1) {
posIncrAtt.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos);
} else {
posIncrAtt.setPositionIncrement(testToken.pos+1);
}
return true;
}
}
}

View File

@ -17,18 +17,18 @@ package org.apache.lucene.index;
*/
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import java.io.IOException;
import java.io.Reader;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.io.Reader;
import java.io.IOException;
import java.util.Random;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
/**
* @version $Id$
@ -36,15 +36,21 @@ import java.util.Random;
class RepeatingTokenStream extends TokenStream {
public int num;
Token t;
TermAttribute termAtt;
String value;
public RepeatingTokenStream(String val) {
t = new Token(0,val.length());
t.setTermBuffer(val);
this.value = val;
this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
public Token next(final Token reusableToken) throws IOException {
return --num<0 ? null : (Token) t.clone();
/**
 * Produces the configured value as the term while the countdown in num has
 * not run out. num is decremented on every call, including the calls that
 * report end of stream.
 *
 * @return true if a token was produced, false once num has dropped below zero
 */
public boolean incrementToken() throws IOException {
  final boolean tokensLeft = --num >= 0;
  if (!tokensLeft) {
    return false;
  }
  termAtt.setTermBuffer(value);
  return true;
}
}

View File

@ -17,17 +17,20 @@ package org.apache.lucene.queryParser;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.search.Query;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.LuceneTestCase;
/**
* Test QueryParser's ability to deal with Analyzers that return more
@ -140,34 +143,49 @@ public class TestMultiAnalyzer extends LuceneTestCase {
private final class TestFilter extends TokenFilter {
private Token prevToken;
private String prevType;
private int prevStartOffset;
private int prevEndOffset;
TermAttribute termAtt;
PositionIncrementAttribute posIncrAtt;
OffsetAttribute offsetAtt;
TypeAttribute typeAtt;
// Wraps the given stream and registers the term, position-increment, offset
// and type attributes that incrementToken() reads and writes.
public TestFilter(TokenStream in) {
super(in);
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
}
// Passes input tokens through and, after "multi" or "triplemulti", injects one
// or two extra tokens ("multiN") at position increment 0 that reuse the
// offsets and type of the token that triggered them.
// NOTE: flattened diff hunk -- removed next(Token) lines and added
// incrementToken() lines are interleaved.
public final Token next(final Token reusableToken) throws java.io.IOException {
public final boolean incrementToken() throws java.io.IOException {
// multiToken counts the injected tokens still owed for the last trigger.
if (multiToken > 0) {
reusableToken.reinit("multi"+(multiToken+1), prevToken.startOffset(), prevToken.endOffset(), prevToken.type());
reusableToken.setPositionIncrement(0);
termAtt.setTermBuffer("multi"+(multiToken+1));
offsetAtt.setStartOffset(prevStartOffset);
offsetAtt.setEndOffset(prevEndOffset);
typeAtt.setType(prevType);
posIncrAtt.setPositionIncrement(0);
multiToken--;
return reusableToken;
return true;
} else {
Token nextToken = input.next(reusableToken);
if (nextToken == null) {
prevToken = null;
return null;
boolean next = input.incrementToken();
if (next == false) {
return false;
}
prevToken = (Token) nextToken.clone();
String text = nextToken.term();
// Added side: remember type and offsets in plain fields instead of cloning a Token.
prevType = typeAtt.type();
prevStartOffset = offsetAtt.startOffset();
prevEndOffset = offsetAtt.endOffset();
String text = termAtt.term();
if (text.equals("triplemulti")) {
multiToken = 2;
return nextToken;
return true;
} else if (text.equals("multi")) {
multiToken = 1;
return nextToken;
return true;
} else {
return nextToken;
return true;
}
}
}
@ -192,23 +210,28 @@ public class TestMultiAnalyzer extends LuceneTestCase {
// Filter that drops the token "the", gives "quick" a position increment of 2,
// and gives every other token an increment of 1.
// NOTE: flattened diff hunk -- the removed next(Token) loop and the added
// incrementToken() loop are interleaved below.
private final class TestPosIncrementFilter extends TokenFilter {
TermAttribute termAtt;
PositionIncrementAttribute posIncrAtt;
public TestPosIncrementFilter(TokenStream in) {
super(in);
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
}
public final Token next(final Token reusableToken) throws java.io.IOException {
for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
if (nextToken.term().equals("the")) {
// Added side: keep pulling from the input until a token other than "the" is found.
public final boolean incrementToken () throws java.io.IOException {
while(input.incrementToken()) {
if (termAtt.term().equals("the")) {
// stopword, do nothing
} else if (nextToken.term().equals("quick")) {
nextToken.setPositionIncrement(2);
return nextToken;
} else if (termAtt.term().equals("quick")) {
posIncrAtt.setPositionIncrement(2);
return true;
} else {
nextToken.setPositionIncrement(1);
return nextToken;
posIncrAtt.setPositionIncrement(1);
return true;
}
}
return null;
return false;
}
}

View File

@ -22,7 +22,6 @@ import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;

View File

@ -19,8 +19,8 @@ package org.apache.lucene.queryParser;
import java.io.IOException;
import java.io.Reader;
import java.text.DateFormat;
import java.text.Collator;
import java.text.DateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Locale;
@ -31,11 +31,12 @@ import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
@ -64,36 +65,47 @@ public class TestQueryParser extends LuceneTestCase {
public static Analyzer qpAnalyzer = new QPTestAnalyzer();
// NOTE: flattened diff hunk -- the removed next(Token) method and the added
// fields and incrementToken() method are interleaved below.
public static class QPTestFilter extends TokenFilter {
TermAttribute termAtt;
OffsetAttribute offsetAtt;
/**
* Filter which discards the token 'stop' and which expands the
* token 'phrase' into 'phrase1 phrase2'
*/
public QPTestFilter(TokenStream in) {
super(in);
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
// True when "phrase1" has just been emitted and "phrase2" is still owed.
boolean inPhrase = false;
// Offsets of the original "phrase" token, reused for both expanded tokens.
int savedStart = 0, savedEnd = 0;
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
public boolean incrementToken() throws IOException {
if (inPhrase) {
inPhrase = false;
return reusableToken.reinit("phrase2", savedStart, savedEnd);
termAtt.setTermBuffer("phrase2");
offsetAtt.setStartOffset(savedStart);
offsetAtt.setEndOffset(savedEnd);
return true;
} else
for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
if (nextToken.term().equals("phrase")) {
// Added side: loop so that "stop" tokens are skipped rather than returned.
while (input.incrementToken()) {
if (termAtt.term().equals("phrase")) {
inPhrase = true;
savedStart = nextToken.startOffset();
savedEnd = nextToken.endOffset();
return nextToken.reinit("phrase1", savedStart, savedEnd);
} else if (!nextToken.term().equals("stop"))
return nextToken;
savedStart = offsetAtt.startOffset();
savedEnd = offsetAtt.endOffset();
termAtt.setTermBuffer("phrase1");
offsetAtt.setStartOffset(savedStart);
offsetAtt.setEndOffset(savedEnd);
return true;
} else if (!termAtt.term().equals("stop"))
return true;
}
return null;
return false;
}
}
public static class QPTestAnalyzer extends Analyzer {
/** Filters LowerCaseTokenizer with StopFilter. */

View File

@ -17,14 +17,16 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
@ -49,14 +51,19 @@ public class TestPositionIncrement extends LuceneTestCase {
private final int[] INCREMENTS = {1, 2, 1, 0, 1};
private int i = 0;
// Emits TOKENS[i] with the matching INCREMENTS[i] as position increment and
// with start and end offset both set to the index i, then advances i.
// NOTE: flattened diff hunk -- removed next(Token) lines and added
// attribute fields / incrementToken() lines are interleaved.
public Token next(final Token reusableToken) {
assert reusableToken != null;
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
public boolean incrementToken() {
if (i == TOKENS.length)
return null;
reusableToken.reinit(TOKENS[i], i, i);
reusableToken.setPositionIncrement(INCREMENTS[i]);
return false;
termAtt.setTermBuffer(TOKENS[i]);
offsetAtt.setStartOffset(i);
offsetAtt.setEndOffset(i);
posIncrAtt.setPositionIncrement(INCREMENTS[i]);
i++;
return reusableToken;
return true;
}
};
}
@ -196,18 +203,4 @@ public class TestPositionIncrement extends LuceneTestCase {
StopFilter.setEnablePositionIncrementsDefault(dflt);
}
}
/**
* Basic analyzer behavior should be to keep sequential terms in one
* increment from one another.
*/
// NOTE: this method uses the old Token-based API; the enclosing hunk header
// (18 lines before, 4 after) indicates it is removed by this commit.
public void testIncrementingPositions() throws Exception {
Analyzer analyzer = new WhitespaceAnalyzer();
TokenStream ts = analyzer.tokenStream("field",
new StringReader("one two three four five"));
final Token reusableToken = new Token();
// Every token must report a position increment of 1; the term is the failure message.
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
assertEquals(nextToken.term(), 1, nextToken.getPositionIncrement());
}
}
}

View File

@ -26,7 +26,7 @@ import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
@ -236,23 +236,25 @@ public class TestRangeQuery extends LuceneTestCase {
private static class SingleCharTokenizer extends Tokenizer {
char[] buffer = new char[1];
boolean done;
TermAttribute termAtt;
// Wraps the reader and registers the TermAttribute that incrementToken() fills.
public SingleCharTokenizer(Reader r) {
super(r);
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
// Produces exactly one token: the first character read from the input, or an
// empty term if the read did not return exactly one character.
// NOTE: flattened diff hunk -- removed next(Token) lines and added
// incrementToken() lines are interleaved.
public final Token next(final Token reusableToken) throws IOException {
public boolean incrementToken() throws IOException {
// NOTE(review): the read happens before the done check, so the reader is
// consulted on every call, including calls after the single token was emitted.
int count = input.read(buffer);
if (done)
return null;
return false;
else {
done = true;
if (count == 1) {
reusableToken.termBuffer()[0] = buffer[0];
reusableToken.setTermLength(1);
termAtt.termBuffer()[0] = buffer[0];
termAtt.setTermLength(1);
} else
reusableToken.setTermLength(0);
return reusableToken;
termAtt.setTermLength(0);
return true;
}
}

View File

@ -2,6 +2,7 @@ package org.apache.lucene.search.payloads;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.RAMDirectory;
@ -41,34 +42,36 @@ public class PayloadHelper {
// Filter that attaches a payload chosen by field name: FIELD always gets
// payloadField, MULTI_FIELD alternates between payloadMultiField1 and
// payloadMultiField2, and any other field gets no payload set here.
// NOTE: flattened diff hunk -- the removed next() method and the added
// incrementToken() method are interleaved below.
public class PayloadFilter extends TokenFilter {
String fieldName;
// Number of MULTI_FIELD tokens seen so far; its parity picks the payload.
int numSeen = 0;
PayloadAttribute payloadAtt;
public PayloadFilter(TokenStream input, String fieldName) {
super(input);
this.fieldName = fieldName;
payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
}
public Token next() throws IOException {
Token result = input.next();
if (result != null) {
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
if (fieldName.equals(FIELD))
{
result.setPayload(new Payload(payloadField));
payloadAtt.setPayload(new Payload(payloadField));
}
else if (fieldName.equals(MULTI_FIELD))
{
if (numSeen % 2 == 0)
{
result.setPayload(new Payload(payloadMultiField1));
payloadAtt.setPayload(new Payload(payloadMultiField1));
}
else
{
result.setPayload(new Payload(payloadMultiField2));
payloadAtt.setPayload(new Payload(payloadMultiField2));
}
numSeen++;
}
return true;
}
return result;
return false;
}
}

View File

@ -21,9 +21,9 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
@ -67,28 +67,31 @@ public class TestBoostingTermQuery extends LuceneTestCase {
String fieldName;
int numSeen = 0;
PayloadAttribute payloadAtt;
// Wraps the given stream, remembers which field is being analyzed, and
// registers the PayloadAttribute that incrementToken() writes to.
public PayloadFilter(TokenStream input, String fieldName) {
super(input);
this.fieldName = fieldName;
payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
}
// Attaches a payload chosen by field name: "field" always gets payloadField,
// "multiField" alternates between payloadMultiField1 and payloadMultiField2
// based on the parity of numSeen.
// NOTE: flattened diff hunk -- removed next(Token) lines and added
// incrementToken() lines are interleaved.
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (nextToken != null) {
public boolean incrementToken() throws IOException {
boolean hasNext = input.incrementToken();
if (hasNext) {
if (fieldName.equals("field")) {
nextToken.setPayload(new Payload(payloadField));
payloadAtt.setPayload(new Payload(payloadField));
} else if (fieldName.equals("multiField")) {
if (numSeen % 2 == 0) {
nextToken.setPayload(new Payload(payloadMultiField1));
payloadAtt.setPayload(new Payload(payloadMultiField1));
} else {
nextToken.setPayload(new Payload(payloadMultiField2));
payloadAtt.setPayload(new Payload(payloadMultiField2));
}
numSeen++;
}
return true;
} else {
return false;
}
return nextToken;
}
}

View File

@ -27,9 +27,11 @@ import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
@ -43,8 +45,9 @@ import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.payloads.PayloadHelper;
import org.apache.lucene.search.payloads.PayloadSpanUtil;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
public class TestPayloadSpans extends TestCase {
public class TestPayloadSpans extends LuceneTestCase {
private final static boolean DEBUG = false;
private IndexSearcher searcher;
private Similarity similarity = new DefaultSimilarity();
@ -54,7 +57,8 @@ public class TestPayloadSpans extends TestCase {
super(s);
}
protected void setUp() throws IOException {
protected void setUp() throws Exception {
super.setUp();
PayloadHelper helper = new PayloadHelper();
searcher = helper.setUp(similarity, 1000);
indexReader = searcher.getIndexReader();
@ -345,6 +349,9 @@ public class TestPayloadSpans extends TestCase {
Set entities = new HashSet();
Set nopayload = new HashSet();
int pos;
PayloadAttribute payloadAtt;
TermAttribute termAtt;
PositionIncrementAttribute posIncrAtt;
public PayloadFilter(TokenStream input, String fieldName) {
super(input);
@ -354,24 +361,26 @@ public class TestPayloadSpans extends TestCase {
entities.add("one");
nopayload.add("nopayload");
nopayload.add("np");
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
}
// Tags each token with a payload of the form "<term>:Entity:<pos>" or
// "<term>:Noise:<pos>" unless the term is listed in nopayload, then advances
// pos by the token's position increment.
// NOTE: flattened diff hunk -- removed next() lines and added
// incrementToken() lines are interleaved.
public Token next() throws IOException {
Token result = input.next();
if (result != null) {
String token = new String(result.termBuffer(), 0, result.termLength());
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
String token = new String(termAtt.termBuffer(), 0, termAtt.termLength());
if (!nopayload.contains(token)) {
// NOTE(review): getBytes() without a charset uses the platform default encoding.
if (entities.contains(token)) {
result.setPayload(new Payload((token + ":Entity:"+ pos ).getBytes()));
payloadAtt.setPayload(new Payload((token + ":Entity:"+ pos ).getBytes()));
} else {
result.setPayload(new Payload((token + ":Noise:" + pos ).getBytes()));
payloadAtt.setPayload(new Payload((token + ":Noise:" + pos ).getBytes()));
}
}
// pos is updated after the payload is built, so the payload carries the
// running total from before this token.
pos += result.getPositionIncrement();
pos += posIncrAtt.getPositionIncrement();
return true;
}
return result;
return false;
}
}
}

View File

@ -17,6 +17,7 @@ package org.apache.lucene.util;
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import junit.framework.TestCase;
@ -42,6 +43,7 @@ public abstract class LuceneTestCase extends TestCase {
// Per-test setup shared by all LuceneTestCase subclasses: puts
// ConcurrentMergeScheduler into test mode and switches the TokenStream
// default to the new (attribute-based) API via a static setter.
protected void setUp() throws Exception {
ConcurrentMergeScheduler.setTestMode();
TokenStream.setUseNewAPIDefault(true);
}
protected void tearDown() throws Exception {