mirror of https://github.com/apache/lucene.git
LUCENE-1422: New TokenStream API that uses a new class called AttributeSource instead of the now deprecated Token class. All attributes that the Token class had have been moved into separate classes: TermAttribute, OffsetAttribute, PositionIncrementAttribute, PayloadAttribute, TypeAttribute and FlagsAttribute.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@718798 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
72e94add53
commit
898cfe87cd
|
@ -25,6 +25,15 @@ API Changes
|
|||
and deprecate FSDirectory.getDirectory(). FSDirectory instances
|
||||
are not required to be singletons per path. (yonik)
|
||||
|
||||
4. LUCENE-1422: New TokenStream API that uses a new class called
|
||||
AttributeSource instead of the now deprecated Token class. All attributes
|
||||
that the Token class had have been moved into separate classes:
|
||||
TermAttribute, OffsetAttribute, PositionIncrementAttribute,
|
||||
PayloadAttribute, TypeAttribute and FlagsAttribute. The new API
|
||||
is much more flexible; it makes it possible to combine the Attributes arbitrarily
|
||||
and also to define custom Attributes. The new API has the same performance
|
||||
as the old next(Token) approach. (Michael Busch)
|
||||
|
||||
Bug fixes
|
||||
|
||||
1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals()
|
||||
|
|
|
@ -22,6 +22,8 @@ import java.util.Iterator;
|
|||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
/**
|
||||
* This class can be used if the Tokens of a TokenStream
|
||||
* are intended to be consumed more than once. It caches
|
||||
|
@ -40,6 +42,25 @@ public class CachingTokenFilter extends TokenFilter {
|
|||
super(input);
|
||||
}
|
||||
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (cache == null) {
|
||||
// fill cache lazily
|
||||
cache = new LinkedList();
|
||||
fillCache();
|
||||
iterator = cache.iterator();
|
||||
}
|
||||
|
||||
if (!iterator.hasNext()) {
|
||||
// the cache is exhausted, return false
|
||||
return false;
|
||||
}
|
||||
// Since the TokenFilter can be reset, the tokens need to be preserved as immutable.
|
||||
AttributeSource state = (AttributeSource) iterator.next();
|
||||
state.restoreState(this);
|
||||
return true;
|
||||
}
|
||||
|
||||
/** @deprecated */
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
if (cache == null) {
|
||||
|
@ -64,6 +85,13 @@ public class CachingTokenFilter extends TokenFilter {
|
|||
}
|
||||
}
|
||||
|
||||
private void fillCache() throws IOException {
|
||||
while(input.incrementToken()) {
|
||||
cache.add(captureState());
|
||||
}
|
||||
}
|
||||
|
||||
/** @deprecated */
|
||||
private void fillCache(final Token reusableToken) throws IOException {
|
||||
for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
|
||||
cache.add(nextToken.clone());
|
||||
|
|
|
@ -20,10 +20,15 @@ package org.apache.lucene.analysis;
|
|||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
/** An abstract base class for simple, character-oriented tokenizers.*/
|
||||
public abstract class CharTokenizer extends Tokenizer {
|
||||
public CharTokenizer(Reader input) {
|
||||
super(input);
|
||||
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
}
|
||||
|
||||
private int offset = 0, bufferIndex = 0, dataLen = 0;
|
||||
|
@ -31,6 +36,9 @@ public abstract class CharTokenizer extends Tokenizer {
|
|||
private static final int IO_BUFFER_SIZE = 4096;
|
||||
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
|
||||
|
||||
private TermAttribute termAtt;
|
||||
private OffsetAttribute offsetAtt;
|
||||
|
||||
/** Returns true iff a character should be included in a token. This
|
||||
* tokenizer generates as tokens adjacent sequences of characters which
|
||||
* satisfy this predicate. Characters for which this is false are used to
|
||||
|
@ -44,6 +52,50 @@ public abstract class CharTokenizer extends Tokenizer {
|
|||
return c;
|
||||
}
|
||||
|
||||
public final boolean incrementToken() throws IOException {
|
||||
clearAttributes();
|
||||
int length = 0;
|
||||
int start = bufferIndex;
|
||||
char[] buffer = termAtt.termBuffer();
|
||||
while (true) {
|
||||
|
||||
if (bufferIndex >= dataLen) {
|
||||
offset += dataLen;
|
||||
dataLen = input.read(ioBuffer);
|
||||
if (dataLen == -1) {
|
||||
if (length > 0)
|
||||
break;
|
||||
else
|
||||
return false;
|
||||
}
|
||||
bufferIndex = 0;
|
||||
}
|
||||
|
||||
final char c = ioBuffer[bufferIndex++];
|
||||
|
||||
if (isTokenChar(c)) { // if it's a token char
|
||||
|
||||
if (length == 0) // start of token
|
||||
start = offset + bufferIndex - 1;
|
||||
else if (length == buffer.length)
|
||||
buffer = termAtt.resizeTermBuffer(1+length);
|
||||
|
||||
buffer[length++] = normalize(c); // buffer it, normalized
|
||||
|
||||
if (length == MAX_WORD_LEN) // buffer overflow!
|
||||
break;
|
||||
|
||||
} else if (length > 0) // at non-Letter w/ chars
|
||||
break; // return 'em
|
||||
}
|
||||
|
||||
termAtt.setTermLength(length);
|
||||
offsetAtt.setStartOffset(start);
|
||||
offsetAtt.setEndOffset(start+length);
|
||||
return true;
|
||||
}
|
||||
|
||||
/** @deprecated */
|
||||
public final Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
reusableToken.clear();
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
|
@ -27,11 +29,33 @@ package org.apache.lucene.analysis;
|
|||
public class ISOLatin1AccentFilter extends TokenFilter {
|
||||
public ISOLatin1AccentFilter(TokenStream input) {
|
||||
super(input);
|
||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
}
|
||||
|
||||
private char[] output = new char[256];
|
||||
private int outputPos;
|
||||
private TermAttribute termAtt;
|
||||
|
||||
public final boolean incrementToken() throws java.io.IOException {
|
||||
if (input.incrementToken()) {
|
||||
final char[] buffer = termAtt.termBuffer();
|
||||
final int length = termAtt.termLength();
|
||||
// If no characters actually require rewriting then we
|
||||
// just return token as-is:
|
||||
for(int i=0;i<length;i++) {
|
||||
final char c = buffer[i];
|
||||
if (c >= '\u00c0' && c <= '\uFB06') {
|
||||
removeAccents(buffer, length);
|
||||
termAtt.setTermBuffer(output, 0, outputPos);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
} else
|
||||
return false;
|
||||
}
|
||||
|
||||
/** @deprecated */
|
||||
public final Token next(final Token reusableToken) throws java.io.IOException {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken);
|
||||
|
|
|
@ -20,6 +20,9 @@ package org.apache.lucene.analysis;
|
|||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
/**
|
||||
* Emits the entire input as a single token.
|
||||
*/
|
||||
|
@ -28,6 +31,8 @@ public class KeywordTokenizer extends Tokenizer {
|
|||
private static final int DEFAULT_BUFFER_SIZE = 256;
|
||||
|
||||
private boolean done;
|
||||
private TermAttribute termAtt;
|
||||
private OffsetAttribute offsetAtt;
|
||||
|
||||
public KeywordTokenizer(Reader input) {
|
||||
this(input, DEFAULT_BUFFER_SIZE);
|
||||
|
@ -36,8 +41,32 @@ public class KeywordTokenizer extends Tokenizer {
|
|||
public KeywordTokenizer(Reader input, int bufferSize) {
|
||||
super(input);
|
||||
this.done = false;
|
||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||
}
|
||||
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (!done) {
|
||||
done = true;
|
||||
int upto = 0;
|
||||
termAtt.clear();
|
||||
char[] buffer = termAtt.termBuffer();
|
||||
while (true) {
|
||||
final int length = input.read(buffer, upto, buffer.length-upto);
|
||||
if (length == -1) break;
|
||||
upto += length;
|
||||
if (upto == buffer.length)
|
||||
buffer = termAtt.resizeTermBuffer(1+buffer.length);
|
||||
}
|
||||
termAtt.setTermLength(upto);
|
||||
offsetAtt.setStartOffset(0);
|
||||
offsetAtt.setEndOffset(upto);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/** @deprecated */
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
if (!done) {
|
||||
|
|
|
@ -19,6 +19,8 @@ package org.apache.lucene.analysis;
|
|||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
/**
|
||||
* Removes words that are too long or too short from the stream.
|
||||
*
|
||||
|
@ -30,6 +32,8 @@ public final class LengthFilter extends TokenFilter {
|
|||
final int min;
|
||||
final int max;
|
||||
|
||||
private TermAttribute termAtt;
|
||||
|
||||
/**
|
||||
* Build a filter that removes words that are too long or too
|
||||
* short from the text.
|
||||
|
@ -39,11 +43,29 @@ public final class LengthFilter extends TokenFilter {
|
|||
super(in);
|
||||
this.min = min;
|
||||
this.max = max;
|
||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the next input Token whose term() is the right len
|
||||
*/
|
||||
public final boolean incrementToken() throws IOException {
|
||||
// return the first token whose term length lies within [min, max]
|
||||
while (input.incrementToken()) {
|
||||
int len = termAtt.termLength();
|
||||
if (len >= min && len <= max) {
|
||||
return true;
|
||||
}
|
||||
// note: else we ignore it but should we index each part of it?
|
||||
}
|
||||
// reached EOS -- return false
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the next input Token whose term() is the right len
|
||||
* @deprecated
|
||||
*/
|
||||
public final Token next(final Token reusableToken) throws IOException
|
||||
{
|
||||
assert reusableToken != null;
|
||||
|
|
|
@ -19,6 +19,8 @@ package org.apache.lucene.analysis;
|
|||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
/**
|
||||
* Normalizes token text to lower case.
|
||||
*
|
||||
|
@ -27,8 +29,25 @@ import java.io.IOException;
|
|||
public final class LowerCaseFilter extends TokenFilter {
|
||||
public LowerCaseFilter(TokenStream in) {
|
||||
super(in);
|
||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
}
|
||||
|
||||
private TermAttribute termAtt;
|
||||
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
|
||||
final char[] buffer = termAtt.termBuffer();
|
||||
final int length = termAtt.termLength();
|
||||
for(int i=0;i<length;i++)
|
||||
buffer[i] = Character.toLowerCase(buffer[i]);
|
||||
|
||||
return true;
|
||||
} else
|
||||
return false;
|
||||
}
|
||||
|
||||
/** @deprecated */
|
||||
public final Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken);
|
||||
|
|
|
@ -19,6 +19,8 @@ package org.apache.lucene.analysis;
|
|||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
/** Transforms the token stream as per the Porter stemming algorithm.
|
||||
Note: the input to the stemming filter must already be in lower case,
|
||||
so you will need to use LowerCaseFilter or LowerCaseTokenizer farther
|
||||
|
@ -39,12 +41,24 @@ import java.io.IOException;
|
|||
*/
|
||||
public final class PorterStemFilter extends TokenFilter {
|
||||
private PorterStemmer stemmer;
|
||||
private TermAttribute termAtt;
|
||||
|
||||
public PorterStemFilter(TokenStream in) {
|
||||
super(in);
|
||||
stemmer = new PorterStemmer();
|
||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
}
|
||||
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if (!input.incrementToken())
|
||||
return false;
|
||||
|
||||
if (stemmer.stem(termAtt.termBuffer(), 0, termAtt.termLength()))
|
||||
termAtt.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
|
||||
return true;
|
||||
}
|
||||
|
||||
/** @deprecated */
|
||||
public final Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken);
|
||||
|
|
|
@ -22,6 +22,8 @@ import java.util.ArrayList;
|
|||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
|
||||
/**
|
||||
* A SinkTokenizer can be used to cache Tokens for use in an Analyzer
|
||||
|
@ -61,10 +63,30 @@ public class SinkTokenizer extends Tokenizer {
|
|||
return lst;
|
||||
}
|
||||
|
||||
/**
|
||||
* Increments this stream to the next token out of the list of cached tokens
|
||||
* @throws IOException
|
||||
*/
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (iter == null) iter = lst.iterator();
|
||||
// Since this TokenStream can be reset we have to maintain the tokens as immutable
|
||||
if (iter.hasNext()) {
|
||||
AttributeSource state = (AttributeSource) iter.next();
|
||||
state.restoreState(this);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public void add(AttributeSource source) throws IOException {
|
||||
lst.add(source);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the next token out of the list of cached tokens
|
||||
* @return The next {@link org.apache.lucene.analysis.Token} in the Sink.
|
||||
* @throws IOException
|
||||
* @deprecated
|
||||
*/
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
|
@ -77,8 +99,6 @@ public class SinkTokenizer extends Tokenizer {
|
|||
return null;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Override this method to cache only certain tokens, or new tokens based
|
||||
* on the old tokens.
|
||||
|
|
|
@ -21,6 +21,9 @@ import java.io.IOException;
|
|||
import java.util.Arrays;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
/**
|
||||
* Removes stop words from a token stream.
|
||||
*/
|
||||
|
@ -32,6 +35,9 @@ public final class StopFilter extends TokenFilter {
|
|||
private final CharArraySet stopWords;
|
||||
private boolean enablePositionIncrements = ENABLE_POSITION_INCREMENTS_DEFAULT;
|
||||
|
||||
private TermAttribute termAtt;
|
||||
private PositionIncrementAttribute posIncrAtt;
|
||||
|
||||
/**
|
||||
* Construct a token stream filtering the given input.
|
||||
*/
|
||||
|
@ -47,6 +53,7 @@ public final class StopFilter extends TokenFilter {
|
|||
public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) {
|
||||
super(in);
|
||||
this.stopWords = (CharArraySet)makeStopSet(stopWords, ignoreCase);
|
||||
init();
|
||||
}
|
||||
|
||||
|
||||
|
@ -74,6 +81,7 @@ public final class StopFilter extends TokenFilter {
|
|||
this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
|
||||
this.stopWords.addAll(stopWords);
|
||||
}
|
||||
init();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -86,6 +94,11 @@ public final class StopFilter extends TokenFilter {
|
|||
this(in, stopWords, false);
|
||||
}
|
||||
|
||||
public void init() {
|
||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a Set from an array of stop words,
|
||||
* appropriate for passing into the StopFilter constructor.
|
||||
|
@ -113,6 +126,26 @@ public final class StopFilter extends TokenFilter {
|
|||
/**
|
||||
* Returns the next input Token whose term() is not a stop word.
|
||||
*/
|
||||
public final boolean incrementToken() throws IOException {
|
||||
// return the first non-stop word found
|
||||
int skippedPositions = 0;
|
||||
while (input.incrementToken()) {
|
||||
if (!stopWords.contains(termAtt.termBuffer(), 0, termAtt.termLength())) {
|
||||
if (enablePositionIncrements) {
|
||||
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
skippedPositions += posIncrAtt.getPositionIncrement();
|
||||
}
|
||||
// reached EOS -- return false
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the next input Token whose term() is not a stop word.
|
||||
* @deprecated
|
||||
*/
|
||||
public final Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
// return the first non-stop word found
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -60,8 +61,21 @@ public class TeeTokenFilter extends TokenFilter {
|
|||
public TeeTokenFilter(TokenStream input, SinkTokenizer sink) {
|
||||
super(input);
|
||||
this.sink = sink;
|
||||
Iterator it = getAttributesIterator();
|
||||
while (it.hasNext()) {
|
||||
sink.addAttribute(it.next().getClass());
|
||||
}
|
||||
}
|
||||
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
sink.add(captureState());
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/** @deprecated */
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken);
|
||||
|
|
|
@ -21,7 +21,11 @@ import org.apache.lucene.index.Payload;
|
|||
import org.apache.lucene.index.TermPositions; // for javadoc
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
|
||||
/** A Token is an occurrence of a term from the text of a field. It consists of
|
||||
/**
|
||||
This class is now deprecated and a new TokenStream API was introduced with Lucene 2.9.
|
||||
See Javadocs in {@link TokenStream} for further details.
|
||||
<p>
|
||||
A Token is an occurrence of a term from the text of a field. It consists of
|
||||
a term's text, the start and end offset of the term in the text of the field,
|
||||
and a type string.
|
||||
<p>
|
||||
|
@ -114,6 +118,8 @@ import org.apache.lucene.util.ArrayUtil;
|
|||
</p>
|
||||
|
||||
@see org.apache.lucene.index.Payload
|
||||
@deprecated A new TokenStream API was introduced with Lucene 2.9.
|
||||
See javadocs in {@link TokenStream} for further details.
|
||||
*/
|
||||
public class Token implements Cloneable {
|
||||
|
||||
|
|
|
@ -22,9 +22,16 @@ import java.io.IOException;
|
|||
/** A TokenFilter is a TokenStream whose input is another token stream.
|
||||
<p>
|
||||
This is an abstract class.
|
||||
NOTE: subclasses must override {@link #next(Token)}. It's
|
||||
also OK to instead override {@link #next()} but that
|
||||
method is now deprecated in favor of {@link #next(Token)}.
|
||||
NOTE: subclasses must override
|
||||
{@link #incrementToken()} if the new TokenStream API is used
|
||||
and {@link #next(Token)} or {@link #next()} if the old
|
||||
TokenStream API is used.
|
||||
* <p><font color="#FF0000">
|
||||
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
|
||||
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
|
||||
* We will make our best efforts to keep the APIs backwards-compatible.</font>
|
||||
<p>
|
||||
See {@link TokenStream}
|
||||
*/
|
||||
public abstract class TokenFilter extends TokenStream {
|
||||
/** The source of tokens for this filter. */
|
||||
|
@ -32,6 +39,7 @@ public abstract class TokenFilter extends TokenStream {
|
|||
|
||||
/** Construct a token stream filtering the given input. */
|
||||
protected TokenFilter(TokenStream input) {
|
||||
super(input);
|
||||
this.input = input;
|
||||
}
|
||||
|
||||
|
@ -45,4 +53,17 @@ public abstract class TokenFilter extends TokenStream {
|
|||
super.reset();
|
||||
input.reset();
|
||||
}
|
||||
|
||||
public boolean useNewAPI() {
|
||||
return input.useNewAPI();
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets whether or not to use the new TokenStream API. Setting this
|
||||
* will apply to this Filter and all TokenStream/Filters upstream.
|
||||
*/
|
||||
public void setUseNewAPI(boolean use) {
|
||||
input.setUseNewAPI(use);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -17,9 +17,12 @@ package org.apache.lucene.analysis;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.Payload;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.lucene.index.Payload;
|
||||
import org.apache.lucene.util.Attribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
/** A TokenStream enumerates the sequence of tokens, either from
|
||||
fields of a document or from query text.
|
||||
|
@ -31,12 +34,139 @@ import java.io.IOException;
|
|||
<li>{@link TokenFilter}, a TokenStream
|
||||
whose input is another TokenStream.
|
||||
</ul>
|
||||
NOTE: subclasses must override {@link #next(Token)}. It's
|
||||
also OK to instead override {@link #next()} but that
|
||||
method is now deprecated in favor of {@link #next(Token)}.
|
||||
A new TokenStream API is introduced with Lucene 2.9. Since
|
||||
2.9 Token is deprecated and the preferred way to store
|
||||
the information of a token is to use {@link Attribute}s.
|
||||
<p>
|
||||
For that reason TokenStream extends {@link AttributeSource}
|
||||
now. Note that only one instance per {@link Attribute} is
|
||||
created and reused for every token. This approach reduces
|
||||
object creations and allows local caching of references to
|
||||
the {@link Attribute}s. See {@link #incrementToken()} for further details.
|
||||
<p>
|
||||
<b>The workflow of the new TokenStream API is as follows:</b>
|
||||
<ol>
|
||||
<li>Instantiation of TokenStream/TokenFilters which add/get attributes
|
||||
to/from the {@link AttributeSource}.
|
||||
<li>The consumer calls {@link TokenStream#reset()}.
|
||||
<li>The consumer retrieves attributes from the
|
||||
stream and stores local references to all attributes it wants to access
|
||||
<li>The consumer calls {@link #incrementToken()} until it returns false and
|
||||
consumes the attributes after each call.
|
||||
</ol>
|
||||
To make sure that filters and consumers know which attributes are available
|
||||
the attributes must be added during instantiation. Filters and
|
||||
consumers are not required to check for availability of attributes in {@link #incrementToken()}.
|
||||
<p>
|
||||
Sometimes it is desirable to capture a current state of a
|
||||
TokenStream, e. g. for buffering purposes (see {@link CachingTokenFilter},
|
||||
{@link TeeTokenFilter}/{@link SinkTokenizer}). For this usecase
|
||||
{@link AttributeSource#captureState()} and {@link AttributeSource#restoreState(AttributeSource)} can be used.
|
||||
<p>
|
||||
<b>NOTE:</b> In order to enable the new API the method
|
||||
{@link #setUseNewAPI(boolean)} has to be called with useNewAPI=true.
|
||||
Otherwise the deprecated method {@link #next(Token)} will
|
||||
be used by Lucene consumers (indexer and queryparser) to
|
||||
consume the tokens. {@link #next(Token)} will be removed
|
||||
in Lucene 3.0.
|
||||
<p>
|
||||
NOTE: To use the old API subclasses must override {@link #next(Token)}.
|
||||
It's also OK to instead override {@link #next()} but that
|
||||
method is slower compared to {@link #next(Token)}.
|
||||
* <p><font color="#FF0000">
|
||||
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
|
||||
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
|
||||
* We will make our best efforts to keep the APIs backwards-compatible.</font>
|
||||
*/
|
||||
|
||||
public abstract class TokenStream {
|
||||
public abstract class TokenStream extends AttributeSource {
|
||||
private static boolean useNewAPIDefault = false;
|
||||
private boolean useNewAPI = useNewAPIDefault;
|
||||
|
||||
protected TokenStream() {
|
||||
super();
|
||||
}
|
||||
|
||||
protected TokenStream(AttributeSource input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns whether or not the new TokenStream APIs are used
|
||||
* by default.
|
||||
* (see {@link #incrementToken()}, {@link AttributeSource}).
|
||||
*/
|
||||
public static boolean useNewAPIDefault() {
|
||||
return useNewAPIDefault;
|
||||
}
|
||||
|
||||
/**
|
||||
* Use this API to enable or disable the new TokenStream API
|
||||
* by default. Can be overridden by calling {@link #setUseNewAPI(boolean)}.
|
||||
* (see {@link #incrementToken()}, {@link AttributeSource}).
|
||||
* <p>
|
||||
* If set to true, the indexer will call {@link #incrementToken()}
|
||||
* to consume Tokens from this stream.
|
||||
* <p>
|
||||
* If set to false, the indexer will call {@link #next(Token)}
|
||||
* instead.
|
||||
*/
|
||||
public static void setUseNewAPIDefault(boolean use) {
|
||||
useNewAPIDefault = use;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns whether or not the new TokenStream APIs are used
|
||||
* for this stream.
|
||||
* (see {@link #incrementToken()}, {@link AttributeSource}).
|
||||
*/
|
||||
public boolean useNewAPI() {
|
||||
return useNewAPI;
|
||||
}
|
||||
|
||||
/**
|
||||
* Use this API to enable or disable the new TokenStream API
|
||||
* for this stream. Overrides {@link #setUseNewAPIDefault(boolean)}.
|
||||
* (see {@link #incrementToken()}, {@link AttributeSource}).
|
||||
* <p>
|
||||
* If set to true, the indexer will call {@link #incrementToken()}
|
||||
* to consume Tokens from this stream.
|
||||
* <p>
|
||||
* If set to false, the indexer will call {@link #next(Token)}
|
||||
* instead.
|
||||
* <p>
|
||||
* <b>NOTE: All streams and filters in one chain must use the
|
||||
* same API. </b>
|
||||
*/
|
||||
public void setUseNewAPI(boolean use) {
|
||||
useNewAPI = use;
|
||||
}
|
||||
|
||||
/**
|
||||
* Consumers (e. g. the indexer) use this method to advance the stream
|
||||
* to the next token. Implementing classes must implement this method
|
||||
* and update the appropriate {@link Attribute}s with content of the
|
||||
* next token.
|
||||
* <p>
|
||||
* This method is called for every token of a document, so an efficient
|
||||
* implementation is crucial for good performance. To avoid calls to
|
||||
* {@link #addAttribute(Class)} and {@link #getAttribute(Class)} and
|
||||
* downcasts, references to all {@link Attribute}s that this stream uses
|
||||
* should be retrieved during instantiation.
|
||||
* <p>
|
||||
* To make sure that filters and consumers know which attributes are available
|
||||
* the attributes must be added during instantiation. Filters and
|
||||
* consumers are not required to check for availability of attributes in {@link #incrementToken()}.
|
||||
*
|
||||
* @return false for end of stream; true otherwise
|
||||
*
|
||||
* <p>
|
||||
* <b>Note that this method will be defined abstract in Lucene 3.0.</b>
|
||||
*/
|
||||
public boolean incrementToken() throws IOException {
|
||||
// subclasses must implement this method; will be made abstract in Lucene 3.0
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Returns the next token in the stream, or null at EOS.
|
||||
* @deprecated The returned Token is a "full private copy" (not
|
||||
|
@ -84,6 +214,8 @@ public abstract class TokenStream {
|
|||
* is not required to check for null before using it, but it is a
|
||||
* good idea to assert that it is not null.)
|
||||
* @return next token in the stream or null if end-of-stream was hit
|
||||
* @deprecated The new {@link #incrementToken()} and {@link AttributeSource}
|
||||
* APIs should be used instead. See also {@link #useNewAPI()}.
|
||||
*/
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
// We don't actually use inputToken, but still add this assert
|
||||
|
@ -107,4 +239,25 @@ public abstract class TokenStream {
|
|||
|
||||
/** Releases resources associated with this stream. */
|
||||
public void close() throws IOException {}
|
||||
|
||||
public String toString() {
|
||||
StringBuffer sb = new StringBuffer();
|
||||
sb.append('(');
|
||||
|
||||
if (hasAttributes()) {
|
||||
// TODO Java 1.5
|
||||
//Iterator<Attribute> it = attributes.values().iterator();
|
||||
Iterator it = getAttributesIterator();
|
||||
if (it.hasNext()) {
|
||||
sb.append(it.next().toString());
|
||||
}
|
||||
while (it.hasNext()) {
|
||||
sb.append(',');
|
||||
sb.append(it.next().toString());
|
||||
}
|
||||
}
|
||||
sb.append(')');
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -24,12 +24,23 @@ import java.io.IOException;
|
|||
<p>
|
||||
This is an abstract class.
|
||||
<p>
|
||||
NOTE: subclasses must override {@link #next(Token)}. It's
|
||||
also OK to instead override {@link #next()} but that
|
||||
method is now deprecated in favor of {@link #next(Token)}.
|
||||
<b>NOTE:</b> In order to enable the new API the method
|
||||
{@link #setUseNewAPI(boolean)} has to be called with useNewAPI=true.
|
||||
Otherwise the deprecated method {@link #next(Token)} will
|
||||
be used by Lucene consumers (indexer and queryparser) to
|
||||
consume the tokens. {@link #next(Token)} will be removed
|
||||
in Lucene 3.0.
|
||||
<p>
|
||||
NOTE: To use the old API subclasses must override {@link #next(Token)}.
|
||||
It's also OK to instead override {@link #next()} but that
|
||||
method is slower compared to {@link #next(Token)}.
|
||||
<p>
|
||||
NOTE: subclasses overriding {@link #next(Token)} must
|
||||
call {@link Token#clear()}.
|
||||
* <p><font color="#FF0000">
|
||||
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
|
||||
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
|
||||
* We will make our best efforts to keep the APIs backwards-compatible.</font>
|
||||
*/
|
||||
|
||||
public abstract class Tokenizer extends TokenStream {
|
||||
|
|
|
@ -35,8 +35,7 @@ application using Lucene to use an appropriate <i>Parser</i> to convert the orig
|
|||
<h2>Tokenization</h2>
|
||||
<p>
|
||||
Plain text passed to Lucene for indexing goes through a process generally called tokenization – namely breaking of the
|
||||
input text into small indexing elements –
|
||||
{@link org.apache.lucene.analysis.Token Tokens}.
|
||||
input text into small indexing elements – tokens.
|
||||
The way input text is broken into tokens very
|
||||
much dictates further capabilities of search upon that text.
|
||||
For instance, sentence beginnings and endings can be identified to provide for more accurate phrase
|
||||
|
@ -72,12 +71,13 @@ providing for several functions, including (but not limited to):
|
|||
<li>{@link org.apache.lucene.analysis.Analyzer} – An Analyzer is responsible for building a {@link org.apache.lucene.analysis.TokenStream} which can be consumed
|
||||
by the indexing and searching processes. See below for more information on implementing your own Analyzer.</li>
|
||||
<li>{@link org.apache.lucene.analysis.Tokenizer} – A Tokenizer is a {@link org.apache.lucene.analysis.TokenStream} and is responsible for breaking
|
||||
up incoming text into {@link org.apache.lucene.analysis.Token}s. In most cases, an Analyzer will use a Tokenizer as the first step in
|
||||
up incoming text into tokens. In most cases, an Analyzer will use a Tokenizer as the first step in
|
||||
the analysis process.</li>
|
||||
<li>{@link org.apache.lucene.analysis.TokenFilter} – A TokenFilter is also a {@link org.apache.lucene.analysis.TokenStream} and is responsible
|
||||
for modifying {@link org.apache.lucene.analysis.Token}s that have been created by the Tokenizer. Common modifications performed by a
|
||||
for modifying tokens that have been created by the Tokenizer. Common modifications performed by a
|
||||
TokenFilter are: deletion, stemming, synonym injection, and down casing. Not all Analyzers require TokenFilters</li>
|
||||
</ul>
|
||||
<b>Since Lucene 2.9 the TokenStream API was changed. Please see section "New TokenStream API" below for details.</b>
|
||||
</p>
|
||||
<h2>Hints, Tips and Traps</h2>
|
||||
<p>
|
||||
|
@ -140,9 +140,8 @@ providing for several functions, including (but not limited to):
|
|||
<PRE>
|
||||
Analyzer analyzer = new StandardAnalyzer(); // or any other analyzer
|
||||
TokenStream ts = analyzer.tokenStream("myfield",new StringReader("some text goes here"));
|
||||
Token t = ts.next();
|
||||
while (t!=null) {
|
||||
System.out.println("token: "+t));
|
||||
while (ts.incrementToken()) {
|
||||
System.out.println("token: "+ts);
|
||||
t = ts.next();
|
||||
}
|
||||
</PRE>
|
||||
|
@ -179,7 +178,7 @@ the source code of any one of the many samples located in this package.
|
|||
<p>
|
||||
The following sections discuss some aspects of implementing your own analyzer.
|
||||
</p>
|
||||
<h3>Field Section Boundaries</h2>
|
||||
<h3>Field Section Boundaries</h3>
|
||||
<p>
|
||||
When {@link org.apache.lucene.document.Document#add(org.apache.lucene.document.Fieldable) document.add(field)}
|
||||
is called multiple times for the same field name, we could say that each such call creates a new
|
||||
|
@ -208,10 +207,10 @@ the source code of any one of the many samples located in this package.
|
|||
};
|
||||
</PRE>
|
||||
</p>
|
||||
<h3>Token Position Increments</h2>
|
||||
<h3>Token Position Increments</h3>
|
||||
<p>
|
||||
By default, all tokens created by Analyzers and Tokenizers have a
|
||||
{@link org.apache.lucene.analysis.Token#getPositionIncrement() position increment} of one.
|
||||
{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#getPositionIncrement() position increment} of one.
|
||||
This means that the position stored for that token in the index would be one more than
|
||||
that of the previous token.
|
||||
Recall that phrase and proximity searches rely on position info.
|
||||
|
@ -227,26 +226,29 @@ the source code of any one of the many samples located in this package.
|
|||
If this behavior does not fit the application needs,
|
||||
a modified analyzer can be used, that would increment further the positions of
|
||||
tokens following a removed stop word, using
|
||||
{@link org.apache.lucene.analysis.Token#setPositionIncrement(int)}.
|
||||
{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#setPositionIncrement(int)}.
|
||||
This can be done with something like:
|
||||
<PRE>
|
||||
public TokenStream tokenStream(final String fieldName, Reader reader) {
|
||||
final TokenStream ts = someAnalyzer.tokenStream(fieldName, reader);
|
||||
TokenStream res = new TokenStream() {
|
||||
public Token next() throws IOException {
|
||||
TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
public boolean incrementToken() throws IOException {
|
||||
int extraIncrement = 0;
|
||||
while (true) {
|
||||
Token t = ts.next();
|
||||
if (t!=null) {
|
||||
if (stopWords.contains(t.termText())) {
|
||||
boolean hasNext = ts.incrementToken();
|
||||
if (hasNext) {
|
||||
if (stopWords.contains(termAtt.term())) {
|
||||
extraIncrement++; // filter this word
|
||||
continue;
|
||||
}
|
||||
if (extraIncrement>0) {
|
||||
t.setPositionIncrement(t.getPositionIncrement()+extraIncrement);
|
||||
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+extraIncrement);
|
||||
}
|
||||
}
|
||||
return t;
|
||||
return hasNext;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -268,5 +270,336 @@ the source code of any one of the many samples located in this package.
|
|||
same position as that token, and so would they be seen by phrase and proximity searches.</li>
|
||||
</ol>
|
||||
</p>
|
||||
<h2>New TokenStream API</h2>
|
||||
<p>
|
||||
With Lucene 2.9 we introduce a new TokenStream API. The old API used to produce Tokens. A Token
|
||||
has getter and setter methods for different properties like positionIncrement and termText.
|
||||
While this approach was sufficient for the default indexing format, it is not versatile enough for
|
||||
Flexible Indexing, a term which summarizes the effort of making the Lucene indexer pluggable and extensible for custom
|
||||
index formats.
|
||||
</p>
|
||||
<p>
|
||||
A fully customizable indexer means that users will be able to store custom data structures on disk. Therefore an API
|
||||
is necessary that can transport custom types of data from the documents to the indexer.
|
||||
</p>
|
||||
<h3>Attribute and AttributeSource</h3>
|
||||
Lucene 2.9 therefore introduces a new pair of classes called {@link org.apache.lucene.util.Attribute} and
|
||||
{@link org.apache.lucene.util.AttributeSource}. An Attribute serves as a
|
||||
particular piece of information about a text token. For example, {@link org.apache.lucene.analysis.tokenattributes.TermAttribute}
|
||||
contains the term text of a token, and {@link org.apache.lucene.analysis.tokenattributes.OffsetAttribute} contains the start and end character offsets of a token.
|
||||
An AttributeSource is a collection of Attributes with a restriction: there may be only one instance of each attribute type. TokenStream now extends AttributeSource, which
|
||||
means that one can add Attributes to a TokenStream. Since TokenFilter extends TokenStream, all filters are also
|
||||
AttributeSources.
|
||||
<p>
|
||||
Lucene now provides six Attributes out of the box, which replace the variables the Token class has:
|
||||
<ul>
|
||||
<li>{@link org.apache.lucene.analysis.tokenattributes.TermAttribute}<p>The term text of a token.</p></li>
|
||||
<li>{@link org.apache.lucene.analysis.tokenattributes.OffsetAttribute}<p>The start and end offset of a token in characters.</p></li>
|
||||
<li>{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute}<p>See above for detailed information about position increment.</p></li>
|
||||
<li>{@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute}<p>The payload that a Token can optionally have.</p></li>
|
||||
<li>{@link org.apache.lucene.analysis.tokenattributes.TypeAttribute}<p>The type of the token. Default is 'word'.</p></li>
|
||||
<li>{@link org.apache.lucene.analysis.tokenattributes.FlagsAttribute}<p>Optional flags a token can have.</p></li>
|
||||
</ul>
|
||||
</p>
|
||||
<h3>Using the new TokenStream API</h3>
|
||||
There are a few important things to know in order to use the new API efficiently which are summarized here. You may want
|
||||
to walk through the example below first and come back to this section afterwards.
|
||||
<ol><li>
|
||||
Please keep in mind that an AttributeSource can only have one instance of a particular Attribute. Furthermore, if
|
||||
a chain of a TokenStream and multiple TokenFilters is used, then all TokenFilters in that chain share the Attributes
|
||||
with the TokenStream.
|
||||
</li>
|
||||
<br>
|
||||
<li>
|
||||
Attribute instances are reused for all tokens of a document. Thus, a TokenStream/-Filter needs to update
|
||||
the appropriate Attribute(s) in incrementToken(). The consumer, commonly the Lucene indexer, consumes the data in the
|
||||
Attributes and then calls incrementToken() again until it returns false, which indicates that the end of the stream
|
||||
was reached. This means that in each call of incrementToken() a TokenStream/-Filter can safely overwrite the data in
|
||||
the Attribute instances.
|
||||
</li>
|
||||
<br>
|
||||
<li>
|
||||
For performance reasons a TokenStream/-Filter should add/get Attributes during instantiation; i.e., create an attribute in the
|
||||
constructor and store references to it in an instance variable. Using an instance variable instead of calling addAttribute()/getAttribute()
|
||||
in incrementToken() will avoid expensive casting and attribute lookups for every token in the document.
|
||||
</li>
|
||||
<br>
|
||||
<li>
|
||||
All methods in AttributeSource are idempotent, which means calling them multiple times always yields the same
|
||||
result. This is especially important to know for addAttribute(). The method takes the <b>type</b> (<code>Class</code>)
|
||||
of an Attribute as an argument and returns an <b>instance</b>. If an Attribute of the same type was previously added, then
|
||||
the already existing instance is returned, otherwise a new instance is created and returned. Therefore TokenStreams/-Filters
|
||||
can safely call addAttribute() with the same Attribute type multiple times.
|
||||
</li></ol>
|
||||
<h3>Example</h3>
|
||||
In this example we will create a WhitespaceTokenizer and use a LengthFilter to suppress all words that only
|
||||
have two or fewer characters. The LengthFilter is part of the Lucene core and its implementation will be explained
|
||||
here to illustrate the usage of the new TokenStream API.<br>
|
||||
Then we will develop a custom Attribute, a PartOfSpeechAttribute, and add another filter to the chain which
|
||||
utilizes the new custom attribute, and call it PartOfSpeechTaggingFilter.
|
||||
<h4>Whitespace tokenization</h4>
|
||||
<pre>
|
||||
public class MyAnalyzer extends Analyzer {
|
||||
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream stream = new WhitespaceTokenizer(reader);
|
||||
return stream;
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
// text to tokenize
|
||||
final String text = "This is a demo of the new TokenStream API";
|
||||
|
||||
MyAnalyzer analyzer = new MyAnalyzer();
|
||||
TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
|
||||
|
||||
// get the TermAttribute from the TokenStream
|
||||
TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
|
||||
|
||||
// print all tokens until stream is exhausted
|
||||
while (stream.incrementToken()) {
|
||||
System.out.println(termAtt.term());
|
||||
}
|
||||
}
|
||||
}
|
||||
</pre>
|
||||
In this easy example a simple white space tokenization is performed. In main() a loop consumes the stream and
|
||||
prints the term text of the tokens by accessing the TermAttribute that the WhitespaceTokenizer provides.
|
||||
Here is the output:
|
||||
<pre>
|
||||
This
|
||||
is
|
||||
a
|
||||
demo
|
||||
of
|
||||
the
|
||||
new
|
||||
TokenStream
|
||||
API
|
||||
</pre>
|
||||
<h4>Adding a LengthFilter</h4>
|
||||
We want to suppress all tokens that have 2 or fewer characters. We can do that easily by adding a LengthFilter
|
||||
to the chain. Only the tokenStream() method in our analyzer needs to be changed:
|
||||
<pre>
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream stream = new WhitespaceTokenizer(reader);
|
||||
stream = new LengthFilter(stream, 3, Integer.MAX_VALUE);
|
||||
return stream;
|
||||
}
|
||||
</pre>
|
||||
Note how now only words with 3 or more characters are contained in the output:
|
||||
<pre>
|
||||
This
|
||||
demo
|
||||
the
|
||||
new
|
||||
TokenStream
|
||||
API
|
||||
</pre>
|
||||
Now let's take a look at how the LengthFilter is implemented (it is part of Lucene's core):
|
||||
<pre>
|
||||
public final class LengthFilter extends TokenFilter {
|
||||
|
||||
final int min;
|
||||
final int max;
|
||||
|
||||
private TermAttribute termAtt;
|
||||
|
||||
/**
|
||||
* Build a filter that removes words that are too long or too
|
||||
* short from the text.
|
||||
*/
|
||||
public LengthFilter(TokenStream in, int min, int max)
|
||||
{
|
||||
super(in);
|
||||
this.min = min;
|
||||
this.max = max;
|
||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
}
|
||||
|
||||
/**
|
||||
* Advances to the next input token whose term length is between min and max (inclusive); returns false at EOS
|
||||
*/
|
||||
public final boolean incrementToken() throws IOException
|
||||
{
|
||||
assert termAtt != null;
|
||||
// return the first token whose length is within [min, max]
|
||||
while (input.incrementToken()) {
|
||||
int len = termAtt.termLength();
|
||||
if (len >= min && len <= max) {
|
||||
return true;
|
||||
}
|
||||
// note: else we ignore it but should we index each part of it?
|
||||
}
|
||||
// reached EOS -- return false
|
||||
return false;
|
||||
}
|
||||
}
|
||||
</pre>
|
||||
The TermAttribute is added in the constructor and stored in the instance variable <code>termAtt</code>.
|
||||
Remember that there can only be a single instance of TermAttribute in the chain, so in our example the
|
||||
<code>addAttribute()</code> call in LengthFilter returns the TermAttribute that the WhitespaceTokenizer already added. The tokens
|
||||
are retrieved from the input stream in the <code>incrementToken()</code> method. By looking at the term text
|
||||
in the TermAttribute the length of the term can be determined and too short or too long tokens are skipped.
|
||||
Note how <code>incrementToken()</code> can efficiently access the instance variable; no attribute lookup or downcasting
|
||||
is necessary. The same is true for the consumer, which can simply use local references to the Attributes.
|
||||
<h4>Adding a custom Attribute</h4>
|
||||
Now we're going to implement our own custom Attribute for part-of-speech tagging and call it consequently
|
||||
<code>PartOfSpeechAttribute</code>:
|
||||
<pre>
|
||||
public static enum PartOfSpeech {
|
||||
Noun, Verb, Adjective, Adverb, Pronoun, Preposition, Conjunction, Article, Unknown
|
||||
}
|
||||
|
||||
public static final class PartOfSpeechAttribute extends Attribute {
|
||||
|
||||
private PartOfSpeech pos = PartOfSpeech.Unknown;
|
||||
|
||||
public void setPartOfSpeech(PartOfSpeech pos) {
|
||||
this.pos = pos;
|
||||
}
|
||||
|
||||
public PartOfSpeech getPartOfSpeech() {
|
||||
return pos;
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
pos = PartOfSpeech.Unknown;
|
||||
}
|
||||
|
||||
public void copyTo(Attribute target) {
|
||||
((PartOfSpeechAttribute) target).pos = pos;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (other == this) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (other instanceof PartOfSpeechAttribute) {
|
||||
return pos == ((PartOfSpeechAttribute) other).pos;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return pos.ordinal();
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "PartOfSpeech=" + pos;
|
||||
}
|
||||
}
|
||||
</pre>
|
||||
This is a simple Attribute that has only a single variable that stores the part-of-speech of a token. It extends the
|
||||
new <code>Attribute</code> class and therefore implements its abstract methods <code>clear(), copyTo(), equals(), hashCode(), toString()</code>.
|
||||
Now we need a TokenFilter that can set this new PartOfSpeechAttribute for each token. In this example we show a very naive filter
|
||||
that tags every word with a leading upper-case letter as a 'Noun' and all other words as 'Unknown'.
|
||||
<pre>
|
||||
public static class PartOfSpeechTaggingFilter extends TokenFilter {
|
||||
PartOfSpeechAttribute posAtt;
|
||||
TermAttribute termAtt;
|
||||
|
||||
protected PartOfSpeechTaggingFilter(TokenStream input) {
|
||||
super(input);
|
||||
posAtt = (PartOfSpeechAttribute) addAttribute(PartOfSpeechAttribute.class);
|
||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
}
|
||||
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (!input.incrementToken()) {return false;}
|
||||
posAtt.setPartOfSpeech(determinePOS(termAtt.termBuffer(), 0, termAtt.termLength()));
|
||||
return true;
|
||||
}
|
||||
|
||||
// determine the part of speech for the given term
|
||||
protected PartOfSpeech determinePOS(char[] term, int offset, int length) {
|
||||
// naive implementation that tags every uppercased word as noun
|
||||
if (length > 0 && Character.isUpperCase(term[0])) {
|
||||
return PartOfSpeech.Noun;
|
||||
}
|
||||
return PartOfSpeech.Unknown;
|
||||
}
|
||||
}
|
||||
</pre>
|
||||
Just like the LengthFilter, this new filter accesses the attributes it needs in the constructor and
|
||||
stores references in instance variables. Now we need to add the filter to the chain:
|
||||
<pre>
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream stream = new WhitespaceTokenizer(reader);
|
||||
stream = new LengthFilter(stream, 3, Integer.MAX_VALUE);
|
||||
stream = new PartOfSpeechTaggingFilter(stream);
|
||||
return stream;
|
||||
}
|
||||
</pre>
|
||||
Now let's look at the output:
|
||||
<pre>
|
||||
This
|
||||
demo
|
||||
the
|
||||
new
|
||||
TokenStream
|
||||
API
|
||||
</pre>
|
||||
Apparently it hasn't changed, which shows that adding a custom attribute to a TokenStream/Filter chain does not
|
||||
affect any existing consumers, simply because they don't know the new Attribute. Now let's change the consumer
|
||||
to make use of the new PartOfSpeechAttribute and print it out:
|
||||
<pre>
|
||||
public static void main(String[] args) throws IOException {
|
||||
// text to tokenize
|
||||
final String text = "This is a demo of the new TokenStream API";
|
||||
|
||||
MyAnalyzer analyzer = new MyAnalyzer();
|
||||
TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
|
||||
|
||||
// get the TermAttribute from the TokenStream
|
||||
TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
|
||||
|
||||
// get the PartOfSpeechAttribute from the TokenStream
|
||||
PartOfSpeechAttribute posAtt = (PartOfSpeechAttribute) stream.getAttribute(PartOfSpeechAttribute.class);
|
||||
|
||||
// print all tokens until stream is exhausted
|
||||
while (stream.incrementToken()) {
|
||||
System.out.println(termAtt.term() + ": " + posAtt.getPartOfSpeech());
|
||||
}
|
||||
}
|
||||
</pre>
|
||||
The change that was made is to get the PartOfSpeechAttribute from the TokenStream and print out its contents in
|
||||
the while loop that consumes the stream. Here is the new output:
|
||||
<pre>
|
||||
This: Noun
|
||||
demo: Unknown
|
||||
the: Unknown
|
||||
new: Unknown
|
||||
TokenStream: Noun
|
||||
API: Noun
|
||||
</pre>
|
||||
Each word is now followed by its assigned PartOfSpeech tag. Of course this is a naive
|
||||
part-of-speech tagging. The word 'This' should not even be tagged as noun; it is only spelled capitalized because it
|
||||
is the first word of a sentence. Actually this is a good opportunity for an exercise. To practice the usage of the new
|
||||
API the reader could now write an Attribute and TokenFilter that can specify for each word if it was the first token
|
||||
of a sentence or not. Then the PartOfSpeechTaggingFilter can make use of this knowledge and only tag capitalized words
|
||||
as nouns if not the first word of a sentence (we know, this is still not a correct behavior, but hey, it's a good exercise).
|
||||
As a small hint, this is how the new Attribute class could begin:
|
||||
<pre>
|
||||
public class FirstTokenOfSentenceAttribute extends Attribute {
|
||||
|
||||
private boolean firstToken;
|
||||
|
||||
public void setFirstToken(boolean firstToken) {
|
||||
this.firstToken = firstToken;
|
||||
}
|
||||
|
||||
public boolean getFirstToken() {
|
||||
return firstToken;
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
firstToken = false;
|
||||
}
|
||||
|
||||
...
|
||||
</pre>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -17,9 +17,11 @@ package org.apache.lucene.analysis.standard;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
|
||||
/** Normalizes tokens extracted with {@link StandardTokenizer}. */
|
||||
|
||||
|
@ -29,15 +31,54 @@ public final class StandardFilter extends TokenFilter {
|
|||
/** Construct filtering <i>in</i>. */
|
||||
public StandardFilter(TokenStream in) {
|
||||
super(in);
|
||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
|
||||
}
|
||||
|
||||
private static final String APOSTROPHE_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.APOSTROPHE];
|
||||
private static final String ACRONYM_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
|
||||
|
||||
// this filter uses the type and term attributes
|
||||
private TypeAttribute typeAtt;
|
||||
private TermAttribute termAtt;
|
||||
|
||||
/** Advances to the next token in the stream; returns false at EOS.
|
||||
* <p>Removes <tt>'s</tt> from the end of words.
|
||||
* <p>Removes dots from acronyms.
|
||||
*/
|
||||
public final boolean incrementToken() throws java.io.IOException {
|
||||
if (!input.incrementToken()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
char[] buffer = termAtt.termBuffer();
|
||||
final int bufferLength = termAtt.termLength();
|
||||
final String type = typeAtt.type();
|
||||
|
||||
if (type == APOSTROPHE_TYPE && // remove 's
|
||||
bufferLength >= 2 &&
|
||||
buffer[bufferLength-2] == '\'' &&
|
||||
(buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
|
||||
// Strip last 2 characters off
|
||||
termAtt.setTermLength(bufferLength - 2);
|
||||
} else if (type == ACRONYM_TYPE) { // remove dots
|
||||
int upto = 0;
|
||||
for(int i=0;i<bufferLength;i++) {
|
||||
char c = buffer[i];
|
||||
if (c != '.')
|
||||
buffer[upto++] = c;
|
||||
}
|
||||
termAtt.setTermLength(upto);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Returns the next token in the stream, or null at EOS.
|
||||
* <p>Removes <tt>'s</tt> from the end of words.
|
||||
* <p>Removes dots from acronyms.
|
||||
* @deprecated
|
||||
*/
|
||||
public final Token next(final Token reusableToken) throws java.io.IOException {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken);
|
||||
|
|
|
@ -22,6 +22,10 @@ import java.io.Reader;
|
|||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
|
||||
/** A grammar-based tokenizer constructed with JFlex
|
||||
*
|
||||
|
@ -84,7 +88,7 @@ public class StandardTokenizer extends Tokenizer {
|
|||
*
|
||||
* @deprecated this should be removed in the next release (3.0).
|
||||
*/
|
||||
private boolean replaceInvalidAcronym = false;
|
||||
private boolean replaceInvalidAcronym;
|
||||
|
||||
void setInput(Reader reader) {
|
||||
this.input = reader;
|
||||
|
@ -108,8 +112,7 @@ public class StandardTokenizer extends Tokenizer {
|
|||
* <code>input</code> to a newly created JFlex scanner.
|
||||
*/
|
||||
public StandardTokenizer(Reader input) {
|
||||
this.input = input;
|
||||
this.scanner = new StandardTokenizerImpl(input);
|
||||
this(input, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -125,6 +128,60 @@ public class StandardTokenizer extends Tokenizer {
|
|||
this.replaceInvalidAcronym = replaceInvalidAcronym;
|
||||
this.input = input;
|
||||
this.scanner = new StandardTokenizerImpl(input);
|
||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
|
||||
}
|
||||
|
||||
// this tokenizer generates four attributes:
|
||||
// term, offset, positionIncrement and type
|
||||
private TermAttribute termAtt;
|
||||
private OffsetAttribute offsetAtt;
|
||||
private PositionIncrementAttribute posIncrAtt;
|
||||
private TypeAttribute typeAtt;
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see org.apache.lucene.analysis.TokenStream#incrementToken()
|
||||
*/
|
||||
public boolean incrementToken() throws IOException {
|
||||
int posIncr = 1;
|
||||
|
||||
while(true) {
|
||||
int tokenType = scanner.getNextToken();
|
||||
|
||||
if (tokenType == StandardTokenizerImpl.YYEOF) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (scanner.yylength() <= maxTokenLength) {
|
||||
termAtt.clear();
|
||||
posIncrAtt.setPositionIncrement(posIncr);
|
||||
scanner.getText(termAtt);
|
||||
final int start = scanner.yychar();
|
||||
offsetAtt.setStartOffset(start);
|
||||
offsetAtt.setEndOffset(start+termAtt.termLength());
|
||||
// This 'if' should be removed in the next release. For now, it converts
|
||||
// invalid acronyms to HOST. When removed, only the 'else' part should
|
||||
// remain.
|
||||
if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
|
||||
if (replaceInvalidAcronym) {
|
||||
typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
|
||||
termAtt.setTermLength(termAtt.termLength() - 1); // remove extra '.'
|
||||
} else {
|
||||
typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
|
||||
}
|
||||
} else {
|
||||
typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
|
||||
}
|
||||
return true;
|
||||
} else
|
||||
// When we skip a too-long term, we still increment the
|
||||
// position increment
|
||||
posIncr++;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -132,6 +189,7 @@ public class StandardTokenizer extends Tokenizer {
|
|||
*
|
||||
* @see org.apache.lucene.analysis.TokenStream#next()
|
||||
*/
|
||||
/** @deprecated */
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
int posIncr = 1;
|
||||
|
|
|
@ -30,6 +30,7 @@ NOTE: if you change this file and need to regenerate the tokenizer,
|
|||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -368,6 +369,13 @@ final void getText(Token t) {
|
|||
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills TermAttribute with the current token text.
|
||||
*/
|
||||
final void getText(TermAttribute t) {
|
||||
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Creates a new scanner
|
||||
|
|
|
@ -29,6 +29,7 @@ NOTE: if you change StandardTokenizerImpl.jflex and need to regenerate
|
|||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
%%
|
||||
|
||||
|
@ -69,6 +70,14 @@ public final int yychar()
|
|||
final void getText(Token t) {
|
||||
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills TermAttribute with the current token text.
|
||||
*/
|
||||
final void getText(TermAttribute t) {
|
||||
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
||||
}
|
||||
|
||||
%}
|
||||
|
||||
THAI = [\u0E00-\u0E59]
|
||||
|
|
|
@ -0,0 +1,86 @@
|
|||
package org.apache.lucene.analysis.tokenattributes;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import org.apache.lucene.util.Attribute;
|
||||
|
||||
/**
|
||||
* This attribute can be used to pass different flags down the tokenizer chain,
|
||||
* e.g. from one TokenFilter to another one.
|
||||
*
|
||||
* <p><font color="#FF0000">
|
||||
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
|
||||
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
|
||||
* We will make our best efforts to keep the APIs backwards-compatible.</font>
|
||||
|
||||
*/
|
||||
public class FlagsAttribute extends Attribute implements Cloneable, Serializable {
|
||||
private int flags = 0;
|
||||
|
||||
/**
|
||||
* EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
|
||||
* <p/>
|
||||
*
|
||||
* Get the bitset for any bits that have been set. This is completely distinct from {@link TypeAttribute#type()}, although they do share similar purposes.
|
||||
* The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s.
|
||||
*
|
||||
*
|
||||
* @return The bits
|
||||
*/
|
||||
public int getFlags() {
|
||||
return flags;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see #getFlags()
|
||||
*/
|
||||
public void setFlags(int flags) {
|
||||
this.flags = flags;
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
flags = 0;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "flags=" + flags;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (this == other) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (other instanceof FlagsAttribute) {
|
||||
return ((FlagsAttribute) other).flags == flags;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return flags;
|
||||
}
|
||||
|
||||
public void copyTo(Attribute target) {
|
||||
FlagsAttribute t = (FlagsAttribute) target;
|
||||
t.setFlags(flags);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,98 @@
|
|||
package org.apache.lucene.analysis.tokenattributes;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import org.apache.lucene.util.Attribute;
|
||||
|
||||
/**
|
||||
* The start and end character offset of a Token.
|
||||
*
|
||||
* <p><font color="#FF0000">
|
||||
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
|
||||
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
|
||||
* We will make our best efforts to keep the APIs backwards-compatible.</font>
|
||||
*/
|
||||
public class OffsetAttribute extends Attribute implements Cloneable, Serializable {
|
||||
private int startOffset;
|
||||
private int endOffset;
|
||||
|
||||
/** Returns this Token's starting offset, the position of the first character
|
||||
corresponding to this token in the source text.
|
||||
|
||||
Note that the difference between endOffset() and startOffset() may not be
|
||||
equal to termText.length(), as the term text may have been altered by a
|
||||
stemmer or some other filter. */
|
||||
public int startOffset() {
|
||||
return startOffset;
|
||||
}
|
||||
|
||||
/** Set the starting offset.
|
||||
@see #startOffset() */
|
||||
public void setStartOffset(int offset) {
|
||||
this.startOffset = offset;
|
||||
}
|
||||
|
||||
/** Returns this Token's ending offset, one greater than the position of the
|
||||
last character corresponding to this token in the source text. The length
|
||||
of the token in the source text is (endOffset - startOffset). */
|
||||
public int endOffset() {
|
||||
return endOffset;
|
||||
}
|
||||
|
||||
/** Set the ending offset.
|
||||
@see #endOffset() */
|
||||
public void setEndOffset(int offset) {
|
||||
this.endOffset = offset;
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
startOffset = 0;
|
||||
endOffset = 0;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "start=" + startOffset + ",end=" + endOffset;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (other == this) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (other instanceof OffsetAttribute) {
|
||||
OffsetAttribute o = (OffsetAttribute) other;
|
||||
return o.startOffset == startOffset && o.endOffset == endOffset;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
int code = startOffset;
|
||||
code = code * 31 + endOffset;
|
||||
return code;
|
||||
}
|
||||
|
||||
public void copyTo(Attribute target) {
|
||||
OffsetAttribute t = (OffsetAttribute) target;
|
||||
t.setStartOffset(startOffset);
|
||||
t.setEndOffset(endOffset);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,109 @@
|
|||
package org.apache.lucene.analysis.tokenattributes;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import org.apache.lucene.index.Payload;
|
||||
import org.apache.lucene.util.Attribute;
|
||||
|
||||
/**
|
||||
* The payload of a Token. See also {@link Payload}.
|
||||
*
|
||||
* <p><font color="#FF0000">
|
||||
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
|
||||
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
|
||||
* We will make our best efforts to keep the APIs backwards-compatible.</font>
|
||||
*/
|
||||
public class PayloadAttribute extends Attribute implements Cloneable, Serializable {
|
||||
private Payload payload;
|
||||
|
||||
/**
|
||||
* Initialize this attribute with no payload.
|
||||
*/
|
||||
public PayloadAttribute() {}
|
||||
|
||||
/**
|
||||
* Initialize this attribute with the given payload.
|
||||
*/
|
||||
public PayloadAttribute(Payload payload) {
|
||||
this.payload = payload;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns this Token's payload.
|
||||
*/
|
||||
public Payload getPayload() {
|
||||
return this.payload;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets this Token's payload.
|
||||
*/
|
||||
public void setPayload(Payload payload) {
|
||||
this.payload = payload;
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
payload = null;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
if (payload == null) {
|
||||
return "payload=null";
|
||||
}
|
||||
|
||||
return "payload=" + payload.toString();
|
||||
}
|
||||
|
||||
public Object clone() {
|
||||
PayloadAttribute clone = (PayloadAttribute) super.clone();
|
||||
if (payload != null) {
|
||||
clone.payload = (Payload) payload.clone();
|
||||
}
|
||||
return clone;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (other == this) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (other instanceof PayloadAttribute) {
|
||||
PayloadAttribute o = (PayloadAttribute) other;
|
||||
if (o.payload == null || payload == null) {
|
||||
return o.payload == null && payload == null;
|
||||
}
|
||||
|
||||
return o.payload.equals(payload);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return (payload == null) ? 0 : payload.hashCode();
|
||||
}
|
||||
|
||||
public void copyTo(Attribute target) {
|
||||
PayloadAttribute t = (PayloadAttribute) target;
|
||||
t.setPayload((payload == null) ? null : (Payload) payload.clone());
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,106 @@
|
|||
package org.apache.lucene.analysis.tokenattributes;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.util.Attribute;
|
||||
|
||||
/** The positionIncrement determines the position of this token
|
||||
* relative to the previous Token in a {@link TokenStream}, used in phrase
|
||||
* searching.
|
||||
*
|
||||
* <p>The default value is one.
|
||||
*
|
||||
* <p>Some common uses for this are:<ul>
|
||||
*
|
||||
* <li>Set it to zero to put multiple terms in the same position. This is
|
||||
* useful if, e.g., a word has multiple stems. Searches for phrases
|
||||
* including either stem will match. In this case, all but the first stem's
|
||||
* increment should be set to zero: the increment of the first instance
|
||||
* should be one. Repeating a token with an increment of zero can also be
|
||||
* used to boost the scores of matches on that token.
|
||||
*
|
||||
* <li>Set it to values greater than one to inhibit exact phrase matches.
|
||||
* If, for example, one does not want phrases to match across removed stop
|
||||
* words, then one could build a stop word filter that removes stop words and
|
||||
* also sets the increment to the number of stop words removed before each
|
||||
* non-stop word. Then exact phrase queries will only match when the terms
|
||||
* occur with no intervening stop words.
|
||||
*
|
||||
* </ul>
|
||||
*
|
||||
* <p><font color="#FF0000">
|
||||
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
|
||||
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
|
||||
* We will make our best efforts to keep the APIs backwards-compatible.</font>
|
||||
*
|
||||
* @see org.apache.lucene.index.TermPositions
|
||||
*/
|
||||
public class PositionIncrementAttribute extends Attribute implements Cloneable, Serializable {
|
||||
private int positionIncrement = 1;
|
||||
|
||||
/** Set the position increment. The default value is one.
|
||||
*
|
||||
* @param positionIncrement the distance from the prior term
|
||||
*/
|
||||
public void setPositionIncrement(int positionIncrement) {
|
||||
if (positionIncrement < 0)
|
||||
throw new IllegalArgumentException
|
||||
("Increment must be zero or greater: " + positionIncrement);
|
||||
this.positionIncrement = positionIncrement;
|
||||
}
|
||||
|
||||
/** Returns the position increment of this Token.
|
||||
* @see #setPositionIncrement
|
||||
*/
|
||||
public int getPositionIncrement() {
|
||||
return positionIncrement;
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
this.positionIncrement = 1;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "positionIncrement=" + positionIncrement;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (other == this) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (other instanceof PositionIncrementAttribute) {
|
||||
return positionIncrement == ((PositionIncrementAttribute) other).positionIncrement;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return positionIncrement;
|
||||
}
|
||||
|
||||
public void copyTo(Attribute target) {
|
||||
PositionIncrementAttribute t = (PositionIncrementAttribute) target;
|
||||
t.setPositionIncrement(positionIncrement);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,242 @@
|
|||
package org.apache.lucene.analysis.tokenattributes;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.Attribute;
|
||||
|
||||
/**
|
||||
* The term text of a Token.
|
||||
*
|
||||
* <p><font color="#FF0000">
|
||||
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
|
||||
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
|
||||
* We will make our best efforts to keep the APIs backwards-compatible.</font>
|
||||
*/
|
||||
public class TermAttribute extends Attribute implements Cloneable, Serializable {
|
||||
private static int MIN_BUFFER_SIZE = 10;
|
||||
|
||||
private char[] termBuffer;
|
||||
private int termLength;
|
||||
|
||||
/** Returns the Token's term text.
|
||||
*
|
||||
* This method has a performance penalty
|
||||
* because the text is stored internally in a char[]. If
|
||||
* possible, use {@link #termBuffer()} and {@link
|
||||
* #termLength()} directly instead. If you really need a
|
||||
* String, use this method, which is nothing more than
|
||||
* a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b>
|
||||
*/
|
||||
public String term() {
|
||||
initTermBuffer();
|
||||
return new String(termBuffer, 0, termLength);
|
||||
}
|
||||
|
||||
/** Copies the contents of buffer, starting at offset for
|
||||
* length characters, into the termBuffer array.
|
||||
* @param buffer the buffer to copy
|
||||
* @param offset the index in the buffer of the first character to copy
|
||||
* @param length the number of characters to copy
|
||||
*/
|
||||
public void setTermBuffer(char[] buffer, int offset, int length) {
|
||||
char[] newCharBuffer = growTermBuffer(length);
|
||||
if (newCharBuffer != null) {
|
||||
termBuffer = newCharBuffer;
|
||||
}
|
||||
System.arraycopy(buffer, offset, termBuffer, 0, length);
|
||||
termLength = length;
|
||||
}
|
||||
|
||||
/** Copies the contents of buffer into the termBuffer array.
|
||||
* @param buffer the buffer to copy
|
||||
*/
|
||||
public void setTermBuffer(String buffer) {
|
||||
int length = buffer.length();
|
||||
char[] newCharBuffer = growTermBuffer(length);
|
||||
if (newCharBuffer != null) {
|
||||
termBuffer = newCharBuffer;
|
||||
}
|
||||
buffer.getChars(0, length, termBuffer, 0);
|
||||
termLength = length;
|
||||
}
|
||||
|
||||
/** Copies the contents of buffer, starting at offset and continuing
|
||||
* for length characters, into the termBuffer array.
|
||||
* @param buffer the buffer to copy
|
||||
* @param offset the index in the buffer of the first character to copy
|
||||
* @param length the number of characters to copy
|
||||
*/
|
||||
public void setTermBuffer(String buffer, int offset, int length) {
|
||||
assert offset <= buffer.length();
|
||||
assert offset + length <= buffer.length();
|
||||
char[] newCharBuffer = growTermBuffer(length);
|
||||
if (newCharBuffer != null) {
|
||||
termBuffer = newCharBuffer;
|
||||
}
|
||||
buffer.getChars(offset, offset + length, termBuffer, 0);
|
||||
termLength = length;
|
||||
}
|
||||
|
||||
/** Returns the internal termBuffer character array which
|
||||
* you can then directly alter. If the array is too
|
||||
* small for your token, use {@link
|
||||
* #resizeTermBuffer(int)} to increase it. After
|
||||
* altering the buffer be sure to call {@link
|
||||
* #setTermLength} to record the number of valid
|
||||
* characters that were placed into the termBuffer. */
|
||||
public char[] termBuffer() {
|
||||
initTermBuffer();
|
||||
return termBuffer;
|
||||
}
|
||||
|
||||
/** Grows the termBuffer to at least size newSize, preserving the
|
||||
* existing content. Note: If the next operation is to change
|
||||
* the contents of the term buffer use
|
||||
* {@link #setTermBuffer(char[], int, int)},
|
||||
* {@link #setTermBuffer(String)}, or
|
||||
* {@link #setTermBuffer(String, int, int)}
|
||||
* to optimally combine the resize with the setting of the termBuffer.
|
||||
* @param newSize minimum size of the new termBuffer
|
||||
* @return newly created termBuffer with length >= newSize
|
||||
*/
|
||||
public char[] resizeTermBuffer(int newSize) {
|
||||
char[] newCharBuffer = growTermBuffer(newSize);
|
||||
if (termBuffer == null) {
|
||||
// If there were termText, then preserve it.
|
||||
// note that if termBuffer is null then newCharBuffer cannot be null
|
||||
assert newCharBuffer != null;
|
||||
termBuffer = newCharBuffer;
|
||||
} else if (newCharBuffer != null) {
|
||||
// Note: if newCharBuffer != null then termBuffer needs to grow.
|
||||
// If there were a termBuffer, then preserve it
|
||||
System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
|
||||
termBuffer = newCharBuffer;
|
||||
}
|
||||
return termBuffer;
|
||||
}
|
||||
|
||||
/** Allocates a buffer char[] of at least newSize
|
||||
* @param newSize minimum size of the buffer
|
||||
* @return newly created buffer with length >= newSize or null if the current termBuffer is big enough
|
||||
*/
|
||||
private char[] growTermBuffer(int newSize) {
|
||||
if (termBuffer != null) {
|
||||
if (termBuffer.length >= newSize)
|
||||
// Already big enough
|
||||
return null;
|
||||
else
|
||||
// Not big enough; create a new array with slight
|
||||
// over allocation:
|
||||
return new char[ArrayUtil.getNextSize(newSize)];
|
||||
} else {
|
||||
|
||||
// determine the best size
|
||||
// The buffer is always at least MIN_BUFFER_SIZE
|
||||
if (newSize < MIN_BUFFER_SIZE) {
|
||||
newSize = MIN_BUFFER_SIZE;
|
||||
}
|
||||
|
||||
return new char[newSize];
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: once we remove the deprecated termText() method
|
||||
// and switch entirely to char[] termBuffer we don't need
|
||||
// to use this method anymore
|
||||
private void initTermBuffer() {
|
||||
if (termBuffer == null) {
|
||||
termBuffer = new char[MIN_BUFFER_SIZE];
|
||||
termLength = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/** Return number of valid characters (length of the term)
|
||||
* in the termBuffer array. */
|
||||
public int termLength() {
|
||||
initTermBuffer();
|
||||
return termLength;
|
||||
}
|
||||
|
||||
/** Set number of valid characters (length of the term) in
|
||||
* the termBuffer array. Use this to truncate the termBuffer
|
||||
* or to synchronize with external manipulation of the termBuffer.
|
||||
* Note: to grow the size of the array,
|
||||
* use {@link #resizeTermBuffer(int)} first.
|
||||
* @param length the truncated length
|
||||
*/
|
||||
public void setTermLength(int length) {
|
||||
initTermBuffer();
|
||||
if (length > termBuffer.length)
|
||||
throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")");
|
||||
termLength = length;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
initTermBuffer();
|
||||
int code = termLength;
|
||||
code = code * 31 + ArrayUtil.hashCode(termBuffer, 0, termLength);
|
||||
return code;
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
termLength = 0;
|
||||
}
|
||||
|
||||
public Object clone() {
|
||||
TermAttribute t = (TermAttribute)super.clone();
|
||||
// Do a deep clone
|
||||
if (termBuffer != null) {
|
||||
t.termBuffer = (char[]) termBuffer.clone();
|
||||
}
|
||||
return t;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (other == this) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (other instanceof TermAttribute) {
|
||||
initTermBuffer();
|
||||
TermAttribute o = ((TermAttribute) other);
|
||||
o.initTermBuffer();
|
||||
|
||||
for(int i=0;i<termLength;i++) {
|
||||
if (termBuffer[i] != o.termBuffer[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
initTermBuffer();
|
||||
return "term=" + new String(termBuffer, 0, termLength);
|
||||
}
|
||||
|
||||
public void copyTo(Attribute target) {
|
||||
TermAttribute t = (TermAttribute) target;
|
||||
t.setTermBuffer(termBuffer, 0, termLength);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,83 @@
|
|||
package org.apache.lucene.analysis.tokenattributes;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import org.apache.lucene.util.Attribute;
|
||||
|
||||
/**
|
||||
* A Token's lexical type. The Default value is "word".
|
||||
*
|
||||
* <p><font color="#FF0000">
|
||||
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
|
||||
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
|
||||
* We will make our best efforts to keep the APIs backwards-compatible.</font>
|
||||
*/
|
||||
public class TypeAttribute extends Attribute implements Cloneable, Serializable {
|
||||
private String type;
|
||||
public static final String DEFAULT_TYPE = "word";
|
||||
|
||||
public TypeAttribute() {
|
||||
this(DEFAULT_TYPE);
|
||||
}
|
||||
|
||||
public TypeAttribute(String type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
/** Returns this Token's lexical type. Defaults to "word". */
|
||||
public String type() {
|
||||
return type;
|
||||
}
|
||||
|
||||
/** Set the lexical type.
|
||||
@see #type() */
|
||||
public void setType(String type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
type = DEFAULT_TYPE;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "type=" + type;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (other == this) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (other instanceof TypeAttribute) {
|
||||
return type.equals(((TypeAttribute) other).type);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return type.hashCode();
|
||||
}
|
||||
|
||||
public void copyTo(Attribute target) {
|
||||
TypeAttribute t = (TypeAttribute) target;
|
||||
t.setType(new String(type));
|
||||
}
|
||||
}
|
|
@ -17,12 +17,14 @@ package org.apache.lucene.index;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Map;
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
/** This is a DocFieldConsumer that inverts each field,
|
||||
* separately, from a Document, and accepts a
|
||||
|
|
|
@ -22,6 +22,8 @@ import java.io.Reader;
|
|||
import org.apache.lucene.document.Fieldable;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
|
||||
/**
|
||||
* Holds state for inverting all occurrences of a single
|
||||
|
@ -79,10 +81,14 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
|
|||
if (!field.isTokenized()) { // un-tokenized field
|
||||
String stringValue = field.stringValue();
|
||||
final int valueLength = stringValue.length();
|
||||
Token token = perThread.localToken.reinit(stringValue, 0, valueLength);
|
||||
perThread.singleTokenTokenStream.reinit(stringValue, 0, valueLength);
|
||||
fieldState.attributeSource = perThread.singleTokenTokenStream;
|
||||
perThread.localTokenStream.reset();
|
||||
consumer.start(field);
|
||||
|
||||
boolean success = false;
|
||||
try {
|
||||
consumer.add(token);
|
||||
consumer.add();
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success)
|
||||
|
@ -122,7 +128,22 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
|
|||
|
||||
try {
|
||||
int offsetEnd = fieldState.offset-1;
|
||||
final Token localToken = perThread.localToken;
|
||||
|
||||
boolean useNewTokenStreamAPI = stream.useNewAPI();
|
||||
Token localToken = null;
|
||||
|
||||
if (useNewTokenStreamAPI) {
|
||||
fieldState.attributeSource = stream;
|
||||
} else {
|
||||
fieldState.attributeSource = perThread.localTokenStream;
|
||||
localToken = perThread.localToken;
|
||||
}
|
||||
|
||||
consumer.start(field);
|
||||
|
||||
OffsetAttribute offsetAttribute = (OffsetAttribute) fieldState.attributeSource.addAttribute(OffsetAttribute.class);
|
||||
PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
for(;;) {
|
||||
|
||||
// If we hit an exception in stream.next below
|
||||
|
@ -131,10 +152,16 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
|
|||
// non-aborting and (above) this one document
|
||||
// will be marked as deleted, but still
|
||||
// consume a docID
|
||||
Token token = stream.next(localToken);
|
||||
|
||||
Token token = null;
|
||||
if (useNewTokenStreamAPI) {
|
||||
if (!stream.incrementToken()) break;
|
||||
} else {
|
||||
token = stream.next(localToken);
|
||||
if (token == null) break;
|
||||
final int posIncr = token.getPositionIncrement();
|
||||
perThread.localTokenStream.set(token);
|
||||
}
|
||||
|
||||
final int posIncr = posIncrAttribute.getPositionIncrement();
|
||||
fieldState.position += posIncr - 1;
|
||||
if (posIncr == 0)
|
||||
fieldState.numOverlap++;
|
||||
|
@ -147,14 +174,14 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
|
|||
// internal state of the consumer is now
|
||||
// corrupt and should not be flushed to a
|
||||
// new segment:
|
||||
consumer.add(token);
|
||||
consumer.add();
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success)
|
||||
docState.docWriter.setAborting();
|
||||
}
|
||||
fieldState.position++;
|
||||
offsetEnd = fieldState.offset + token.endOffset();
|
||||
offsetEnd = fieldState.offset + offsetAttribute.endOffset();
|
||||
if (++fieldState.length >= maxFieldLength) {
|
||||
if (docState.infoStream != null)
|
||||
docState.infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens");
|
||||
|
|
|
@ -20,6 +20,14 @@ package org.apache.lucene.index;
|
|||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.Attribute;
|
||||
|
||||
/** This is a DocFieldConsumer that inverts each field,
|
||||
* separately, from a Document, and accepts a
|
||||
|
@ -30,6 +38,94 @@ final class DocInverterPerThread extends DocFieldConsumerPerThread {
|
|||
final InvertedDocConsumerPerThread consumer;
|
||||
final InvertedDocEndConsumerPerThread endConsumer;
|
||||
final Token localToken = new Token();
|
||||
//TODO: change to SingleTokenTokenStream after Token was removed
|
||||
final SingleTokenTokenStream singleTokenTokenStream = new SingleTokenTokenStream();
|
||||
final BackwardsCompatibilityStream localTokenStream = new BackwardsCompatibilityStream();
|
||||
|
||||
static class SingleTokenTokenStream extends TokenStream {
|
||||
TermAttribute termAttribute;
|
||||
OffsetAttribute offsetAttribute;
|
||||
|
||||
SingleTokenTokenStream() {
|
||||
termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||
}
|
||||
|
||||
public void reinit(String stringValue, int startOffset, int endOffset) {
|
||||
termAttribute.setTermBuffer(stringValue);
|
||||
offsetAttribute.setStartOffset(startOffset);
|
||||
offsetAttribute.setEndOffset(endOffset);
|
||||
}
|
||||
}
|
||||
|
||||
/** This stream wrapper is only used to maintain backwards compatibility with the
|
||||
* old TokenStream API and can be removed in Lucene 3.0
|
||||
* @deprecated
|
||||
*/
|
||||
static class BackwardsCompatibilityStream extends TokenStream {
|
||||
private Token token;
|
||||
|
||||
TermAttribute termAttribute = new TermAttribute() {
|
||||
public String term() {
|
||||
return token.term();
|
||||
}
|
||||
|
||||
public char[] termBuffer() {
|
||||
return token.termBuffer();
|
||||
}
|
||||
|
||||
public int termLength() {
|
||||
return token.termLength();
|
||||
}
|
||||
};
|
||||
OffsetAttribute offsetAttribute = new OffsetAttribute() {
|
||||
public int startOffset() {
|
||||
return token.startOffset();
|
||||
}
|
||||
|
||||
public int endOffset() {
|
||||
return token.endOffset();
|
||||
}
|
||||
};
|
||||
|
||||
PositionIncrementAttribute positionIncrementAttribute = new PositionIncrementAttribute() {
|
||||
public int getPositionIncrement() {
|
||||
return token.getPositionIncrement();
|
||||
}
|
||||
};
|
||||
|
||||
FlagsAttribute flagsAttribute = new FlagsAttribute() {
|
||||
public int getFlags() {
|
||||
return token.getFlags();
|
||||
}
|
||||
};
|
||||
|
||||
PayloadAttribute payloadAttribute = new PayloadAttribute() {
|
||||
public Payload getPayload() {
|
||||
return token.getPayload();
|
||||
}
|
||||
};
|
||||
|
||||
TypeAttribute typeAttribute = new TypeAttribute() {
|
||||
public String type() {
|
||||
return token.type();
|
||||
}
|
||||
};
|
||||
|
||||
BackwardsCompatibilityStream() {
|
||||
attributes.put(TermAttribute.class, termAttribute);
|
||||
attributes.put(OffsetAttribute.class, offsetAttribute);
|
||||
attributes.put(PositionIncrementAttribute.class, positionIncrementAttribute);
|
||||
attributes.put(FlagsAttribute.class, flagsAttribute);
|
||||
attributes.put(PayloadAttribute.class, payloadAttribute);
|
||||
attributes.put(TypeAttribute.class, typeAttribute);
|
||||
}
|
||||
|
||||
public void set(Token token) {
|
||||
this.token = token;
|
||||
}
|
||||
};
|
||||
|
||||
final DocumentsWriter.DocState docState;
|
||||
|
||||
final FieldInvertState fieldState = new FieldInvertState();
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
import org.apache.lucene.search.Similarity;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
/**
|
||||
* This class tracks the number and position / offset parameters of terms
|
||||
|
@ -32,6 +33,7 @@ public final class FieldInvertState {
|
|||
int numOverlap;
|
||||
int offset;
|
||||
float boost;
|
||||
AttributeSource attributeSource;
|
||||
|
||||
public FieldInvertState() {
|
||||
}
|
||||
|
@ -54,6 +56,7 @@ public final class FieldInvertState {
|
|||
numOverlap = 0;
|
||||
offset = 0;
|
||||
boost = docBoost;
|
||||
attributeSource = null;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -97,4 +100,8 @@ public final class FieldInvertState {
|
|||
public float getBoost() {
|
||||
return boost;
|
||||
}
|
||||
|
||||
public AttributeSource getAttributeSource() {
|
||||
return attributeSource;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,7 +19,7 @@ package org.apache.lucene.index;
|
|||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.document.Fieldable;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
|
||||
// TODO: break into separate freq and prox writers as
|
||||
// codecs; make separate container (tii/tis/skip/*) that can
|
||||
|
@ -32,6 +32,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
final DocumentsWriter.DocState docState;
|
||||
final FieldInvertState fieldState;
|
||||
boolean omitTf;
|
||||
PayloadAttribute payloadAttribute;
|
||||
|
||||
public FreqProxTermsWriterPerField(TermsHashPerField termsHashPerField, FreqProxTermsWriterPerThread perThread, FieldInfo fieldInfo) {
|
||||
this.termsHashPerField = termsHashPerField;
|
||||
|
@ -53,7 +54,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
|
||||
boolean hasPayloads;
|
||||
|
||||
void skippingLongTerm(Token t) throws IOException {}
|
||||
void skippingLongTerm() throws IOException {}
|
||||
|
||||
public int compareTo(Object other0) {
|
||||
FreqProxTermsWriterPerField other = (FreqProxTermsWriterPerField) other0;
|
||||
|
@ -64,6 +65,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
// Record, up front, whether our in-RAM format will be
|
||||
// with or without term freqs:
|
||||
omitTf = fieldInfo.omitTf;
|
||||
payloadAttribute = null;
|
||||
}
|
||||
|
||||
boolean start(Fieldable[] fields, int count) {
|
||||
|
@ -73,8 +75,22 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
return false;
|
||||
}
|
||||
|
||||
final void writeProx(Token t, FreqProxTermsWriter.PostingList p, int proxCode) {
|
||||
final Payload payload = t.getPayload();
|
||||
void start(Fieldable f) {
|
||||
if (fieldState.attributeSource.hasAttribute(PayloadAttribute.class)) {
|
||||
payloadAttribute = (PayloadAttribute) fieldState.attributeSource.getAttribute(PayloadAttribute.class);
|
||||
} else {
|
||||
payloadAttribute = null;
|
||||
}
|
||||
}
|
||||
|
||||
final void writeProx(FreqProxTermsWriter.PostingList p, int proxCode) {
|
||||
final Payload payload;
|
||||
if (payloadAttribute == null) {
|
||||
payload = null;
|
||||
} else {
|
||||
payload = payloadAttribute.getPayload();
|
||||
}
|
||||
|
||||
if (payload != null && payload.length > 0) {
|
||||
termsHashPerField.writeVInt(1, (proxCode<<1)|1);
|
||||
termsHashPerField.writeVInt(1, payload.length);
|
||||
|
@ -85,7 +101,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
p.lastPosition = fieldState.position;
|
||||
}
|
||||
|
||||
final void newTerm(Token t, RawPostingList p0) {
|
||||
final void newTerm(RawPostingList p0) {
|
||||
// First time we're seeing this term since the last
|
||||
// flush
|
||||
assert docState.testPoint("FreqProxTermsWriterPerField.newTerm start");
|
||||
|
@ -96,11 +112,11 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
} else {
|
||||
p.lastDocCode = docState.docID << 1;
|
||||
p.docFreq = 1;
|
||||
writeProx(t, p, fieldState.position);
|
||||
writeProx(p, fieldState.position);
|
||||
}
|
||||
}
|
||||
|
||||
final void addTerm(Token t, RawPostingList p0) {
|
||||
final void addTerm(RawPostingList p0) {
|
||||
|
||||
assert docState.testPoint("FreqProxTermsWriterPerField.addTerm start");
|
||||
|
||||
|
@ -132,10 +148,10 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
|
|||
p.docFreq = 1;
|
||||
p.lastDocCode = (docState.docID - p.lastDocID) << 1;
|
||||
p.lastDocID = docState.docID;
|
||||
writeProx(t, p, fieldState.position);
|
||||
writeProx(p, fieldState.position);
|
||||
} else {
|
||||
p.docFreq++;
|
||||
writeProx(t, p, fieldState.position-p.lastPosition);
|
||||
writeProx(p, fieldState.position-p.lastPosition);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,10 +17,10 @@ package org.apache.lucene.index;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.document.Fieldable;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.document.Fieldable;
|
||||
|
||||
abstract class InvertedDocConsumerPerField {
|
||||
|
||||
// Called once per field, and is given all Fieldable
|
||||
|
@ -29,8 +29,11 @@ abstract class InvertedDocConsumerPerField {
|
|||
// fields:
|
||||
abstract boolean start(Fieldable[] fields, int count) throws IOException;
|
||||
|
||||
// Called before a field instance is being processed
|
||||
abstract void start(Fieldable field);
|
||||
|
||||
// Called once per inverted token
|
||||
abstract void add(Token token) throws IOException;
|
||||
abstract void add() throws IOException;
|
||||
|
||||
// Called once per field per document, after all Fieldable
|
||||
// occurrences are inverted
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.index;
|
|||
|
||||
import java.io.Serializable;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
|
||||
|
@ -29,7 +28,7 @@ import org.apache.lucene.util.ArrayUtil;
|
|||
* specific term.
|
||||
* <p>
|
||||
* To store payloads in the index a {@link TokenStream} has to be used that
|
||||
* produces {@link Token}s containing payload data.
|
||||
* produces payload data.
|
||||
* <p>
|
||||
* Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)}
|
||||
* to retrieve the payloads from the index.<br>
|
||||
|
|
|
@ -18,10 +18,11 @@ package org.apache.lucene.index;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.document.Fieldable;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
|
||||
final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
|
||||
|
||||
|
@ -37,6 +38,7 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
|
|||
boolean doVectorOffsets;
|
||||
|
||||
int maxNumPostings;
|
||||
OffsetAttribute offsetAttribute = null;
|
||||
|
||||
public TermVectorsTermsWriterPerField(TermsHashPerField termsHashPerField, TermVectorsTermsWriterPerThread perThread, FieldInfo fieldInfo) {
|
||||
this.termsHashPerField = termsHashPerField;
|
||||
|
@ -192,7 +194,15 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
|
|||
maxNumPostings = 0;
|
||||
}
|
||||
|
||||
void newTerm(Token t, RawPostingList p0) {
|
||||
void start(Fieldable f) {
|
||||
if (doVectorOffsets && fieldState.attributeSource.hasAttribute(OffsetAttribute.class)) {
|
||||
offsetAttribute = (OffsetAttribute) fieldState.attributeSource.getAttribute(OffsetAttribute.class);
|
||||
} else {
|
||||
offsetAttribute = null;
|
||||
}
|
||||
}
|
||||
|
||||
void newTerm(RawPostingList p0) {
|
||||
|
||||
assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start");
|
||||
|
||||
|
@ -201,8 +211,9 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
|
|||
p.freq = 1;
|
||||
|
||||
if (doVectorOffsets) {
|
||||
final int startOffset = fieldState.offset + t.startOffset();
|
||||
final int endOffset = fieldState.offset + t.endOffset();
|
||||
int startOffset = fieldState.offset + offsetAttribute.startOffset();;
|
||||
int endOffset = fieldState.offset + offsetAttribute.endOffset();
|
||||
|
||||
termsHashPerField.writeVInt(1, startOffset);
|
||||
termsHashPerField.writeVInt(1, endOffset - startOffset);
|
||||
p.lastOffset = endOffset;
|
||||
|
@ -214,7 +225,7 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
|
|||
}
|
||||
}
|
||||
|
||||
void addTerm(Token t, RawPostingList p0) {
|
||||
void addTerm(RawPostingList p0) {
|
||||
|
||||
assert docState.testPoint("TermVectorsTermsWriterPerField.addTerm start");
|
||||
|
||||
|
@ -222,8 +233,9 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
|
|||
p.freq++;
|
||||
|
||||
if (doVectorOffsets) {
|
||||
final int startOffset = fieldState.offset + t.startOffset();
|
||||
final int endOffset = fieldState.offset + t.endOffset();
|
||||
int startOffset = fieldState.offset + offsetAttribute.startOffset();;
|
||||
int endOffset = fieldState.offset + offsetAttribute.endOffset();
|
||||
|
||||
termsHashPerField.writeVInt(1, startOffset - p.lastOffset);
|
||||
termsHashPerField.writeVInt(1, endOffset - startOffset);
|
||||
p.lastOffset = endOffset;
|
||||
|
@ -235,5 +247,5 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
|
|||
}
|
||||
}
|
||||
|
||||
void skippingLongTerm(Token t) {}
|
||||
void skippingLongTerm() {}
|
||||
}
|
||||
|
|
|
@ -23,14 +23,15 @@ package org.apache.lucene.index;
|
|||
* multiple streams for each unique Token. */
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.document.Fieldable;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
abstract class TermsHashConsumerPerField {
|
||||
abstract boolean start(Fieldable[] fields, int count) throws IOException;
|
||||
abstract void finish() throws IOException;
|
||||
abstract void skippingLongTerm(Token t) throws IOException;
|
||||
abstract void newTerm(Token t, RawPostingList p) throws IOException;
|
||||
abstract void addTerm(Token t, RawPostingList p) throws IOException;
|
||||
abstract void skippingLongTerm() throws IOException;
|
||||
abstract void start(Fieldable field);
|
||||
abstract void newTerm(RawPostingList p) throws IOException;
|
||||
abstract void addTerm(RawPostingList p) throws IOException;
|
||||
abstract int getStreamCount();
|
||||
}
|
||||
|
|
|
@ -20,8 +20,8 @@ package org.apache.lucene.index;
|
|||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.document.Fieldable;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
|
||||
final class TermsHashPerField extends InvertedDocConsumerPerField {
|
||||
|
@ -31,6 +31,7 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
|
|||
final TermsHashPerThread perThread;
|
||||
final DocumentsWriter.DocState docState;
|
||||
final FieldInvertState fieldState;
|
||||
TermAttribute termAtt;
|
||||
|
||||
// Copied from our perThread
|
||||
final CharBlockPool charPool;
|
||||
|
@ -247,6 +248,14 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
|
|||
private boolean doCall;
|
||||
private boolean doNextCall;
|
||||
|
||||
void start(Fieldable f) {
|
||||
termAtt = (TermAttribute) fieldState.attributeSource.getAttribute(TermAttribute.class);
|
||||
consumer.start(f);
|
||||
if (nextPerField != null) {
|
||||
nextPerField.start(f);
|
||||
}
|
||||
}
|
||||
|
||||
boolean start(Fieldable[] fields, int count) throws IOException {
|
||||
doCall = consumer.start(fields, count);
|
||||
if (nextPerField != null)
|
||||
|
@ -257,7 +266,7 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
|
|||
// Secondary entry point (for 2nd & subsequent TermsHash),
|
||||
// because token text has already been "interned" into
|
||||
// textStart, so we hash by textStart
|
||||
public void add(Token token, int textStart) throws IOException {
|
||||
public void add(int textStart) throws IOException {
|
||||
|
||||
int code = textStart;
|
||||
|
||||
|
@ -320,17 +329,17 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
|
|||
}
|
||||
p.byteStart = intUptos[intUptoStart];
|
||||
|
||||
consumer.newTerm(token, p);
|
||||
consumer.newTerm(p);
|
||||
|
||||
} else {
|
||||
intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
|
||||
intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
|
||||
consumer.addTerm(token, p);
|
||||
consumer.addTerm(p);
|
||||
}
|
||||
}
|
||||
|
||||
// Primary entry point (for first TermsHash)
|
||||
void add(Token token) throws IOException {
|
||||
void add() throws IOException {
|
||||
|
||||
assert !postingsCompacted;
|
||||
|
||||
|
@ -338,8 +347,8 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
|
|||
// term text into textStart address
|
||||
|
||||
// Get the text of this term.
|
||||
final char[] tokenText = token.termBuffer();
|
||||
final int tokenTextLen = token.termLength();
|
||||
final char[] tokenText = termAtt.termBuffer();;
|
||||
final int tokenTextLen = termAtt.termLength();
|
||||
|
||||
// Compute hashcode & replace any invalid UTF16 sequences
|
||||
int downto = tokenTextLen;
|
||||
|
@ -403,7 +412,7 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
|
|||
if (docState.maxTermPrefix == null)
|
||||
docState.maxTermPrefix = new String(tokenText, 0, 30);
|
||||
|
||||
consumer.skippingLongTerm(token);
|
||||
consumer.skippingLongTerm();
|
||||
return;
|
||||
}
|
||||
charPool.nextBuffer();
|
||||
|
@ -450,16 +459,16 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
|
|||
}
|
||||
p.byteStart = intUptos[intUptoStart];
|
||||
|
||||
consumer.newTerm(token, p);
|
||||
consumer.newTerm(p);
|
||||
|
||||
} else {
|
||||
intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
|
||||
intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
|
||||
consumer.addTerm(token, p);
|
||||
consumer.addTerm(p);
|
||||
}
|
||||
|
||||
if (doNextCall)
|
||||
nextPerField.add(token, p.textStart);
|
||||
nextPerField.add(p.textStart);
|
||||
}
|
||||
|
||||
int[] intUptos;
|
||||
|
|
|
@ -3,8 +3,8 @@ package org.apache.lucene.queryParser;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.text.DateFormat;
|
||||
import java.text.Collator;
|
||||
import java.text.DateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Calendar;
|
||||
import java.util.Date;
|
||||
|
@ -15,7 +15,10 @@ import java.util.Map;
|
|||
import java.util.Vector;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CachingTokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.document.DateField;
|
||||
import org.apache.lucene.document.DateTools;
|
||||
import org.apache.lucene.index.Term;
|
||||
|
@ -518,48 +521,126 @@ public class QueryParser implements QueryParserConstants {
|
|||
// PhraseQuery, or nothing based on the term count
|
||||
|
||||
TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
|
||||
List list = new ArrayList();
|
||||
final org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token();
|
||||
org.apache.lucene.analysis.Token nextToken;
|
||||
CachingTokenFilter buffer = new CachingTokenFilter(source);
|
||||
TermAttribute termAtt = null;
|
||||
PositionIncrementAttribute posIncrAtt = null;
|
||||
int numTokens = 0;
|
||||
|
||||
org.apache.lucene.analysis.Token reusableToken = null;
|
||||
org.apache.lucene.analysis.Token nextToken = null;
|
||||
|
||||
|
||||
boolean useNewAPI = TokenStream.useNewAPIDefault();
|
||||
|
||||
if (useNewAPI) {
|
||||
boolean success = false;
|
||||
try {
|
||||
buffer.reset();
|
||||
success = true;
|
||||
} catch (IOException e) {
|
||||
// success==false if we hit an exception
|
||||
}
|
||||
if (success) {
|
||||
if (buffer.hasAttribute(TermAttribute.class)) {
|
||||
termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class);
|
||||
}
|
||||
if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
|
||||
posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
reusableToken = new org.apache.lucene.analysis.Token();
|
||||
}
|
||||
|
||||
int positionCount = 0;
|
||||
boolean severalTokensAtSamePosition = false;
|
||||
|
||||
if (useNewAPI) {
|
||||
if (termAtt != null) {
|
||||
try {
|
||||
while (buffer.incrementToken()) {
|
||||
numTokens++;
|
||||
int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
|
||||
if (positionIncrement != 0) {
|
||||
positionCount += positionIncrement;
|
||||
} else {
|
||||
severalTokensAtSamePosition = true;
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
} else {
|
||||
while (true) {
|
||||
try {
|
||||
nextToken = source.next(reusableToken);
|
||||
nextToken = buffer.next(reusableToken);
|
||||
}
|
||||
catch (IOException e) {
|
||||
nextToken = null;
|
||||
}
|
||||
if (nextToken == null)
|
||||
break;
|
||||
list.add(nextToken.clone());
|
||||
numTokens++;
|
||||
if (nextToken.getPositionIncrement() != 0)
|
||||
positionCount += nextToken.getPositionIncrement();
|
||||
else
|
||||
severalTokensAtSamePosition = true;
|
||||
}
|
||||
}
|
||||
try {
|
||||
// rewind the buffer stream
|
||||
buffer.reset();
|
||||
|
||||
// close original stream - all tokens buffered
|
||||
source.close();
|
||||
}
|
||||
catch (IOException e) {
|
||||
// ignore
|
||||
}
|
||||
|
||||
if (list.size() == 0)
|
||||
if (numTokens == 0)
|
||||
return null;
|
||||
else if (list.size() == 1) {
|
||||
nextToken = (org.apache.lucene.analysis.Token) list.get(0);
|
||||
return newTermQuery(new Term(field, nextToken.term()));
|
||||
else if (numTokens == 1) {
|
||||
String term = null;
|
||||
try {
|
||||
|
||||
if (useNewAPI) {
|
||||
boolean hasNext = buffer.incrementToken();
|
||||
assert hasNext == true;
|
||||
term = termAtt.term();
|
||||
} else {
|
||||
nextToken = buffer.next(reusableToken);
|
||||
assert nextToken != null;
|
||||
term = nextToken.term();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// safe to ignore, because we know the number of tokens
|
||||
}
|
||||
return newTermQuery(new Term(field, term));
|
||||
} else {
|
||||
if (severalTokensAtSamePosition) {
|
||||
if (positionCount == 1) {
|
||||
// no phrase query:
|
||||
BooleanQuery q = newBooleanQuery(true);
|
||||
for (int i = 0; i < list.size(); i++) {
|
||||
nextToken = (org.apache.lucene.analysis.Token) list.get(i);
|
||||
for (int i = 0; i < numTokens; i++) {
|
||||
String term = null;
|
||||
try {
|
||||
if (useNewAPI) {
|
||||
boolean hasNext = buffer.incrementToken();
|
||||
assert hasNext == true;
|
||||
term = termAtt.term();
|
||||
} else {
|
||||
nextToken = buffer.next(reusableToken);
|
||||
assert nextToken != null;
|
||||
term = nextToken.term();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// safe to ignore, because we know the number of tokens
|
||||
}
|
||||
|
||||
Query currentQuery = newTermQuery(
|
||||
new Term(field, nextToken.term()));
|
||||
new Term(field, term));
|
||||
q.add(currentQuery, BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
return q;
|
||||
|
@ -570,9 +651,28 @@ public class QueryParser implements QueryParserConstants {
|
|||
mpq.setSlop(phraseSlop);
|
||||
List multiTerms = new ArrayList();
|
||||
int position = -1;
|
||||
for (int i = 0; i < list.size(); i++) {
|
||||
nextToken = (org.apache.lucene.analysis.Token) list.get(i);
|
||||
if (nextToken.getPositionIncrement() > 0 && multiTerms.size() > 0) {
|
||||
for (int i = 0; i < numTokens; i++) {
|
||||
String term = null;
|
||||
int positionIncrement = 1;
|
||||
try {
|
||||
if (useNewAPI) {
|
||||
boolean hasNext = buffer.incrementToken();
|
||||
assert hasNext == true;
|
||||
term = termAtt.term();
|
||||
if (posIncrAtt != null) {
|
||||
positionIncrement = posIncrAtt.getPositionIncrement();
|
||||
}
|
||||
} else {
|
||||
nextToken = buffer.next(reusableToken);
|
||||
assert nextToken != null;
|
||||
term = nextToken.term();
|
||||
positionIncrement = nextToken.getPositionIncrement();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// safe to ignore, because we know the number of tokens
|
||||
}
|
||||
|
||||
if (positionIncrement > 0 && multiTerms.size() > 0) {
|
||||
if (enablePositionIncrements) {
|
||||
mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
|
||||
} else {
|
||||
|
@ -580,8 +680,8 @@ public class QueryParser implements QueryParserConstants {
|
|||
}
|
||||
multiTerms.clear();
|
||||
}
|
||||
position += nextToken.getPositionIncrement();
|
||||
multiTerms.add(new Term(field, nextToken.term()));
|
||||
position += positionIncrement;
|
||||
multiTerms.add(new Term(field, term));
|
||||
}
|
||||
if (enablePositionIncrements) {
|
||||
mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
|
||||
|
@ -595,13 +695,36 @@ public class QueryParser implements QueryParserConstants {
|
|||
PhraseQuery pq = newPhraseQuery();
|
||||
pq.setSlop(phraseSlop);
|
||||
int position = -1;
|
||||
for (int i = 0; i < list.size(); i++) {
|
||||
nextToken = (org.apache.lucene.analysis.Token) list.get(i);
|
||||
if (enablePositionIncrements) {
|
||||
position += nextToken.getPositionIncrement();
|
||||
pq.add(new Term(field, nextToken.term()),position);
|
||||
|
||||
|
||||
for (int i = 0; i < numTokens; i++) {
|
||||
String term = null;
|
||||
int positionIncrement = 1;
|
||||
|
||||
try {
|
||||
if (useNewAPI) {
|
||||
|
||||
boolean hasNext = buffer.incrementToken();
|
||||
assert hasNext == true;
|
||||
term = termAtt.term();
|
||||
if (posIncrAtt != null) {
|
||||
positionIncrement = posIncrAtt.getPositionIncrement();
|
||||
}
|
||||
} else {
|
||||
pq.add(new Term(field, nextToken.term()));
|
||||
nextToken = buffer.next(reusableToken);
|
||||
assert nextToken != null;
|
||||
term = nextToken.term();
|
||||
positionIncrement = nextToken.getPositionIncrement();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// safe to ignore, because we know the number of tokens
|
||||
}
|
||||
|
||||
if (enablePositionIncrements) {
|
||||
position += positionIncrement;
|
||||
pq.add(new Term(field, term),position);
|
||||
} else {
|
||||
pq.add(new Term(field, term));
|
||||
}
|
||||
}
|
||||
return pq;
|
||||
|
@ -610,6 +733,7 @@ public class QueryParser implements QueryParserConstants {
|
|||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Base implementation delegates to {@link #getFieldQuery(String,String)}.
|
||||
* This method may be overridden, for example, to return
|
||||
|
@ -1503,12 +1627,6 @@ public class QueryParser implements QueryParserConstants {
|
|||
finally { jj_save(0, xla); }
|
||||
}
|
||||
|
||||
private boolean jj_3R_3() {
|
||||
if (jj_scan_token(STAR)) return true;
|
||||
if (jj_scan_token(COLON)) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean jj_3R_2() {
|
||||
if (jj_scan_token(TERM)) return true;
|
||||
if (jj_scan_token(COLON)) return true;
|
||||
|
@ -1525,6 +1643,12 @@ public class QueryParser implements QueryParserConstants {
|
|||
return false;
|
||||
}
|
||||
|
||||
private boolean jj_3R_3() {
|
||||
if (jj_scan_token(STAR)) return true;
|
||||
if (jj_scan_token(COLON)) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Generated Token Manager. */
|
||||
public QueryParserTokenManager token_source;
|
||||
/** Current token. */
|
||||
|
|
|
@ -27,8 +27,8 @@ package org.apache.lucene.queryParser;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.text.DateFormat;
|
||||
import java.text.Collator;
|
||||
import java.text.DateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Calendar;
|
||||
import java.util.Date;
|
||||
|
@ -39,7 +39,10 @@ import java.util.Map;
|
|||
import java.util.Vector;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CachingTokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.document.DateField;
|
||||
import org.apache.lucene.document.DateTools;
|
||||
import org.apache.lucene.index.Term;
|
||||
|
@ -542,48 +545,126 @@ public class QueryParser {
|
|||
// PhraseQuery, or nothing based on the term count
|
||||
|
||||
TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
|
||||
List list = new ArrayList();
|
||||
final org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token();
|
||||
org.apache.lucene.analysis.Token nextToken;
|
||||
CachingTokenFilter buffer = new CachingTokenFilter(source);
|
||||
TermAttribute termAtt = null;
|
||||
PositionIncrementAttribute posIncrAtt = null;
|
||||
int numTokens = 0;
|
||||
|
||||
org.apache.lucene.analysis.Token reusableToken = null;
|
||||
org.apache.lucene.analysis.Token nextToken = null;
|
||||
|
||||
|
||||
boolean useNewAPI = TokenStream.useNewAPI();
|
||||
|
||||
if (useNewAPI) {
|
||||
boolean success = false;
|
||||
try {
|
||||
buffer.start();
|
||||
success = true;
|
||||
} catch (IOException e) {
|
||||
// success==false if we hit an exception
|
||||
}
|
||||
if (success) {
|
||||
if (buffer.hasAttribute(TermAttribute.class)) {
|
||||
termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class);
|
||||
}
|
||||
if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
|
||||
posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
reusableToken = new org.apache.lucene.analysis.Token();
|
||||
}
|
||||
|
||||
int positionCount = 0;
|
||||
boolean severalTokensAtSamePosition = false;
|
||||
|
||||
if (useNewAPI) {
|
||||
if (termAtt != null) {
|
||||
try {
|
||||
while (buffer.incrementToken()) {
|
||||
numTokens++;
|
||||
int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
|
||||
if (positionIncrement != 0) {
|
||||
positionCount += positionIncrement;
|
||||
} else {
|
||||
severalTokensAtSamePosition = true;
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
} else {
|
||||
while (true) {
|
||||
try {
|
||||
nextToken = source.next(reusableToken);
|
||||
nextToken = buffer.next(reusableToken);
|
||||
}
|
||||
catch (IOException e) {
|
||||
nextToken = null;
|
||||
}
|
||||
if (nextToken == null)
|
||||
break;
|
||||
list.add(nextToken.clone());
|
||||
numTokens++;
|
||||
if (nextToken.getPositionIncrement() != 0)
|
||||
positionCount += nextToken.getPositionIncrement();
|
||||
else
|
||||
severalTokensAtSamePosition = true;
|
||||
}
|
||||
}
|
||||
try {
|
||||
// rewind the buffer stream
|
||||
buffer.reset();
|
||||
|
||||
// close original stream - all tokens buffered
|
||||
source.close();
|
||||
}
|
||||
catch (IOException e) {
|
||||
// ignore
|
||||
}
|
||||
|
||||
if (list.size() == 0)
|
||||
if (numTokens == 0)
|
||||
return null;
|
||||
else if (list.size() == 1) {
|
||||
nextToken = (org.apache.lucene.analysis.Token) list.get(0);
|
||||
return newTermQuery(new Term(field, nextToken.term()));
|
||||
else if (numTokens == 1) {
|
||||
String term = null;
|
||||
try {
|
||||
|
||||
if (useNewAPI) {
|
||||
boolean hasNext = buffer.incrementToken();
|
||||
assert hasNext == true;
|
||||
term = termAtt.term();
|
||||
} else {
|
||||
nextToken = buffer.next(reusableToken);
|
||||
assert nextToken != null;
|
||||
term = nextToken.term();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// safe to ignore, because we know the number of tokens
|
||||
}
|
||||
return newTermQuery(new Term(field, term));
|
||||
} else {
|
||||
if (severalTokensAtSamePosition) {
|
||||
if (positionCount == 1) {
|
||||
// no phrase query:
|
||||
BooleanQuery q = newBooleanQuery(true);
|
||||
for (int i = 0; i < list.size(); i++) {
|
||||
nextToken = (org.apache.lucene.analysis.Token) list.get(i);
|
||||
for (int i = 0; i < numTokens; i++) {
|
||||
String term = null;
|
||||
try {
|
||||
if (useNewAPI) {
|
||||
boolean hasNext = buffer.incrementToken();
|
||||
assert hasNext == true;
|
||||
term = termAtt.term();
|
||||
} else {
|
||||
nextToken = buffer.next(reusableToken);
|
||||
assert nextToken != null;
|
||||
term = nextToken.term();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// safe to ignore, because we know the number of tokens
|
||||
}
|
||||
|
||||
Query currentQuery = newTermQuery(
|
||||
new Term(field, nextToken.term()));
|
||||
new Term(field, term));
|
||||
q.add(currentQuery, BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
return q;
|
||||
|
@ -594,9 +675,28 @@ public class QueryParser {
|
|||
mpq.setSlop(phraseSlop);
|
||||
List multiTerms = new ArrayList();
|
||||
int position = -1;
|
||||
for (int i = 0; i < list.size(); i++) {
|
||||
nextToken = (org.apache.lucene.analysis.Token) list.get(i);
|
||||
if (nextToken.getPositionIncrement() > 0 && multiTerms.size() > 0) {
|
||||
for (int i = 0; i < numTokens; i++) {
|
||||
String term = null;
|
||||
int positionIncrement = 1;
|
||||
try {
|
||||
if (useNewAPI) {
|
||||
boolean hasNext = buffer.incrementToken();
|
||||
assert hasNext == true;
|
||||
term = termAtt.term();
|
||||
if (posIncrAtt != null) {
|
||||
positionIncrement = posIncrAtt.getPositionIncrement();
|
||||
}
|
||||
} else {
|
||||
nextToken = buffer.next(reusableToken);
|
||||
assert nextToken != null;
|
||||
term = nextToken.term();
|
||||
positionIncrement = nextToken.getPositionIncrement();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// safe to ignore, because we know the number of tokens
|
||||
}
|
||||
|
||||
if (positionIncrement > 0 && multiTerms.size() > 0) {
|
||||
if (enablePositionIncrements) {
|
||||
mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
|
||||
} else {
|
||||
|
@ -604,8 +704,8 @@ public class QueryParser {
|
|||
}
|
||||
multiTerms.clear();
|
||||
}
|
||||
position += nextToken.getPositionIncrement();
|
||||
multiTerms.add(new Term(field, nextToken.term()));
|
||||
position += positionIncrement;
|
||||
multiTerms.add(new Term(field, term));
|
||||
}
|
||||
if (enablePositionIncrements) {
|
||||
mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
|
||||
|
@ -619,13 +719,36 @@ public class QueryParser {
|
|||
PhraseQuery pq = newPhraseQuery();
|
||||
pq.setSlop(phraseSlop);
|
||||
int position = -1;
|
||||
for (int i = 0; i < list.size(); i++) {
|
||||
nextToken = (org.apache.lucene.analysis.Token) list.get(i);
|
||||
if (enablePositionIncrements) {
|
||||
position += nextToken.getPositionIncrement();
|
||||
pq.add(new Term(field, nextToken.term()),position);
|
||||
|
||||
|
||||
for (int i = 0; i < numTokens; i++) {
|
||||
String term = null;
|
||||
int positionIncrement = 1;
|
||||
|
||||
try {
|
||||
if (useNewAPI) {
|
||||
|
||||
boolean hasNext = buffer.incrementToken();
|
||||
assert hasNext == true;
|
||||
term = termAtt.term();
|
||||
if (posIncrAtt != null) {
|
||||
positionIncrement = posIncrAtt.getPositionIncrement();
|
||||
}
|
||||
} else {
|
||||
pq.add(new Term(field, nextToken.term()));
|
||||
nextToken = buffer.next(reusableToken);
|
||||
assert nextToken != null;
|
||||
term = nextToken.term();
|
||||
positionIncrement = nextToken.getPositionIncrement();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// safe to ignore, because we know the number of tokens
|
||||
}
|
||||
|
||||
if (enablePositionIncrements) {
|
||||
position += positionIncrement;
|
||||
pq.add(new Term(field, term),position);
|
||||
} else {
|
||||
pq.add(new Term(field, term));
|
||||
}
|
||||
}
|
||||
return pq;
|
||||
|
@ -634,6 +757,7 @@ public class QueryParser {
|
|||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Base implementation delegates to {@link #getFieldQuery(String,String)}.
|
||||
* This method may be overridden, for example, to return
|
||||
|
|
|
@ -2,8 +2,8 @@
|
|||
package org.apache.lucene.queryParser;
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.text.DateFormat;
|
||||
import java.text.Collator;
|
||||
import java.text.DateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Calendar;
|
||||
import java.util.Date;
|
||||
|
@ -13,7 +13,10 @@ import java.util.Locale;
|
|||
import java.util.Map;
|
||||
import java.util.Vector;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CachingTokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.document.DateField;
|
||||
import org.apache.lucene.document.DateTools;
|
||||
import org.apache.lucene.index.Term;
|
||||
|
|
|
@ -29,6 +29,7 @@ import java.util.Map;
|
|||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.index.TermFreqVector;
|
||||
|
||||
/**
|
||||
|
@ -58,10 +59,18 @@ public class QueryTermVector implements TermFreqVector {
|
|||
{
|
||||
List terms = new ArrayList();
|
||||
try {
|
||||
if (stream.useNewAPI()) {
|
||||
stream.reset();
|
||||
TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
|
||||
while (stream.incrementToken()) {
|
||||
terms.add(termAtt.term());
|
||||
}
|
||||
} else {
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
|
||||
terms.add(nextToken.term());
|
||||
}
|
||||
}
|
||||
processTerms((String[])terms.toArray(new String[terms.size()]));
|
||||
} catch (IOException e) {
|
||||
}
|
||||
|
|
|
@ -0,0 +1,95 @@
|
|||
package org.apache.lucene.util;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* Base class for Attributes that can be added to a
|
||||
* {@link org.apache.lucene.util.AttributeSource}.
|
||||
* <p>
|
||||
* Attributes are used to add data in a dynamic, yet type-safe way to a source
|
||||
* of usually streamed objects, e. g. a {@link org.apache.lucene.analysis.TokenStream}.
|
||||
* <p><font color="#FF0000">
|
||||
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
|
||||
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
|
||||
* We will make our best efforts to keep the APIs backwards-compatible.</font>
|
||||
*/
|
||||
public abstract class Attribute implements Cloneable, Serializable {
|
||||
/**
|
||||
* Clears the values in this Attribute and resets it to its
|
||||
* default value.
|
||||
*/
|
||||
public abstract void clear();
|
||||
|
||||
/**
|
||||
* Subclasses must implement this method and should follow a syntax
|
||||
* similar to this one:
|
||||
*
|
||||
* <pre>
|
||||
* public String toString() {
|
||||
* return "start=" + startOffset + ",end=" + endOffset;
|
||||
* }
|
||||
* </pre>
|
||||
*/
|
||||
public abstract String toString();
|
||||
|
||||
/**
|
||||
* Subclasses must implement this method and should compute
|
||||
* a hashCode similar to this:
|
||||
* <pre>
|
||||
* public int hashCode() {
|
||||
* int code = startOffset;
|
||||
* code = code * 31 + endOffset;
|
||||
* return code;
|
||||
* }
|
||||
* </pre>
|
||||
*
|
||||
* see also {@link #equals(Object)}
|
||||
*/
|
||||
public abstract int hashCode();
|
||||
|
||||
/**
|
||||
* All values used for computation of {@link #hashCode()}
|
||||
* should be checked here for equality.
|
||||
*
|
||||
* see also {@link Object#equals(Object)}
|
||||
*/
|
||||
public abstract boolean equals(Object other);
|
||||
|
||||
/**
|
||||
* Copies the values from this Attribute into the passed-in
|
||||
* target attribute. The type of the target must match the type
|
||||
* of this attribute.
|
||||
*/
|
||||
public abstract void copyTo(Attribute target);
|
||||
|
||||
/**
|
||||
* Shallow clone. Subclasses must override this if they
|
||||
* need to clone any members deeply,
|
||||
*/
|
||||
public Object clone() {
|
||||
Object clone = null;
|
||||
try {
|
||||
clone = super.clone();
|
||||
} catch (CloneNotSupportedException e) {
|
||||
throw new RuntimeException(e); // shouldn't happen
|
||||
}
|
||||
return clone;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,274 @@
|
|||
package org.apache.lucene.util;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
|
||||
/**
|
||||
* An AttributeSource contains a list of different {@link Attribute}s,
|
||||
* and methods to add and get them. There can only be a single instance
|
||||
* of an attribute in the same AttributeSource instance. This is ensured
|
||||
* by passing in the actual type of the Attribute (Class<Attribute>) to
|
||||
* the {@link #addAttribute(Class)}, which then checks if an instance of
|
||||
* that type is already present. If yes, it returns the instance, otherwise
|
||||
* it creates a new instance and returns it.
|
||||
*
|
||||
* <p><font color="#FF0000">
|
||||
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
|
||||
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
|
||||
* We will make our best efforts to keep the APIs backwards-compatible.</font>
|
||||
*/
|
||||
public class AttributeSource {
|
||||
/**
|
||||
* An AttributeAcceptor defines only a single method {@link #accept(Class)}.
|
||||
* It can be used for e. g. buffering purposes to specify which attributes
|
||||
* to buffer.
|
||||
*/
|
||||
public static abstract class AttributeAcceptor {
|
||||
/** Return true, to accept this attribute; false otherwise */
|
||||
public abstract boolean accept(Class attClass);
|
||||
}
|
||||
|
||||
/**
|
||||
* Default AttributeAcceptor that accepts all attributes.
|
||||
*/
|
||||
public static final AttributeAcceptor AllAcceptor = new AttributeAcceptor() {
|
||||
public boolean accept(Class attClass) {return true;}
|
||||
};
|
||||
|
||||
/**
|
||||
* Holds the Class<Attribute> -> Attribute mapping
|
||||
*/
|
||||
protected Map attributes;
|
||||
|
||||
public AttributeSource() {
|
||||
this.attributes = new LinkedHashMap();
|
||||
}
|
||||
|
||||
public AttributeSource(AttributeSource input) {
|
||||
this.attributes = input.attributes;
|
||||
}
|
||||
|
||||
/** Returns an iterator that iterates the attributes
|
||||
* in the same order they were added in.
|
||||
*/
|
||||
public Iterator getAttributesIterator() {
|
||||
return attributes.values().iterator();
|
||||
}
|
||||
|
||||
/**
|
||||
* The caller must pass in a Class<? extends Attribute> value.
|
||||
* This method first checks if an instance of that class is
|
||||
* already in this AttributeSource and returns it. Otherwise a
|
||||
* new instance is created, added to this AttributeSource and returned.
|
||||
*/
|
||||
public Attribute addAttribute(Class attClass) {
|
||||
Attribute att = (Attribute) attributes.get(attClass);
|
||||
if (att == null) {
|
||||
try {
|
||||
att = (Attribute) attClass.newInstance();
|
||||
} catch (InstantiationException e) {
|
||||
throw new IllegalArgumentException("Could not instantiate class " + attClass);
|
||||
} catch (IllegalAccessException e) {
|
||||
throw new IllegalArgumentException("Could not instantiate class " + attClass);
|
||||
}
|
||||
|
||||
attributes.put(attClass, att);
|
||||
}
|
||||
return att;
|
||||
}
|
||||
|
||||
/** Returns true, iff this AttributeSource has any attributes */
|
||||
public boolean hasAttributes() {
|
||||
return !this.attributes.isEmpty();
|
||||
}
|
||||
|
||||
/**
|
||||
* The caller must pass in a Class<? extends Attribute> value.
|
||||
* Returns true, iff this AttributeSource contains the passed-in Attribute.
|
||||
*/
|
||||
public boolean hasAttribute(Class attClass) {
|
||||
return this.attributes.containsKey(attClass);
|
||||
}
|
||||
|
||||
/**
|
||||
* The caller must pass in a Class<? extends Attribute> value.
|
||||
* Returns the instance of the passed in Attribute contained in this AttributeSource
|
||||
*
|
||||
* @throws IllegalArgumentException if this AttributeSource does not contain the
|
||||
* Attribute
|
||||
*/
|
||||
public Attribute getAttribute(Class attClass) {
|
||||
Attribute att = (Attribute) this.attributes.get(attClass);
|
||||
if (att == null) {
|
||||
throw new IllegalArgumentException("This token does not have the attribute '" + attClass + "'.");
|
||||
}
|
||||
|
||||
return att;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets all Attributes in this AttributeSource by calling
|
||||
* {@link Attribute#clear()} on each Attribute.
|
||||
*/
|
||||
public void clearAttributes() {
|
||||
Iterator it = getAttributesIterator();
|
||||
while (it.hasNext()) {
|
||||
((Attribute) it.next()).clear();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Captures the current state of the passed in TokenStream.
|
||||
* <p>
|
||||
* This state will contain all of the passed in TokenStream's
|
||||
* {@link Attribute}s. If only a subset of the attributes is needed
|
||||
* please use {@link #captureState(AttributeAcceptor)}
|
||||
*/
|
||||
public AttributeSource captureState() {
|
||||
return captureState(AllAcceptor);
|
||||
}
|
||||
|
||||
/**
|
||||
* Captures the current state of the passed in TokenStream.
|
||||
* <p>
|
||||
* This state will contain all of the passed in TokenStream's
|
||||
* {@link Attribute}s which the {@link AttributeAcceptor} accepts.
|
||||
*/
|
||||
public AttributeSource captureState(AttributeAcceptor acceptor) {
|
||||
AttributeSource state = new AttributeSource();
|
||||
|
||||
Iterator it = getAttributesIterator();
|
||||
while(it.hasNext()) {
|
||||
Attribute att = (Attribute) it.next();
|
||||
if (acceptor.accept(att.getClass())) {
|
||||
Attribute clone = (Attribute) att.clone();
|
||||
state.attributes.put(att.getClass(), clone);
|
||||
}
|
||||
}
|
||||
|
||||
return state;
|
||||
}
|
||||
|
||||
/**
|
||||
* Restores this state by copying the values of all attributes
|
||||
* that this state contains into the attributes of the targetStream.
|
||||
* The targetStream must contain a corresponding instance for each argument
|
||||
* contained in this state.
|
||||
* <p>
|
||||
* Note that this method does not affect attributes of the targetStream
|
||||
* that are not contained in this state. In other words, if for example
|
||||
* the targetStream contains an OffsetAttribute, but this state doesn't, then
|
||||
* the value of the OffsetAttribute remains unchanged. It might be desirable to
|
||||
* reset its value to the default, in which case the caller should first
|
||||
* call {@link TokenStream#clearAttributes()} on the targetStream.
|
||||
*/
|
||||
public void restoreState(AttributeSource target) {
|
||||
Iterator it = getAttributesIterator();
|
||||
while (it.hasNext()) {
|
||||
Attribute att = (Attribute) it.next();
|
||||
Attribute targetAtt = target.getAttribute(att.getClass());
|
||||
att.copyTo(targetAtt);
|
||||
}
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
int code = 0;
|
||||
if (hasAttributes()) {
|
||||
Iterator it = getAttributesIterator();
|
||||
while (it.hasNext()) {
|
||||
code = code * 31 + it.next().hashCode();
|
||||
}
|
||||
}
|
||||
|
||||
return code;
|
||||
}
|
||||
|
||||
public boolean equals(Object obj) {
|
||||
if (obj == this) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (obj instanceof AttributeSource) {
|
||||
AttributeSource other = (AttributeSource) obj;
|
||||
|
||||
if (hasAttributes()) {
|
||||
if (!other.hasAttributes()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (attributes.size() != other.attributes.size()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
Iterator it = getAttributesIterator();
|
||||
while (it.hasNext()) {
|
||||
Class attName = it.next().getClass();
|
||||
|
||||
Attribute otherAtt = (Attribute) other.attributes.get(attName);
|
||||
if (otherAtt == null || !otherAtt.equals(attributes.get(attName))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return !other.hasAttributes();
|
||||
}
|
||||
} else
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// TODO: Java 1.5
|
||||
// private Map<Class<? extends Attribute>, Attribute> attributes;
|
||||
// public <T extends Attribute> T addAttribute(Class<T> attClass) {
|
||||
// T att = (T) attributes.get(attClass);
|
||||
// if (att == null) {
|
||||
// try {
|
||||
// att = attClass.newInstance();
|
||||
// } catch (InstantiationException e) {
|
||||
// throw new IllegalArgumentException("Could not instantiate class " + attClass);
|
||||
// } catch (IllegalAccessException e) {
|
||||
// throw new IllegalArgumentException("Could not instantiate class " + attClass);
|
||||
// }
|
||||
//
|
||||
// attributes.put(attClass, att);
|
||||
// }
|
||||
// return att;
|
||||
// }
|
||||
//
|
||||
// public boolean hasAttribute(Class<? extends Attribute> attClass) {
|
||||
// return this.attributes.containsKey(attClass);
|
||||
// }
|
||||
//
|
||||
// public <T extends Attribute> T getAttribute(Class<T> attClass) {
|
||||
// Attribute att = this.attributes.get(attClass);
|
||||
// if (att == null) {
|
||||
// throw new IllegalArgumentException("This token does not have the attribute '" + attClass + "'.");
|
||||
// }
|
||||
//
|
||||
// return (T) att;
|
||||
// }
|
||||
//
|
||||
|
||||
}
|
|
@ -17,19 +17,20 @@ package org.apache.lucene;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.SimpleAnalyzer;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.Date;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.SimpleAnalyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
class AnalysisTest {
|
||||
static File tmpFile;
|
||||
public static void main(String[] args) {
|
||||
|
@ -70,12 +71,15 @@ class AnalysisTest {
|
|||
Date start = new Date();
|
||||
|
||||
int count = 0;
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
|
||||
|
||||
stream.reset();
|
||||
TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
|
||||
OffsetAttribute offsetAtt = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class);
|
||||
while (stream.incrementToken()) {
|
||||
if (verbose) {
|
||||
System.out.println("Text=" + nextToken.term()
|
||||
+ " start=" + nextToken.startOffset()
|
||||
+ " end=" + nextToken.endOffset());
|
||||
System.out.println("Text=" + termAtt.term()
|
||||
+ " start=" + offsetAtt.startOffset()
|
||||
+ " end=" + offsetAtt.endOffset());
|
||||
}
|
||||
count++;
|
||||
}
|
||||
|
|
|
@ -18,6 +18,9 @@ package org.apache.lucene.analysis;
|
|||
|
||||
import org.apache.lucene.analysis.standard.StandardFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.English;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
|
@ -40,7 +43,8 @@ public class TeeSinkTokenTest extends LuceneTestCase {
|
|||
super(s);
|
||||
}
|
||||
|
||||
protected void setUp() {
|
||||
protected void setUp() throws Exception {
|
||||
super.setUp();
|
||||
tokens1 = new String[]{"The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs"};
|
||||
tokens2 = new String[]{"The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch"};
|
||||
buffer1 = new StringBuffer();
|
||||
|
@ -62,24 +66,29 @@ public class TeeSinkTokenTest extends LuceneTestCase {
|
|||
public void test() throws IOException {
|
||||
|
||||
SinkTokenizer sink1 = new SinkTokenizer(null) {
|
||||
public void add(Token t) {
|
||||
if (t != null && t.term().equalsIgnoreCase("The")) {
|
||||
super.add(t);
|
||||
public void add(AttributeSource a) throws IOException {
|
||||
TermAttribute termAtt = null;
|
||||
if (a.hasAttribute(TermAttribute.class)) {
|
||||
termAtt = (TermAttribute) a.getAttribute(TermAttribute.class);
|
||||
}
|
||||
if (termAtt != null && termAtt.term().equalsIgnoreCase("The")) {
|
||||
super.add(a);
|
||||
}
|
||||
}
|
||||
};
|
||||
TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), sink1);
|
||||
int i = 0;
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = source.next(reusableToken); nextToken != null; nextToken = source.next(reusableToken)) {
|
||||
assertTrue(nextToken.term() + " is not equal to " + tokens1[i], nextToken.term().equals(tokens1[i]) == true);
|
||||
TermAttribute termAtt = (TermAttribute) source.getAttribute(TermAttribute.class);
|
||||
while (source.incrementToken()) {
|
||||
assertTrue(termAtt.term() + " is not equal to " + tokens1[i], termAtt.term().equals(tokens1[i]) == true);
|
||||
i++;
|
||||
}
|
||||
assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
|
||||
assertTrue("sink1 Size: " + sink1.getTokens().size() + " is not: " + 2, sink1.getTokens().size() == 2);
|
||||
i = 0;
|
||||
for (Token token = sink1.next(reusableToken); token != null; token = sink1.next(reusableToken)) {
|
||||
assertTrue(token.term() + " is not equal to " + "The", token.term().equalsIgnoreCase("The") == true);
|
||||
termAtt = (TermAttribute) sink1.getAttribute(TermAttribute.class);
|
||||
while (sink1.incrementToken()) {
|
||||
assertTrue(termAtt.term() + " is not equal to " + "The", termAtt.term().equalsIgnoreCase("The") == true);
|
||||
i++;
|
||||
}
|
||||
assertTrue(i + " does not equal: " + sink1.getTokens().size(), i == sink1.getTokens().size());
|
||||
|
@ -87,55 +96,67 @@ public class TeeSinkTokenTest extends LuceneTestCase {
|
|||
|
||||
public void testMultipleSources() throws Exception {
|
||||
SinkTokenizer theDetector = new SinkTokenizer(null) {
|
||||
public void add(Token t) {
|
||||
if (t != null && t.term().equalsIgnoreCase("The")) {
|
||||
super.add(t);
|
||||
public void add(AttributeSource a) throws IOException {
|
||||
TermAttribute termAtt = null;
|
||||
if (a.hasAttribute(TermAttribute.class)) {
|
||||
termAtt = (TermAttribute) a.getAttribute(TermAttribute.class);
|
||||
}
|
||||
if (termAtt != null && termAtt.term().equalsIgnoreCase("The")) {
|
||||
super.add(a);
|
||||
}
|
||||
}
|
||||
};
|
||||
SinkTokenizer dogDetector = new SinkTokenizer(null) {
|
||||
public void add(Token t) {
|
||||
if (t != null && t.term().equalsIgnoreCase("Dogs")) {
|
||||
super.add(t);
|
||||
public void add(AttributeSource a) throws IOException {
|
||||
TermAttribute termAtt = null;
|
||||
if (a.hasAttribute(TermAttribute.class)) {
|
||||
termAtt = (TermAttribute) a.getAttribute(TermAttribute.class);
|
||||
}
|
||||
if (termAtt != null && termAtt.term().equalsIgnoreCase("Dogs")) {
|
||||
super.add(a);
|
||||
}
|
||||
}
|
||||
};
|
||||
TokenStream source1 = new CachingTokenFilter(new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), theDetector), dogDetector));
|
||||
TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer2.toString())), theDetector), dogDetector);
|
||||
int i = 0;
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = source1.next(reusableToken); nextToken != null; nextToken = source1.next(reusableToken)) {
|
||||
assertTrue(nextToken.term() + " is not equal to " + tokens1[i], nextToken.term().equals(tokens1[i]) == true);
|
||||
TermAttribute termAtt = (TermAttribute) source1.getAttribute(TermAttribute.class);
|
||||
while (source1.incrementToken()) {
|
||||
assertTrue(termAtt.term() + " is not equal to " + tokens1[i], termAtt.term().equals(tokens1[i]) == true);
|
||||
i++;
|
||||
}
|
||||
assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
|
||||
assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 2, theDetector.getTokens().size() == 2);
|
||||
assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 1, dogDetector.getTokens().size() == 1);
|
||||
i = 0;
|
||||
for (Token nextToken = source2.next(reusableToken); nextToken != null; nextToken = source2.next(reusableToken)) {
|
||||
assertTrue(nextToken.term() + " is not equal to " + tokens2[i], nextToken.term().equals(tokens2[i]) == true);
|
||||
termAtt = (TermAttribute) source2.getAttribute(TermAttribute.class);
|
||||
while (source2.incrementToken()) {
|
||||
assertTrue(termAtt.term() + " is not equal to " + tokens2[i], termAtt.term().equals(tokens2[i]) == true);
|
||||
i++;
|
||||
}
|
||||
assertTrue(i + " does not equal: " + tokens2.length, i == tokens2.length);
|
||||
assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 4, theDetector.getTokens().size() == 4);
|
||||
assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 2, dogDetector.getTokens().size() == 2);
|
||||
i = 0;
|
||||
for (Token nextToken = theDetector.next(reusableToken); nextToken != null; nextToken = theDetector.next(reusableToken)) {
|
||||
assertTrue(nextToken.term() + " is not equal to " + "The", nextToken.term().equalsIgnoreCase("The") == true);
|
||||
termAtt = (TermAttribute) theDetector.getAttribute(TermAttribute.class);
|
||||
while (theDetector.incrementToken()) {
|
||||
assertTrue(termAtt.term() + " is not equal to " + "The", termAtt.term().equalsIgnoreCase("The") == true);
|
||||
i++;
|
||||
}
|
||||
assertTrue(i + " does not equal: " + theDetector.getTokens().size(), i == theDetector.getTokens().size());
|
||||
i = 0;
|
||||
for (Token nextToken = dogDetector.next(reusableToken); nextToken != null; nextToken = dogDetector.next(reusableToken)) {
|
||||
assertTrue(nextToken.term() + " is not equal to " + "Dogs", nextToken.term().equalsIgnoreCase("Dogs") == true);
|
||||
termAtt = (TermAttribute) dogDetector.getAttribute(TermAttribute.class);
|
||||
while (dogDetector.incrementToken()) {
|
||||
assertTrue(termAtt.term() + " is not equal to " + "Dogs", termAtt.term().equalsIgnoreCase("Dogs") == true);
|
||||
i++;
|
||||
}
|
||||
assertTrue(i + " does not equal: " + dogDetector.getTokens().size(), i == dogDetector.getTokens().size());
|
||||
source1.reset();
|
||||
TokenStream lowerCasing = new LowerCaseFilter(source1);
|
||||
i = 0;
|
||||
for (Token nextToken = lowerCasing.next(reusableToken); nextToken != null; nextToken = lowerCasing.next(reusableToken)) {
|
||||
assertTrue(nextToken.term() + " is not equal to " + tokens1[i].toLowerCase(), nextToken.term().equals(tokens1[i].toLowerCase()) == true);
|
||||
termAtt = (TermAttribute) lowerCasing.getAttribute(TermAttribute.class);
|
||||
while (lowerCasing.incrementToken()) {
|
||||
assertTrue(termAtt.term() + " is not equal to " + tokens1[i].toLowerCase(), termAtt.term().equals(tokens1[i].toLowerCase()) == true);
|
||||
i++;
|
||||
}
|
||||
assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
|
||||
|
@ -157,21 +178,20 @@ public class TeeSinkTokenTest extends LuceneTestCase {
|
|||
}
|
||||
//make sure we produce the same tokens
|
||||
ModuloSinkTokenizer sink = new ModuloSinkTokenizer(tokCount[k], 100);
|
||||
final Token reusableToken = new Token();
|
||||
TokenStream stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink);
|
||||
while (stream.next(reusableToken) != null) {
|
||||
while (stream.incrementToken()) {
|
||||
}
|
||||
stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), 100);
|
||||
List tmp = new ArrayList();
|
||||
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
|
||||
tmp.add(nextToken.clone());
|
||||
while (stream.incrementToken()) {
|
||||
tmp.add(stream.captureState());
|
||||
}
|
||||
List sinkList = sink.getTokens();
|
||||
assertTrue("tmp Size: " + tmp.size() + " is not: " + sinkList.size(), tmp.size() == sinkList.size());
|
||||
for (int i = 0; i < tmp.size(); i++) {
|
||||
Token tfTok = (Token) tmp.get(i);
|
||||
Token sinkTok = (Token) sinkList.get(i);
|
||||
assertTrue(tfTok.term() + " is not equal to " + sinkTok.term() + " at token: " + i, tfTok.term().equals(sinkTok.term()) == true);
|
||||
AttributeSource tfTok = (AttributeSource) tmp.get(i);
|
||||
AttributeSource sinkTok = (AttributeSource) sinkList.get(i);
|
||||
assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.equals(sinkTok) == true);
|
||||
}
|
||||
//simulate two fields, each being analyzed once, for 20 documents
|
||||
|
||||
|
@ -180,12 +200,14 @@ public class TeeSinkTokenTest extends LuceneTestCase {
|
|||
long start = System.currentTimeMillis();
|
||||
for (int i = 0; i < 20; i++) {
|
||||
stream = new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString())));
|
||||
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
|
||||
tfPos += nextToken.getPositionIncrement();
|
||||
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
|
||||
while (stream.incrementToken()) {
|
||||
tfPos += posIncrAtt.getPositionIncrement();
|
||||
}
|
||||
stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), modCounts[j]);
|
||||
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
|
||||
tfPos += nextToken.getPositionIncrement();
|
||||
posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
|
||||
while (stream.incrementToken()) {
|
||||
tfPos += posIncrAtt.getPositionIncrement();
|
||||
}
|
||||
}
|
||||
long finish = System.currentTimeMillis();
|
||||
|
@ -196,13 +218,15 @@ public class TeeSinkTokenTest extends LuceneTestCase {
|
|||
for (int i = 0; i < 20; i++) {
|
||||
sink = new ModuloSinkTokenizer(tokCount[k], modCounts[j]);
|
||||
stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink);
|
||||
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
|
||||
sinkPos += nextToken.getPositionIncrement();
|
||||
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
|
||||
while (stream.incrementToken()) {
|
||||
sinkPos += posIncrAtt.getPositionIncrement();
|
||||
}
|
||||
//System.out.println("Modulo--------");
|
||||
stream = sink;
|
||||
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
|
||||
sinkPos += nextToken.getPositionIncrement();
|
||||
posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
|
||||
while (stream.incrementToken()) {
|
||||
sinkPos += posIncrAtt.getPositionIncrement();
|
||||
}
|
||||
}
|
||||
finish = System.currentTimeMillis();
|
||||
|
@ -228,15 +252,15 @@ public class TeeSinkTokenTest extends LuceneTestCase {
|
|||
int count = 0;
|
||||
|
||||
//return every 100 tokens
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
Token nextToken = null;
|
||||
for (nextToken = input.next(reusableToken);
|
||||
nextToken != null && count % modCount != 0;
|
||||
nextToken = input.next(reusableToken)) {
|
||||
public boolean incrementToken() throws IOException {
|
||||
boolean hasNext;
|
||||
for (hasNext = input.incrementToken();
|
||||
hasNext && count % modCount != 0;
|
||||
hasNext = input.incrementToken()) {
|
||||
count++;
|
||||
}
|
||||
count++;
|
||||
return nextToken;
|
||||
return hasNext;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -250,9 +274,9 @@ public class TeeSinkTokenTest extends LuceneTestCase {
|
|||
lst = new ArrayList(numToks % mc);
|
||||
}
|
||||
|
||||
public void add(Token t) {
|
||||
if (t != null && count % modCount == 0) {
|
||||
super.add(t);
|
||||
public void add(AttributeSource a) throws IOException {
|
||||
if (a != null && count % modCount == 0) {
|
||||
super.add(a);
|
||||
}
|
||||
count++;
|
||||
}
|
||||
|
|
|
@ -19,10 +19,10 @@ package org.apache.lucene.analysis;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.index.Payload;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
|
@ -36,13 +36,12 @@ public class TestAnalyzers extends LuceneTestCase {
|
|||
String input,
|
||||
String[] output) throws Exception {
|
||||
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
|
||||
final Token reusableToken = new Token();
|
||||
TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
|
||||
for (int i=0; i<output.length; i++) {
|
||||
Token nextToken = ts.next(reusableToken);
|
||||
assertNotNull(nextToken);
|
||||
assertEquals(nextToken.term(), output[i]);
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(termAtt.term(), output[i]);
|
||||
}
|
||||
assertNull(ts.next(reusableToken));
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.close();
|
||||
}
|
||||
|
||||
|
@ -95,14 +94,13 @@ public class TestAnalyzers extends LuceneTestCase {
|
|||
}
|
||||
|
||||
void verifyPayload(TokenStream ts) throws IOException {
|
||||
final Token reusableToken = new Token();
|
||||
PayloadAttribute payloadAtt = (PayloadAttribute) ts.getAttribute(PayloadAttribute.class);
|
||||
for(byte b=1;;b++) {
|
||||
reusableToken.clear();
|
||||
Token nextToken = ts.next(reusableToken);
|
||||
if (nextToken==null) break;
|
||||
boolean hasNext = ts.incrementToken();
|
||||
if (!hasNext) break;
|
||||
// System.out.println("id="+System.identityHashCode(nextToken) + " " + t);
|
||||
// System.out.println("payload=" + (int)nextToken.getPayload().toByteArray()[0]);
|
||||
assertEquals(b, nextToken.getPayload().toByteArray()[0]);
|
||||
assertEquals(b, payloadAtt.getPayload().toByteArray()[0]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -111,13 +109,11 @@ public class TestAnalyzers extends LuceneTestCase {
|
|||
String s = "how now brown cow";
|
||||
TokenStream ts;
|
||||
ts = new WhitespaceTokenizer(new StringReader(s));
|
||||
ts = new BuffTokenFilter(ts);
|
||||
ts = new PayloadSetter(ts);
|
||||
verifyPayload(ts);
|
||||
|
||||
ts = new WhitespaceTokenizer(new StringReader(s));
|
||||
ts = new PayloadSetter(ts);
|
||||
ts = new BuffTokenFilter(ts);
|
||||
verifyPayload(ts);
|
||||
}
|
||||
|
||||
|
@ -136,38 +132,21 @@ public class TestAnalyzers extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
class BuffTokenFilter extends TokenFilter {
|
||||
List lst;
|
||||
|
||||
public BuffTokenFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
if (lst == null) {
|
||||
lst = new LinkedList();
|
||||
for(Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
|
||||
lst.add(nextToken.clone());
|
||||
}
|
||||
}
|
||||
return lst.size()==0 ? null : (Token)lst.remove(0);
|
||||
}
|
||||
}
|
||||
|
||||
class PayloadSetter extends TokenFilter {
|
||||
PayloadAttribute payloadAtt;
|
||||
public PayloadSetter(TokenStream input) {
|
||||
super(input);
|
||||
payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
|
||||
}
|
||||
|
||||
byte[] data = new byte[1];
|
||||
Payload p = new Payload(data,0,1);
|
||||
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken==null) return null;
|
||||
nextToken.setPayload(p); // reuse the payload / byte[]
|
||||
public boolean incrementToken() throws IOException {
|
||||
boolean hasNext = input.incrementToken();
|
||||
if (!hasNext) return false;
|
||||
payloadAtt.setPayload(p); // reuse the payload / byte[]
|
||||
data[0]++;
|
||||
return nextToken;
|
||||
return true;
|
||||
}
|
||||
}
|
|
@ -22,6 +22,8 @@ import java.io.IOException;
|
|||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.Field.TermVector;
|
||||
|
@ -41,13 +43,17 @@ public class TestCachingTokenFilter extends LuceneTestCase {
|
|||
Document doc = new Document();
|
||||
TokenStream stream = new TokenStream() {
|
||||
private int index = 0;
|
||||
private TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
private OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (index == tokens.length) {
|
||||
return null;
|
||||
return false;
|
||||
} else {
|
||||
return reusableToken.reinit(tokens[index++], 0, 0);
|
||||
termAtt.setTermBuffer(tokens[index++]);
|
||||
offsetAtt.setStartOffset(0);
|
||||
offsetAtt.setEndOffset(0);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -92,10 +98,12 @@ public class TestCachingTokenFilter extends LuceneTestCase {
|
|||
|
||||
private void checkTokens(TokenStream stream) throws IOException {
|
||||
int count = 0;
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
|
||||
|
||||
TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
|
||||
assertNotNull(termAtt);
|
||||
while (stream.incrementToken()) {
|
||||
assertTrue(count < tokens.length);
|
||||
assertEquals(tokens[count], nextToken.term());
|
||||
assertEquals(tokens[count], termAtt.term());
|
||||
count++;
|
||||
}
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import java.io.StringReader;
|
||||
|
@ -25,82 +26,87 @@ public class TestISOLatin1AccentFilter extends LuceneTestCase {
|
|||
public void testU() throws Exception {
|
||||
TokenStream stream = new WhitespaceTokenizer(new StringReader("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï IJ Ð Ñ Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ij ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl"));
|
||||
ISOLatin1AccentFilter filter = new ISOLatin1AccentFilter(stream);
|
||||
final Token reusableToken = new Token();
|
||||
assertEquals("Des", filter.next(reusableToken).term());
|
||||
assertEquals("mot", filter.next(reusableToken).term());
|
||||
assertEquals("cles", filter.next(reusableToken).term());
|
||||
assertEquals("A", filter.next(reusableToken).term());
|
||||
assertEquals("LA", filter.next(reusableToken).term());
|
||||
assertEquals("CHAINE", filter.next(reusableToken).term());
|
||||
assertEquals("A", filter.next(reusableToken).term());
|
||||
assertEquals("A", filter.next(reusableToken).term());
|
||||
assertEquals("A", filter.next(reusableToken).term());
|
||||
assertEquals("A", filter.next(reusableToken).term());
|
||||
assertEquals("A", filter.next(reusableToken).term());
|
||||
assertEquals("A", filter.next(reusableToken).term());
|
||||
assertEquals("AE", filter.next(reusableToken).term());
|
||||
assertEquals("C", filter.next(reusableToken).term());
|
||||
assertEquals("E", filter.next(reusableToken).term());
|
||||
assertEquals("E", filter.next(reusableToken).term());
|
||||
assertEquals("E", filter.next(reusableToken).term());
|
||||
assertEquals("E", filter.next(reusableToken).term());
|
||||
assertEquals("I", filter.next(reusableToken).term());
|
||||
assertEquals("I", filter.next(reusableToken).term());
|
||||
assertEquals("I", filter.next(reusableToken).term());
|
||||
assertEquals("I", filter.next(reusableToken).term());
|
||||
assertEquals("IJ", filter.next(reusableToken).term());
|
||||
assertEquals("D", filter.next(reusableToken).term());
|
||||
assertEquals("N", filter.next(reusableToken).term());
|
||||
assertEquals("O", filter.next(reusableToken).term());
|
||||
assertEquals("O", filter.next(reusableToken).term());
|
||||
assertEquals("O", filter.next(reusableToken).term());
|
||||
assertEquals("O", filter.next(reusableToken).term());
|
||||
assertEquals("O", filter.next(reusableToken).term());
|
||||
assertEquals("O", filter.next(reusableToken).term());
|
||||
assertEquals("OE", filter.next(reusableToken).term());
|
||||
assertEquals("TH", filter.next(reusableToken).term());
|
||||
assertEquals("U", filter.next(reusableToken).term());
|
||||
assertEquals("U", filter.next(reusableToken).term());
|
||||
assertEquals("U", filter.next(reusableToken).term());
|
||||
assertEquals("U", filter.next(reusableToken).term());
|
||||
assertEquals("Y", filter.next(reusableToken).term());
|
||||
assertEquals("Y", filter.next(reusableToken).term());
|
||||
assertEquals("a", filter.next(reusableToken).term());
|
||||
assertEquals("a", filter.next(reusableToken).term());
|
||||
assertEquals("a", filter.next(reusableToken).term());
|
||||
assertEquals("a", filter.next(reusableToken).term());
|
||||
assertEquals("a", filter.next(reusableToken).term());
|
||||
assertEquals("a", filter.next(reusableToken).term());
|
||||
assertEquals("ae", filter.next(reusableToken).term());
|
||||
assertEquals("c", filter.next(reusableToken).term());
|
||||
assertEquals("e", filter.next(reusableToken).term());
|
||||
assertEquals("e", filter.next(reusableToken).term());
|
||||
assertEquals("e", filter.next(reusableToken).term());
|
||||
assertEquals("e", filter.next(reusableToken).term());
|
||||
assertEquals("i", filter.next(reusableToken).term());
|
||||
assertEquals("i", filter.next(reusableToken).term());
|
||||
assertEquals("i", filter.next(reusableToken).term());
|
||||
assertEquals("i", filter.next(reusableToken).term());
|
||||
assertEquals("ij", filter.next(reusableToken).term());
|
||||
assertEquals("d", filter.next(reusableToken).term());
|
||||
assertEquals("n", filter.next(reusableToken).term());
|
||||
assertEquals("o", filter.next(reusableToken).term());
|
||||
assertEquals("o", filter.next(reusableToken).term());
|
||||
assertEquals("o", filter.next(reusableToken).term());
|
||||
assertEquals("o", filter.next(reusableToken).term());
|
||||
assertEquals("o", filter.next(reusableToken).term());
|
||||
assertEquals("o", filter.next(reusableToken).term());
|
||||
assertEquals("oe", filter.next(reusableToken).term());
|
||||
assertEquals("ss", filter.next(reusableToken).term());
|
||||
assertEquals("th", filter.next(reusableToken).term());
|
||||
assertEquals("u", filter.next(reusableToken).term());
|
||||
assertEquals("u", filter.next(reusableToken).term());
|
||||
assertEquals("u", filter.next(reusableToken).term());
|
||||
assertEquals("u", filter.next(reusableToken).term());
|
||||
assertEquals("y", filter.next(reusableToken).term());
|
||||
assertEquals("y", filter.next(reusableToken).term());
|
||||
assertEquals("fi", filter.next(reusableToken).term());
|
||||
assertEquals("fl", filter.next(reusableToken).term());
|
||||
assertNull(filter.next(reusableToken));
|
||||
TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
|
||||
assertTermEquals("Des", filter, termAtt);
|
||||
assertTermEquals("mot", filter, termAtt);
|
||||
assertTermEquals("cles", filter, termAtt);
|
||||
assertTermEquals("A", filter, termAtt);
|
||||
assertTermEquals("LA", filter, termAtt);
|
||||
assertTermEquals("CHAINE", filter, termAtt);
|
||||
assertTermEquals("A", filter, termAtt);
|
||||
assertTermEquals("A", filter, termAtt);
|
||||
assertTermEquals("A", filter, termAtt);
|
||||
assertTermEquals("A", filter, termAtt);
|
||||
assertTermEquals("A", filter, termAtt);
|
||||
assertTermEquals("A", filter, termAtt);
|
||||
assertTermEquals("AE", filter, termAtt);
|
||||
assertTermEquals("C", filter, termAtt);
|
||||
assertTermEquals("E", filter, termAtt);
|
||||
assertTermEquals("E", filter, termAtt);
|
||||
assertTermEquals("E", filter, termAtt);
|
||||
assertTermEquals("E", filter, termAtt);
|
||||
assertTermEquals("I", filter, termAtt);
|
||||
assertTermEquals("I", filter, termAtt);
|
||||
assertTermEquals("I", filter, termAtt);
|
||||
assertTermEquals("I", filter, termAtt);
|
||||
assertTermEquals("IJ", filter, termAtt);
|
||||
assertTermEquals("D", filter, termAtt);
|
||||
assertTermEquals("N", filter, termAtt);
|
||||
assertTermEquals("O", filter, termAtt);
|
||||
assertTermEquals("O", filter, termAtt);
|
||||
assertTermEquals("O", filter, termAtt);
|
||||
assertTermEquals("O", filter, termAtt);
|
||||
assertTermEquals("O", filter, termAtt);
|
||||
assertTermEquals("O", filter, termAtt);
|
||||
assertTermEquals("OE", filter, termAtt);
|
||||
assertTermEquals("TH", filter, termAtt);
|
||||
assertTermEquals("U", filter, termAtt);
|
||||
assertTermEquals("U", filter, termAtt);
|
||||
assertTermEquals("U", filter, termAtt);
|
||||
assertTermEquals("U", filter, termAtt);
|
||||
assertTermEquals("Y", filter, termAtt);
|
||||
assertTermEquals("Y", filter, termAtt);
|
||||
assertTermEquals("a", filter, termAtt);
|
||||
assertTermEquals("a", filter, termAtt);
|
||||
assertTermEquals("a", filter, termAtt);
|
||||
assertTermEquals("a", filter, termAtt);
|
||||
assertTermEquals("a", filter, termAtt);
|
||||
assertTermEquals("a", filter, termAtt);
|
||||
assertTermEquals("ae", filter, termAtt);
|
||||
assertTermEquals("c", filter, termAtt);
|
||||
assertTermEquals("e", filter, termAtt);
|
||||
assertTermEquals("e", filter, termAtt);
|
||||
assertTermEquals("e", filter, termAtt);
|
||||
assertTermEquals("e", filter, termAtt);
|
||||
assertTermEquals("i", filter, termAtt);
|
||||
assertTermEquals("i", filter, termAtt);
|
||||
assertTermEquals("i", filter, termAtt);
|
||||
assertTermEquals("i", filter, termAtt);
|
||||
assertTermEquals("ij", filter, termAtt);
|
||||
assertTermEquals("d", filter, termAtt);
|
||||
assertTermEquals("n", filter, termAtt);
|
||||
assertTermEquals("o", filter, termAtt);
|
||||
assertTermEquals("o", filter, termAtt);
|
||||
assertTermEquals("o", filter, termAtt);
|
||||
assertTermEquals("o", filter, termAtt);
|
||||
assertTermEquals("o", filter, termAtt);
|
||||
assertTermEquals("o", filter, termAtt);
|
||||
assertTermEquals("oe", filter, termAtt);
|
||||
assertTermEquals("ss", filter, termAtt);
|
||||
assertTermEquals("th", filter, termAtt);
|
||||
assertTermEquals("u", filter, termAtt);
|
||||
assertTermEquals("u", filter, termAtt);
|
||||
assertTermEquals("u", filter, termAtt);
|
||||
assertTermEquals("u", filter, termAtt);
|
||||
assertTermEquals("y", filter, termAtt);
|
||||
assertTermEquals("y", filter, termAtt);
|
||||
assertTermEquals("fi", filter, termAtt);
|
||||
assertTermEquals("fl", filter, termAtt);
|
||||
assertFalse(filter.incrementToken());
|
||||
}
|
||||
|
||||
void assertTermEquals(String expected, TokenStream stream, TermAttribute termAtt) throws Exception {
|
||||
assertTrue(stream.incrementToken());
|
||||
assertEquals(expected, termAtt.term());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis;
|
|||
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
@ -88,9 +89,9 @@ public class TestKeywordAnalyzer extends LuceneTestCase {
|
|||
// LUCENE-1441
|
||||
public void testOffsets() throws Exception {
|
||||
TokenStream stream = new KeywordAnalyzer().tokenStream("field", new StringReader("abcd"));
|
||||
Token token = new Token();
|
||||
assertTrue(stream.next(token) != null);
|
||||
assertEquals(0, token.startOffset);
|
||||
assertEquals(4, token.endOffset);
|
||||
OffsetAttribute offsetAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);
|
||||
assertTrue(stream.incrementToken());
|
||||
assertEquals(0, offsetAtt.startOffset());
|
||||
assertEquals(4, offsetAtt.endOffset());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import java.io.StringReader;
|
||||
|
@ -27,11 +28,15 @@ public class TestLengthFilter extends LuceneTestCase {
|
|||
TokenStream stream = new WhitespaceTokenizer(
|
||||
new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
|
||||
LengthFilter filter = new LengthFilter(stream, 2, 6);
|
||||
final Token reusableToken = new Token();
|
||||
assertEquals("short", filter.next(reusableToken).term());
|
||||
assertEquals("ab", filter.next(reusableToken).term());
|
||||
assertEquals("foo", filter.next(reusableToken).term());
|
||||
assertNull(filter.next(reusableToken));
|
||||
TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
|
||||
|
||||
assertTrue(filter.incrementToken());
|
||||
assertEquals("short", termAtt.term());
|
||||
assertTrue(filter.incrementToken());
|
||||
assertEquals("ab", termAtt.term());
|
||||
assertTrue(filter.incrementToken());
|
||||
assertEquals("foo", termAtt.term());
|
||||
assertFalse(filter.incrementToken());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
|
@ -29,17 +31,19 @@ public class TestPerFieldAnalzyerWrapper extends LuceneTestCase {
|
|||
|
||||
TokenStream tokenStream = analyzer.tokenStream("field",
|
||||
new StringReader(text));
|
||||
final Token reusableToken = new Token();
|
||||
Token nextToken = tokenStream.next(reusableToken);
|
||||
TermAttribute termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
|
||||
|
||||
assertTrue(tokenStream.incrementToken());
|
||||
assertEquals("WhitespaceAnalyzer does not lowercase",
|
||||
"Qwerty",
|
||||
nextToken.term());
|
||||
termAtt.term());
|
||||
|
||||
tokenStream = analyzer.tokenStream("special",
|
||||
new StringReader(text));
|
||||
nextToken = tokenStream.next(reusableToken);
|
||||
termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
|
||||
assertTrue(tokenStream.incrementToken());
|
||||
assertEquals("SimpleAnalyzer lowercases",
|
||||
"qwerty",
|
||||
nextToken.term());
|
||||
termAtt.term());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,10 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import java.io.StringReader;
|
||||
|
@ -35,19 +39,25 @@ public class TestStandardAnalyzer extends LuceneTestCase {
|
|||
|
||||
public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes, int[] expectedPosIncrs) throws Exception {
|
||||
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
|
||||
final Token reusableToken = new Token();
|
||||
// TODO Java 1.5
|
||||
//final TypeAttribute typeAtt = reusableToken.getAttribute(TypeAttribute.class);
|
||||
//final PositionIncrementAttribute posIncrAtt = reusableToken.getAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
final TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
|
||||
final TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
|
||||
final PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) ts.getAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
for (int i = 0; i < expectedImages.length; i++) {
|
||||
Token nextToken = ts.next(reusableToken);
|
||||
assertNotNull(nextToken);
|
||||
assertEquals(expectedImages[i], nextToken.term());
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(expectedImages[i], new String(termAtt.termBuffer(), 0, termAtt.termLength()));
|
||||
if (expectedTypes != null) {
|
||||
assertEquals(expectedTypes[i], nextToken.type());
|
||||
assertEquals(expectedTypes[i], typeAtt.type());
|
||||
}
|
||||
if (expectedPosIncrs != null) {
|
||||
assertEquals(expectedPosIncrs[i], nextToken.getPositionIncrement());
|
||||
assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
|
||||
}
|
||||
}
|
||||
assertNull(ts.next(reusableToken));
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.close();
|
||||
}
|
||||
|
||||
|
|
|
@ -17,6 +17,8 @@ package org.apache.lucene.analysis;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import java.io.StringReader;
|
||||
|
@ -45,9 +47,10 @@ public class TestStopAnalyzer extends LuceneTestCase {
|
|||
StringReader reader = new StringReader("This is a test of the english stop analyzer");
|
||||
TokenStream stream = stop.tokenStream("test", reader);
|
||||
assertTrue(stream != null);
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
|
||||
assertFalse(inValidTokens.contains(nextToken.term()));
|
||||
TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
|
||||
|
||||
while (stream.incrementToken()) {
|
||||
assertFalse(inValidTokens.contains(termAtt.term()));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -60,11 +63,13 @@ public class TestStopAnalyzer extends LuceneTestCase {
|
|||
StringReader reader = new StringReader("This is a good test of the english stop analyzer");
|
||||
TokenStream stream = newStop.tokenStream("test", reader);
|
||||
assertNotNull(stream);
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
|
||||
String text = nextToken.term();
|
||||
TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
|
||||
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
while (stream.incrementToken()) {
|
||||
String text = termAtt.term();
|
||||
assertFalse(stopWordsSet.contains(text));
|
||||
assertEquals(1,nextToken.getPositionIncrement()); // by default stop tokenizer does not apply increments.
|
||||
assertEquals(1,posIncrAtt.getPositionIncrement()); // by default stop tokenizer does not apply increments.
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -82,11 +87,13 @@ public class TestStopAnalyzer extends LuceneTestCase {
|
|||
TokenStream stream = newStop.tokenStream("test", reader);
|
||||
assertNotNull(stream);
|
||||
int i = 0;
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
|
||||
String text = nextToken.term();
|
||||
TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
|
||||
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
while (stream.incrementToken()) {
|
||||
String text = termAtt.term();
|
||||
assertFalse(stopWordsSet.contains(text));
|
||||
assertEquals(expectedIncr[i++],nextToken.getPositionIncrement());
|
||||
assertEquals(expectedIncr[i++],posIncrAtt.getPositionIncrement());
|
||||
}
|
||||
} finally {
|
||||
StopFilter.setEnablePositionIncrementsDefault(defaultEnable);
|
||||
|
|
|
@ -16,6 +16,8 @@ package org.apache.lucene.analysis;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.util.English;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
|
@ -35,19 +37,22 @@ public class TestStopFilter extends LuceneTestCase {
|
|||
StringReader reader = new StringReader("Now is The Time");
|
||||
String[] stopWords = new String[] { "is", "the", "Time" };
|
||||
TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopWords);
|
||||
final Token reusableToken = new Token();
|
||||
assertEquals("Now", stream.next(reusableToken).term());
|
||||
assertEquals("The", stream.next(reusableToken).term());
|
||||
assertEquals(null, stream.next(reusableToken));
|
||||
final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
|
||||
assertTrue(stream.incrementToken());
|
||||
assertEquals("Now", termAtt.term());
|
||||
assertTrue(stream.incrementToken());
|
||||
assertEquals("The", termAtt.term());
|
||||
assertFalse(stream.incrementToken());
|
||||
}
|
||||
|
||||
public void testIgnoreCase() throws IOException {
|
||||
StringReader reader = new StringReader("Now is The Time");
|
||||
String[] stopWords = new String[] { "is", "the", "Time" };
|
||||
TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopWords, true);
|
||||
final Token reusableToken = new Token();
|
||||
assertEquals("Now", stream.next(reusableToken).term());
|
||||
assertEquals(null,stream.next(reusableToken));
|
||||
final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
|
||||
assertTrue(stream.incrementToken());
|
||||
assertEquals("Now", termAtt.term());
|
||||
assertFalse(stream.incrementToken());
|
||||
}
|
||||
|
||||
public void testStopFilt() throws IOException {
|
||||
|
@ -55,10 +60,12 @@ public class TestStopFilter extends LuceneTestCase {
|
|||
String[] stopWords = new String[] { "is", "the", "Time" };
|
||||
Set stopSet = StopFilter.makeStopSet(stopWords);
|
||||
TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopSet);
|
||||
final Token reusableToken = new Token();
|
||||
assertEquals("Now", stream.next(reusableToken).term());
|
||||
assertEquals("The", stream.next(reusableToken).term());
|
||||
assertEquals(null, stream.next(reusableToken));
|
||||
final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
|
||||
assertTrue(stream.incrementToken());
|
||||
assertEquals("Now", termAtt.term());
|
||||
assertTrue(stream.incrementToken());
|
||||
assertEquals("The", termAtt.term());
|
||||
assertFalse(stream.incrementToken());
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -110,15 +117,16 @@ public class TestStopFilter extends LuceneTestCase {
|
|||
private void doTestStopPositons(StopFilter stpf, boolean enableIcrements) throws IOException {
|
||||
log("---> test with enable-increments-"+(enableIcrements?"enabled":"disabled"));
|
||||
stpf.setEnablePositionIncrements(enableIcrements);
|
||||
final Token reusableToken = new Token();
|
||||
TermAttribute termAtt = (TermAttribute) stpf.getAttribute(TermAttribute.class);
|
||||
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stpf.getAttribute(PositionIncrementAttribute.class);
|
||||
for (int i=0; i<20; i+=3) {
|
||||
Token nextToken = stpf.next(reusableToken);
|
||||
log("Token "+i+": "+nextToken);
|
||||
assertTrue(stpf.incrementToken());
|
||||
log("Token "+i+": "+stpf);
|
||||
String w = English.intToEnglish(i).trim();
|
||||
assertEquals("expecting token "+i+" to be "+w,w,nextToken.term());
|
||||
assertEquals("all but first token must have position increment of 3",enableIcrements?(i==0?1:3):1,nextToken.getPositionIncrement());
|
||||
assertEquals("expecting token "+i+" to be "+w,w,termAtt.term());
|
||||
assertEquals("all but first token must have position increment of 3",enableIcrements?(i==0?1:3):1,posIncrAtt.getPositionIncrement());
|
||||
}
|
||||
assertNull(stpf.next(reusableToken));
|
||||
assertFalse(stpf.incrementToken());
|
||||
}
|
||||
|
||||
// print debug info depending on VERBOSE
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis;
|
|||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
/** @deprecated */
|
||||
public class TestToken extends LuceneTestCase {
|
||||
|
||||
public TestToken(String name) {
|
||||
|
|
|
@ -22,12 +22,14 @@ import java.io.Reader;
|
|||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.SimpleAnalyzer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.Fieldable;
|
||||
|
@ -35,6 +37,7 @@ import org.apache.lucene.document.Field.Index;
|
|||
import org.apache.lucene.document.Field.Store;
|
||||
import org.apache.lucene.document.Field.TermVector;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
|
@ -138,33 +141,38 @@ public class TestDocumentWriter extends LuceneTestCase {
|
|||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new TokenFilter(new WhitespaceTokenizer(reader)) {
|
||||
boolean first=true;
|
||||
Token buffered;
|
||||
AttributeSource state;
|
||||
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
if (buffered != null) {
|
||||
Token nextToken = buffered;
|
||||
buffered=null;
|
||||
return nextToken;
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (state != null) {
|
||||
state.restoreState(this);
|
||||
payloadAtt.setPayload(null);
|
||||
posIncrAtt.setPositionIncrement(0);
|
||||
termAtt.setTermBuffer(new char[]{'b'}, 0, 1);
|
||||
state = null;
|
||||
return true;
|
||||
}
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken==null) return null;
|
||||
if (Character.isDigit(nextToken.termBuffer()[0])) {
|
||||
nextToken.setPositionIncrement(nextToken.termBuffer()[0] - '0');
|
||||
|
||||
boolean hasNext = input.incrementToken();
|
||||
if (!hasNext) return false;
|
||||
if (Character.isDigit(termAtt.termBuffer()[0])) {
|
||||
posIncrAtt.setPositionIncrement(termAtt.termBuffer()[0] - '0');
|
||||
}
|
||||
if (first) {
|
||||
// set payload on first position only
|
||||
nextToken.setPayload(new Payload(new byte[]{100}));
|
||||
payloadAtt.setPayload(new Payload(new byte[]{100}));
|
||||
first = false;
|
||||
}
|
||||
|
||||
// index a "synonym" for every token
|
||||
buffered = (Token)nextToken.clone();
|
||||
buffered.setPayload(null);
|
||||
buffered.setPositionIncrement(0);
|
||||
buffered.setTermBuffer(new char[]{'b'}, 0, 1);
|
||||
state = captureState();
|
||||
return true;
|
||||
|
||||
return nextToken;
|
||||
}
|
||||
|
||||
TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
|
||||
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
};
|
||||
}
|
||||
};
|
||||
|
@ -201,12 +209,14 @@ public class TestDocumentWriter extends LuceneTestCase {
|
|||
private String[] tokens = new String[] {"term1", "term2", "term3", "term2"};
|
||||
private int index = 0;
|
||||
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
private TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (index == tokens.length) {
|
||||
return null;
|
||||
return false;
|
||||
} else {
|
||||
return reusableToken.reinit(tokens[index++], 0, 0);
|
||||
termAtt.setTermBuffer(tokens[index++]);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -17,48 +17,48 @@ package org.apache.lucene.index;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.File;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.Arrays;
|
||||
import java.io.Reader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
|
||||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.SinkTokenizer;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.PhraseQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.search.PhraseQuery;
|
||||
import org.apache.lucene.store.AlreadyClosedException;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.AlreadyClosedException;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
import org.apache.lucene.store.MockRAMDirectory;
|
||||
import org.apache.lucene.store.LockFactory;
|
||||
import org.apache.lucene.store.Lock;
|
||||
import org.apache.lucene.store.LockFactory;
|
||||
import org.apache.lucene.store.MockRAMDirectory;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.store.SingleInstanceLockFactory;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
/**
|
||||
*
|
||||
|
@ -1793,11 +1793,11 @@ public class TestIndexWriter extends LuceneTestCase
|
|||
return new TokenFilter(new StandardTokenizer(reader)) {
|
||||
private int count = 0;
|
||||
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (count++ == 5) {
|
||||
throw new IOException();
|
||||
}
|
||||
return input.next(reusableToken);
|
||||
return input.incrementToken();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
@ -1916,10 +1916,10 @@ public class TestIndexWriter extends LuceneTestCase
|
|||
this.fieldName = fieldName;
|
||||
}
|
||||
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (this.fieldName.equals("crash") && count++ >= 4)
|
||||
throw new IOException("I'm experiencing problems");
|
||||
return input.next(reusableToken);
|
||||
return input.incrementToken();
|
||||
}
|
||||
|
||||
public void reset() throws IOException {
|
||||
|
@ -3577,21 +3577,47 @@ public class TestIndexWriter extends LuceneTestCase
|
|||
}
|
||||
}
|
||||
|
||||
private static class MyAnalyzer extends Analyzer {
|
||||
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream s = new WhitespaceTokenizer(reader);
|
||||
s.addAttribute(PositionIncrementAttribute.class);
|
||||
return s;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// LUCENE-1255
|
||||
public void testNegativePositions() throws Throwable {
|
||||
SinkTokenizer tokens = new SinkTokenizer();
|
||||
Token t = new Token();
|
||||
t.setTermBuffer("a");
|
||||
t.setPositionIncrement(0);
|
||||
tokens.add(t);
|
||||
t.setTermBuffer("b");
|
||||
t.setPositionIncrement(1);
|
||||
tokens.add(t);
|
||||
t.setTermBuffer("c");
|
||||
tokens.add(t);
|
||||
tokens.addAttribute(TermAttribute.class);
|
||||
tokens.addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
AttributeSource state = new AttributeSource();
|
||||
TermAttribute termAtt = (TermAttribute) state.addAttribute(TermAttribute.class);
|
||||
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class);
|
||||
termAtt.setTermBuffer("a");
|
||||
posIncrAtt.setPositionIncrement(0);
|
||||
tokens.add(state);
|
||||
|
||||
state = new AttributeSource();
|
||||
termAtt = (TermAttribute) state.addAttribute(TermAttribute.class);
|
||||
posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
termAtt.setTermBuffer("b");
|
||||
posIncrAtt.setPositionIncrement(1);
|
||||
tokens.add(state);
|
||||
|
||||
state = new AttributeSource();
|
||||
termAtt = (TermAttribute) state.addAttribute(TermAttribute.class);
|
||||
posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
termAtt.setTermBuffer("c");
|
||||
posIncrAtt.setPositionIncrement(1);
|
||||
tokens.add(state);
|
||||
|
||||
MockRAMDirectory dir = new MockRAMDirectory();
|
||||
IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
|
||||
IndexWriter w = new IndexWriter(dir, new MyAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("field", tokens));
|
||||
w.addDocument(doc);
|
||||
|
|
|
@ -20,19 +20,18 @@ package org.apache.lucene.index;
|
|||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.LowerCaseTokenizer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.Field.Index;
|
||||
import org.apache.lucene.document.Field.Store;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
/**
|
||||
* This testcase tests whether multi-level skipping is being used
|
||||
|
@ -99,17 +98,19 @@ public class TestMultiLevelSkipList extends LuceneTestCase {
|
|||
private static class PayloadFilter extends TokenFilter {
|
||||
static int count = 0;
|
||||
|
||||
PayloadAttribute payloadAtt;
|
||||
|
||||
protected PayloadFilter(TokenStream input) {
|
||||
super(input);
|
||||
payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
|
||||
}
|
||||
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken != null) {
|
||||
nextToken.setPayload(new Payload(new byte[] { (byte) count++ }));
|
||||
public boolean incrementToken() throws IOException {
|
||||
boolean hasNext = input.incrementToken();
|
||||
if (hasNext) {
|
||||
payloadAtt.setPayload(new Payload(new byte[] { (byte) count++ }));
|
||||
}
|
||||
return nextToken;
|
||||
return hasNext;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -27,20 +27,20 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
|
||||
|
||||
public class TestPayloads extends LuceneTestCase {
|
||||
|
@ -442,32 +442,33 @@ public class TestPayloads extends LuceneTestCase {
|
|||
private int length;
|
||||
private int offset;
|
||||
Payload payload = new Payload();
|
||||
PayloadAttribute payloadAtt;
|
||||
|
||||
public PayloadFilter(TokenStream in, byte[] data, int offset, int length) {
|
||||
super(in);
|
||||
this.data = data;
|
||||
this.length = length;
|
||||
this.offset = offset;
|
||||
payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
|
||||
}
|
||||
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken != null) {
|
||||
public boolean incrementToken() throws IOException {
|
||||
boolean hasNext = input.incrementToken();
|
||||
if (hasNext) {
|
||||
if (offset + length <= data.length) {
|
||||
Payload p = null;
|
||||
if (p == null) {
|
||||
p = new Payload();
|
||||
nextToken.setPayload(p);
|
||||
payloadAtt.setPayload(p);
|
||||
}
|
||||
p.setData(data, offset, length);
|
||||
offset += length;
|
||||
} else {
|
||||
nextToken.setPayload(null);
|
||||
payloadAtt.setPayload(null);
|
||||
}
|
||||
}
|
||||
|
||||
return nextToken;
|
||||
return hasNext;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -529,19 +530,25 @@ public class TestPayloads extends LuceneTestCase {
|
|||
private boolean first;
|
||||
private ByteArrayPool pool;
|
||||
private String term;
|
||||
|
||||
TermAttribute termAtt;
|
||||
PayloadAttribute payloadAtt;
|
||||
|
||||
PoolingPayloadTokenStream(ByteArrayPool pool) {
|
||||
this.pool = pool;
|
||||
payload = pool.get();
|
||||
generateRandomData(payload);
|
||||
term = pool.bytesToString(payload);
|
||||
first = true;
|
||||
payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
|
||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
}
|
||||
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
if (!first) return null;
|
||||
reusableToken.reinit(term, 0, 0);
|
||||
reusableToken.setPayload(new Payload(payload));
|
||||
return reusableToken;
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (!first) return false;
|
||||
termAtt.setTermBuffer(term);
|
||||
payloadAtt.setPayload(new Payload(payload));
|
||||
return true;
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
|
|
|
@ -17,14 +17,6 @@ package org.apache.lucene.index;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.store.MockRAMDirectory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
|
@ -32,6 +24,16 @@ import java.util.Iterator;
|
|||
import java.util.Map;
|
||||
import java.util.SortedSet;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.store.MockRAMDirectory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class TestTermVectorsReader extends LuceneTestCase {
|
||||
//Must be lexicographically sorted, will do in setup, versus trying to maintain here
|
||||
private String[] testFields = {"f1", "f2", "f3", "f4"};
|
||||
|
@ -118,17 +120,31 @@ public class TestTermVectorsReader extends LuceneTestCase {
|
|||
|
||||
private class MyTokenStream extends TokenStream {
|
||||
int tokenUpto;
|
||||
public Token next(final Token reusableToken) {
|
||||
|
||||
TermAttribute termAtt;
|
||||
PositionIncrementAttribute posIncrAtt;
|
||||
OffsetAttribute offsetAtt;
|
||||
|
||||
public MyTokenStream() {
|
||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||
}
|
||||
|
||||
public boolean incrementToken() {
|
||||
if (tokenUpto >= tokens.length)
|
||||
return null;
|
||||
return false;
|
||||
else {
|
||||
final TestToken testToken = tokens[tokenUpto++];
|
||||
reusableToken.reinit(testToken.text, testToken.startOffset, testToken.endOffset);
|
||||
if (tokenUpto > 1)
|
||||
reusableToken.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos);
|
||||
else
|
||||
reusableToken.setPositionIncrement(testToken.pos+1);
|
||||
return reusableToken;
|
||||
termAtt.setTermBuffer(testToken.text);
|
||||
offsetAtt.setStartOffset(testToken.startOffset);
|
||||
offsetAtt.setEndOffset(testToken.endOffset);
|
||||
if (tokenUpto > 1) {
|
||||
posIncrAtt.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos);
|
||||
} else {
|
||||
posIncrAtt.setPositionIncrement(testToken.pos+1);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,18 +17,18 @@ package org.apache.lucene.index;
|
|||
*/
|
||||
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.IOException;
|
||||
import java.util.Random;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
/**
|
||||
* @version $Id$
|
||||
|
@ -36,15 +36,21 @@ import java.util.Random;
|
|||
|
||||
class RepeatingTokenStream extends TokenStream {
|
||||
public int num;
|
||||
Token t;
|
||||
TermAttribute termAtt;
|
||||
String value;
|
||||
|
||||
public RepeatingTokenStream(String val) {
|
||||
t = new Token(0,val.length());
|
||||
t.setTermBuffer(val);
|
||||
this.value = val;
|
||||
this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
}
|
||||
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
return --num<0 ? null : (Token) t.clone();
|
||||
public boolean incrementToken() throws IOException {
|
||||
num--;
|
||||
if (num >= 0) {
|
||||
termAtt.setTermBuffer(value);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -17,17 +17,20 @@ package org.apache.lucene.queryParser;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
/**
|
||||
* Test QueryParser's ability to deal with Analyzers that return more
|
||||
|
@ -140,34 +143,49 @@ public class TestMultiAnalyzer extends LuceneTestCase {
|
|||
|
||||
private final class TestFilter extends TokenFilter {
|
||||
|
||||
private Token prevToken;
|
||||
private String prevType;
|
||||
private int prevStartOffset;
|
||||
private int prevEndOffset;
|
||||
|
||||
TermAttribute termAtt;
|
||||
PositionIncrementAttribute posIncrAtt;
|
||||
OffsetAttribute offsetAtt;
|
||||
TypeAttribute typeAtt;
|
||||
|
||||
public TestFilter(TokenStream in) {
|
||||
super(in);
|
||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
|
||||
}
|
||||
|
||||
public final Token next(final Token reusableToken) throws java.io.IOException {
|
||||
public final boolean incrementToken() throws java.io.IOException {
|
||||
if (multiToken > 0) {
|
||||
reusableToken.reinit("multi"+(multiToken+1), prevToken.startOffset(), prevToken.endOffset(), prevToken.type());
|
||||
reusableToken.setPositionIncrement(0);
|
||||
termAtt.setTermBuffer("multi"+(multiToken+1));
|
||||
offsetAtt.setStartOffset(prevStartOffset);
|
||||
offsetAtt.setEndOffset(prevEndOffset);
|
||||
typeAtt.setType(prevType);
|
||||
posIncrAtt.setPositionIncrement(0);
|
||||
multiToken--;
|
||||
return reusableToken;
|
||||
return true;
|
||||
} else {
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken == null) {
|
||||
prevToken = null;
|
||||
return null;
|
||||
boolean next = input.incrementToken();
|
||||
if (next == false) {
|
||||
return false;
|
||||
}
|
||||
prevToken = (Token) nextToken.clone();
|
||||
String text = nextToken.term();
|
||||
prevType = typeAtt.type();
|
||||
prevStartOffset = offsetAtt.startOffset();
|
||||
prevEndOffset = offsetAtt.endOffset();
|
||||
String text = termAtt.term();
|
||||
if (text.equals("triplemulti")) {
|
||||
multiToken = 2;
|
||||
return nextToken;
|
||||
return true;
|
||||
} else if (text.equals("multi")) {
|
||||
multiToken = 1;
|
||||
return nextToken;
|
||||
return true;
|
||||
} else {
|
||||
return nextToken;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -192,23 +210,28 @@ public class TestMultiAnalyzer extends LuceneTestCase {
|
|||
|
||||
private final class TestPosIncrementFilter extends TokenFilter {
|
||||
|
||||
TermAttribute termAtt;
|
||||
PositionIncrementAttribute posIncrAtt;
|
||||
|
||||
public TestPosIncrementFilter(TokenStream in) {
|
||||
super(in);
|
||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
}
|
||||
|
||||
public final Token next(final Token reusableToken) throws java.io.IOException {
|
||||
for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
|
||||
if (nextToken.term().equals("the")) {
|
||||
public final boolean incrementToken () throws java.io.IOException {
|
||||
while(input.incrementToken()) {
|
||||
if (termAtt.term().equals("the")) {
|
||||
// stopword, do nothing
|
||||
} else if (nextToken.term().equals("quick")) {
|
||||
nextToken.setPositionIncrement(2);
|
||||
return nextToken;
|
||||
} else if (termAtt.term().equals("quick")) {
|
||||
posIncrAtt.setPositionIncrement(2);
|
||||
return true;
|
||||
} else {
|
||||
nextToken.setPositionIncrement(1);
|
||||
return nextToken;
|
||||
posIncrAtt.setPositionIncrement(1);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -22,7 +22,6 @@ import java.util.HashMap;
|
|||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
|
|
@ -19,8 +19,8 @@ package org.apache.lucene.queryParser;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.text.DateFormat;
|
||||
import java.text.Collator;
|
||||
import java.text.DateFormat;
|
||||
import java.util.Calendar;
|
||||
import java.util.Date;
|
||||
import java.util.Locale;
|
||||
|
@ -31,11 +31,12 @@ import org.apache.lucene.analysis.LowerCaseTokenizer;
|
|||
import org.apache.lucene.analysis.SimpleAnalyzer;
|
||||
import org.apache.lucene.analysis.StopAnalyzer;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.document.DateField;
|
||||
import org.apache.lucene.document.DateTools;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
@ -64,36 +65,47 @@ public class TestQueryParser extends LuceneTestCase {
|
|||
public static Analyzer qpAnalyzer = new QPTestAnalyzer();
|
||||
|
||||
public static class QPTestFilter extends TokenFilter {
|
||||
TermAttribute termAtt;
|
||||
OffsetAttribute offsetAtt;
|
||||
|
||||
/**
|
||||
* Filter which discards the token 'stop' and which expands the
|
||||
* token 'phrase' into 'phrase1 phrase2'
|
||||
*/
|
||||
public QPTestFilter(TokenStream in) {
|
||||
super(in);
|
||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||
}
|
||||
|
||||
boolean inPhrase = false;
|
||||
int savedStart = 0, savedEnd = 0;
|
||||
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (inPhrase) {
|
||||
inPhrase = false;
|
||||
return reusableToken.reinit("phrase2", savedStart, savedEnd);
|
||||
termAtt.setTermBuffer("phrase2");
|
||||
offsetAtt.setStartOffset(savedStart);
|
||||
offsetAtt.setEndOffset(savedEnd);
|
||||
return true;
|
||||
} else
|
||||
for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
|
||||
if (nextToken.term().equals("phrase")) {
|
||||
while (input.incrementToken()) {
|
||||
if (termAtt.term().equals("phrase")) {
|
||||
inPhrase = true;
|
||||
savedStart = nextToken.startOffset();
|
||||
savedEnd = nextToken.endOffset();
|
||||
return nextToken.reinit("phrase1", savedStart, savedEnd);
|
||||
} else if (!nextToken.term().equals("stop"))
|
||||
return nextToken;
|
||||
savedStart = offsetAtt.startOffset();
|
||||
savedEnd = offsetAtt.endOffset();
|
||||
termAtt.setTermBuffer("phrase1");
|
||||
offsetAtt.setStartOffset(savedStart);
|
||||
offsetAtt.setEndOffset(savedEnd);
|
||||
return true;
|
||||
} else if (!termAtt.term().equals("stop"))
|
||||
return true;
|
||||
}
|
||||
return null;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static class QPTestAnalyzer extends Analyzer {
|
||||
|
||||
/** Filters LowerCaseTokenizer with StopFilter. */
|
||||
|
|
|
@ -17,14 +17,16 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
|
@ -49,14 +51,19 @@ public class TestPositionIncrement extends LuceneTestCase {
|
|||
private final int[] INCREMENTS = {1, 2, 1, 0, 1};
|
||||
private int i = 0;
|
||||
|
||||
public Token next(final Token reusableToken) {
|
||||
assert reusableToken != null;
|
||||
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||
|
||||
public boolean incrementToken() {
|
||||
if (i == TOKENS.length)
|
||||
return null;
|
||||
reusableToken.reinit(TOKENS[i], i, i);
|
||||
reusableToken.setPositionIncrement(INCREMENTS[i]);
|
||||
return false;
|
||||
termAtt.setTermBuffer(TOKENS[i]);
|
||||
offsetAtt.setStartOffset(i);
|
||||
offsetAtt.setEndOffset(i);
|
||||
posIncrAtt.setPositionIncrement(INCREMENTS[i]);
|
||||
i++;
|
||||
return reusableToken;
|
||||
return true;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
@ -196,18 +203,4 @@ public class TestPositionIncrement extends LuceneTestCase {
|
|||
StopFilter.setEnablePositionIncrementsDefault(dflt);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Basic analyzer behavior should be to keep sequential terms in one
|
||||
* increment from one another.
|
||||
*/
|
||||
public void testIncrementingPositions() throws Exception {
|
||||
Analyzer analyzer = new WhitespaceAnalyzer();
|
||||
TokenStream ts = analyzer.tokenStream("field",
|
||||
new StringReader("one two three four five"));
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
|
||||
assertEquals(nextToken.term(), 1, nextToken.getPositionIncrement());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,7 +26,7 @@ import org.apache.lucene.store.RAMDirectory;
|
|||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import java.io.IOException;
|
||||
|
@ -236,23 +236,25 @@ public class TestRangeQuery extends LuceneTestCase {
|
|||
private static class SingleCharTokenizer extends Tokenizer {
|
||||
char[] buffer = new char[1];
|
||||
boolean done;
|
||||
TermAttribute termAtt;
|
||||
|
||||
public SingleCharTokenizer(Reader r) {
|
||||
super(r);
|
||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
}
|
||||
|
||||
public final Token next(final Token reusableToken) throws IOException {
|
||||
public boolean incrementToken() throws IOException {
|
||||
int count = input.read(buffer);
|
||||
if (done)
|
||||
return null;
|
||||
return false;
|
||||
else {
|
||||
done = true;
|
||||
if (count == 1) {
|
||||
reusableToken.termBuffer()[0] = buffer[0];
|
||||
reusableToken.setTermLength(1);
|
||||
termAtt.termBuffer()[0] = buffer[0];
|
||||
termAtt.setTermLength(1);
|
||||
} else
|
||||
reusableToken.setTermLength(0);
|
||||
return reusableToken;
|
||||
termAtt.setTermLength(0);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ package org.apache.lucene.search.payloads;
|
|||
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.index.Payload;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
|
@ -41,34 +42,36 @@ public class PayloadHelper {
|
|||
public class PayloadFilter extends TokenFilter {
|
||||
String fieldName;
|
||||
int numSeen = 0;
|
||||
PayloadAttribute payloadAtt;
|
||||
|
||||
public PayloadFilter(TokenStream input, String fieldName) {
|
||||
super(input);
|
||||
this.fieldName = fieldName;
|
||||
payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
|
||||
}
|
||||
|
||||
public Token next() throws IOException {
|
||||
Token result = input.next();
|
||||
if (result != null) {
|
||||
public boolean incrementToken() throws IOException {
|
||||
|
||||
if (input.incrementToken()) {
|
||||
if (fieldName.equals(FIELD))
|
||||
{
|
||||
result.setPayload(new Payload(payloadField));
|
||||
payloadAtt.setPayload(new Payload(payloadField));
|
||||
}
|
||||
else if (fieldName.equals(MULTI_FIELD))
|
||||
{
|
||||
if (numSeen % 2 == 0)
|
||||
{
|
||||
result.setPayload(new Payload(payloadMultiField1));
|
||||
payloadAtt.setPayload(new Payload(payloadMultiField1));
|
||||
}
|
||||
else
|
||||
{
|
||||
result.setPayload(new Payload(payloadMultiField2));
|
||||
payloadAtt.setPayload(new Payload(payloadMultiField2));
|
||||
}
|
||||
numSeen++;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
return result;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -21,9 +21,9 @@ import java.io.Reader;
|
|||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.LowerCaseTokenizer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
|
@ -67,28 +67,31 @@ public class TestBoostingTermQuery extends LuceneTestCase {
|
|||
String fieldName;
|
||||
int numSeen = 0;
|
||||
|
||||
PayloadAttribute payloadAtt;
|
||||
|
||||
public PayloadFilter(TokenStream input, String fieldName) {
|
||||
super(input);
|
||||
this.fieldName = fieldName;
|
||||
payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
|
||||
}
|
||||
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken != null) {
|
||||
public boolean incrementToken() throws IOException {
|
||||
boolean hasNext = input.incrementToken();
|
||||
if (hasNext) {
|
||||
if (fieldName.equals("field")) {
|
||||
nextToken.setPayload(new Payload(payloadField));
|
||||
payloadAtt.setPayload(new Payload(payloadField));
|
||||
} else if (fieldName.equals("multiField")) {
|
||||
if (numSeen % 2 == 0) {
|
||||
nextToken.setPayload(new Payload(payloadMultiField1));
|
||||
payloadAtt.setPayload(new Payload(payloadMultiField1));
|
||||
} else {
|
||||
nextToken.setPayload(new Payload(payloadMultiField2));
|
||||
payloadAtt.setPayload(new Payload(payloadMultiField2));
|
||||
}
|
||||
numSeen++;
|
||||
}
|
||||
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
return nextToken;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -27,9 +27,11 @@ import junit.framework.TestCase;
|
|||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.LowerCaseTokenizer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
@ -43,8 +45,9 @@ import org.apache.lucene.search.TermQuery;
|
|||
import org.apache.lucene.search.payloads.PayloadHelper;
|
||||
import org.apache.lucene.search.payloads.PayloadSpanUtil;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class TestPayloadSpans extends TestCase {
|
||||
public class TestPayloadSpans extends LuceneTestCase {
|
||||
private final static boolean DEBUG = false;
|
||||
private IndexSearcher searcher;
|
||||
private Similarity similarity = new DefaultSimilarity();
|
||||
|
@ -54,7 +57,8 @@ public class TestPayloadSpans extends TestCase {
|
|||
super(s);
|
||||
}
|
||||
|
||||
protected void setUp() throws IOException {
|
||||
protected void setUp() throws Exception {
|
||||
super.setUp();
|
||||
PayloadHelper helper = new PayloadHelper();
|
||||
searcher = helper.setUp(similarity, 1000);
|
||||
indexReader = searcher.getIndexReader();
|
||||
|
@ -345,6 +349,9 @@ public class TestPayloadSpans extends TestCase {
|
|||
Set entities = new HashSet();
|
||||
Set nopayload = new HashSet();
|
||||
int pos;
|
||||
PayloadAttribute payloadAtt;
|
||||
TermAttribute termAtt;
|
||||
PositionIncrementAttribute posIncrAtt;
|
||||
|
||||
public PayloadFilter(TokenStream input, String fieldName) {
|
||||
super(input);
|
||||
|
@ -354,24 +361,26 @@ public class TestPayloadSpans extends TestCase {
|
|||
entities.add("one");
|
||||
nopayload.add("nopayload");
|
||||
nopayload.add("np");
|
||||
|
||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
|
||||
}
|
||||
|
||||
public Token next() throws IOException {
|
||||
Token result = input.next();
|
||||
if (result != null) {
|
||||
String token = new String(result.termBuffer(), 0, result.termLength());
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
String token = new String(termAtt.termBuffer(), 0, termAtt.termLength());
|
||||
|
||||
if (!nopayload.contains(token)) {
|
||||
if (entities.contains(token)) {
|
||||
result.setPayload(new Payload((token + ":Entity:"+ pos ).getBytes()));
|
||||
payloadAtt.setPayload(new Payload((token + ":Entity:"+ pos ).getBytes()));
|
||||
} else {
|
||||
result.setPayload(new Payload((token + ":Noise:" + pos ).getBytes()));
|
||||
payloadAtt.setPayload(new Payload((token + ":Noise:" + pos ).getBytes()));
|
||||
}
|
||||
}
|
||||
pos += result.getPositionIncrement();
|
||||
pos += posIncrAtt.getPositionIncrement();
|
||||
return true;
|
||||
}
|
||||
return result;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -17,6 +17,7 @@ package org.apache.lucene.util;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.index.ConcurrentMergeScheduler;
|
||||
import junit.framework.TestCase;
|
||||
|
||||
|
@ -42,6 +43,7 @@ public abstract class LuceneTestCase extends TestCase {
|
|||
|
||||
protected void setUp() throws Exception {
|
||||
ConcurrentMergeScheduler.setTestMode();
|
||||
TokenStream.setUseNewAPIDefault(true);
|
||||
}
|
||||
|
||||
protected void tearDown() throws Exception {
|
||||
|
|
Loading…
Reference in New Issue