LUCENE-1801: All Tokenizers/TokenStreams that are a source of tokens now call AttributeSource.clearAttributes() first. Made Token.clear() consistent with AttributeImpl (clear everything)

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@804392 13f79535-47bb-0310-9956-ffa450edef68
Uwe Schindler 2009-08-14 22:01:42 +00:00
parent e3db2b9906
commit b16e0aa31b
18 changed files with 50 additions and 20 deletions

View File

@ -171,6 +171,13 @@ Changes in runtime behavior
reusableTokenStream. This is now fixed, such that if
reusableTokenStream is invoked on such a subclass, that method
will forcefully fallback to tokenStream. (Mike McCandless)
12. LUCENE-1801: Token.clear() and Token.clearNoTermBuffer() now also clear
startOffset, endOffset and type. This should normally affect no
Tokenizer chains, as Tokenizers always set these three values anyway.
The change was made so that Token conforms to the new AttributeImpl.clear()
and AttributeSource.clearAttributes(), which work identically whether Token
serves as the single implementation of all attributes or the 6 separate
AttributeImpls are used. (Uwe Schindler, Michael Busch)
API Changes
@ -468,6 +475,10 @@ Bug fixes
22. LUCENE-1805: CloseableThreadLocal did not allow a null Object in get(),
although it does allow it in set(Object). Fix get() to not assert the object
is not null. (Shai Erera via Mike McCandless)
23. LUCENE-1801: Changed all Tokenizers and TokenStreams in core and contrib
that are the source of Tokens to always call
AttributeSource.clearAttributes() first. (Uwe Schindler)
New features

View File

@ -123,6 +123,7 @@ public final class CJKTokenizer extends Tokenizer {
*
*/
public boolean incrementToken() throws IOException {
clearAttributes();
/** how many characters have been stored in the buffer */
while(true) { // loop until we find a non-empty token
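
The hunk above is the first of many identical fixes in this commit: each token source now calls clearAttributes() at the top of incrementToken(), before setting any attribute, so stale values from the previous token cannot leak through the chain. A minimal sketch of the pattern against the 2.9 attribute API; the tokenizer class itself is invented for illustration, not part of this commit:

```java
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

// Hypothetical tokenizer showing the contract this commit enforces:
// clearAttributes() runs before any attribute of the new token is set.
public final class WholeInputTokenizer extends Tokenizer {
  private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  private boolean done = false;

  public WholeInputTokenizer(Reader input) {
    super(input);
  }

  public boolean incrementToken() throws IOException {
    if (done) return false;
    done = true;
    clearAttributes(); // wipe values left over from the previous token
    char[] buffer = new char[256];
    int length = input.read(buffer); // sketch only: assumes input fits one buffer
    if (length <= 0) return false;
    termAtt.setTermBuffer(buffer, 0, length);
    offsetAtt.setOffset(input.correctOffset(0), input.correctOffset(length));
    return true;
  }
}
```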

View File

@ -96,6 +96,7 @@ public final class ChineseTokenizer extends Tokenizer {
}
public boolean incrementToken() throws IOException {
clearAttributes();
length = 0;
start = offset;

View File

@ -64,6 +64,7 @@ public class SingleTokenTokenStream extends TokenStream {
Token clone = (Token) singleToken.clone();
clearAttributes();
termAtt.setTermBuffer(clone.termBuffer(), 0, clone.termLength());
offsetAtt.setOffset(clone.startOffset(), clone.endOffset());
flagsAtt.setFlags(clone.getFlags());

View File

@ -123,6 +123,7 @@ public class EdgeNGramTokenizer extends Tokenizer {
/** Returns the next token in the stream, or null at EOS. */
public final boolean incrementToken() throws IOException {
clearAttributes();
// if we are just starting, read the whole input
if (!started) {
started = true;

View File

@ -72,6 +72,7 @@ public class NGramTokenizer extends Tokenizer {
/** Returns the next token in the stream, or null at EOS. */
public final boolean incrementToken() throws IOException {
clearAttributes();
if (!started) {
started = true;
gramSize = minGram;

View File

@ -54,6 +54,7 @@ public final class SentenceTokenizer extends Tokenizer {
}
public boolean incrementToken() throws IOException {
clearAttributes();
buffer.setLength(0);
int ci;
char ch, pch;

View File

@ -343,7 +343,7 @@ public class PatternAnalyzer extends Analyzer {
public final boolean incrementToken() {
if (matcher == null) return false;
clearAttributes();
while (true) { // loop takes care of leading and trailing boundary cases
int start = pos;
int end;
@ -401,6 +401,7 @@ public class PatternAnalyzer extends Analyzer {
}
public boolean incrementToken() {
clearAttributes();
// cache loop instance vars (performance)
String s = str;
int len = s.length();

View File

@ -184,6 +184,7 @@ public class WikipediaTokenizer extends Tokenizer {
restoreState(state);
return true;
}
clearAttributes();
int tokenType = scanner.getNextToken();
if (tokenType == WikipediaTokenizerImpl.YYEOF) {

View File

@ -53,9 +53,9 @@ public abstract class CharTokenizer extends Tokenizer {
}
public final boolean incrementToken() throws IOException {
clearAttributes();
int length = 0;
int start = bufferIndex;
termAtt.clear();
char[] buffer = termAtt.termBuffer();
while (true) {

View File

@ -49,6 +49,7 @@ public class KeywordTokenizer extends Tokenizer {
public final boolean incrementToken() throws IOException {
if (!done) {
clearAttributes();
done = true;
int upto = 0;
char[] buffer = termAtt.termBuffer();

View File

@ -184,6 +184,7 @@ public final class NumericTokenStream extends TokenStream {
if (shift >= valSize)
return false;
clearAttributes();
final char[] buffer;
switch (valSize) {
case 64:
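
Note that clearAttributes() is not always the literal first statement: NumericTokenStream above returns false before clearing once shift passes valSize, KeywordTokenizer clears only inside its if (!done) branch, and WikipediaTokenizer skips clearing when it replays a captured state via restoreState(). The rule is to clear exactly when a fresh token is about to be produced. A sketch of that shape; the stream class and its fields are invented:

```java
import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;

// Invented stream: emits one fresh token, then replays it once from a
// captured state, mirroring the WikipediaTokenizer pattern above.
public final class ReplayOnceStream extends TokenStream {
  private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  private AttributeSource.State pending = null;
  private boolean emitted = false;

  public boolean incrementToken() throws IOException {
    if (pending != null) {
      restoreState(pending);  // replayed token carries its own values,
      pending = null;         // so clearAttributes() must NOT run here
      return true;
    }
    if (emitted) {
      return false;           // exhausted: no token, attributes untouched
    }
    clearAttributes();        // fresh token: start from defaults
    termAtt.setTermBuffer("token");
    pending = captureState(); // keep a copy to replay once
    emitted = true;
    return true;
  }
}
```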

View File

@ -117,7 +117,7 @@ import org.apache.lucene.util.AttributeImpl;
</ul>
A few things to note:
<ul>
<li>clear() initializes most of the fields to default values, but not startOffset, endOffset and type.</li>
<li>clear() initializes all of the fields to default values. This is a change from Lucene 2.4, but should affect no one.</li>
<li>Because <code>TokenStreams</code> can be chained, one cannot assume that the <code>Token's</code> current type is correct.</li>
<li>The startOffset and endOffset represent the start and end character offsets in the source text. So be careful when adjusting them.</li>
<li>When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.</li>
@ -622,9 +622,9 @@ public class Token extends AttributeImpl
return sb.toString();
}
/** Resets the term text, payload, flags, and positionIncrement to default.
* Other fields such as startOffset, endOffset and the token type are
* not reset since they are normally overwritten by the tokenizer. */
/** Resets the term text, payload, flags, positionIncrement,
* startOffset, endOffset and token type to default.
*/
public void clear() {
payload = null;
// Leave termBuffer to allow re-use
@ -632,8 +632,8 @@ public class Token extends AttributeImpl
termText = null;
positionIncrement = 1;
flags = 0;
// startOffset = endOffset = 0;
// type = DEFAULT_TYPE;
startOffset = endOffset = 0;
type = DEFAULT_TYPE;
}
public Object clone() {
@ -715,6 +715,8 @@ public class Token extends AttributeImpl
payload = null;
positionIncrement = 1;
flags = 0;
startOffset = endOffset = 0;
type = DEFAULT_TYPE;
}
/** Shorthand for calling {@link #clear},
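
Taken together, the two hunks above make clear() and clearNoTermBuffer() reset the offsets and the type as well. A small invented demo of the observable difference, using only the public Token API:

```java
import org.apache.lucene.analysis.Token;

// Invented demo of the behavior change in this hunk.
public class TokenClearDemo {
  public static void main(String[] args) {
    Token t = new Token("text", 5, 9, "<ALPHANUM>");
    t.clear();
    // Before this commit: offsets stayed 5/9 and type stayed "<ALPHANUM>".
    // After it, everything is back at defaults:
    System.out.println(t.startOffset()); // 0
    System.out.println(t.endOffset());   // 0
    System.out.println(t.type());        // "word" (Token.DEFAULT_TYPE)
  }
}
```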

View File

@ -122,6 +122,7 @@ final class TokenWrapper extends AttributeImpl
}
// PayloadAttribute
public Payload getPayload() {
return delegate.getPayload();
}
@ -130,14 +131,12 @@ final class TokenWrapper extends AttributeImpl
delegate.setPayload(payload);
}
// TokenAttribute
// AttributeImpl
public void clear() {
delegate.clear();
}
// AttributeImpl
public String toString() {
return delegate.toString();
}

View File

@ -26,12 +26,16 @@ import java.io.IOException;
<p>
This is an abstract class.
<p>
NOTE: To use the old API subclasses must override {@link #next(Token)}.
It's also OK to instead override {@link #next()} but that
method is slower compared to {@link #next(Token)}.
NOTE: subclasses must override
{@link #incrementToken()} if the new TokenStream API is used
and {@link #next(Token)} or {@link #next()} if the old
TokenStream API is used.
<p>
NOTE: subclasses overriding {@link #next(Token)} must
call {@link Token#clear()}.
NOTE: Subclasses overriding {@link #incrementToken()} must
call {@link AttributeSource#clearAttributes()} before
setting attributes.
Subclasses overriding {@link #next(Token)} must call
{@link Token#clear()} before setting Token attributes.
*/
public abstract class Tokenizer extends TokenStream {
@ -85,6 +89,9 @@ public abstract class Tokenizer extends TokenStream {
this.input = CharReader.get(input);
}
/** Expert: Reset the tokenizer to a new CharStream. Typically, an
* analyzer (in its reusableTokenStream method) will use
* this to re-use a previously created tokenizer. */
public void reset(CharStream input) throws IOException {
this.input = input;
}
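
The reset(CharStream) javadoc above describes the reuse pattern without showing it. Below is a sketch of a subclass honoring both contracts: clearAttributes() before producing a token, and re-arming internal state on reset. The tokenizer class and its single-token behavior are invented for illustration:

```java
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

// Hypothetical subclass showing the reuse contract: reset(CharStream)
// swaps in the new input, and the subclass re-arms its own state.
public class OneTokenTokenizer extends Tokenizer {
  private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  private boolean done = false;

  public OneTokenTokenizer(Reader input) {
    super(input);
  }

  public boolean incrementToken() throws IOException {
    if (done) return false;
    done = true;
    clearAttributes();            // per the new contract
    termAtt.setTermBuffer("doc"); // placeholder token
    return true;
  }

  public void reset(CharStream input) throws IOException {
    super.reset(input); // points this.input at the new stream
    done = false;       // re-arm for the next document
  }
}
```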

View File

@ -148,6 +148,7 @@ public class StandardTokenizer extends Tokenizer {
* @see org.apache.lucene.analysis.TokenStream#next()
*/
public final boolean incrementToken() throws IOException {
clearAttributes();
int posIncr = 1;
while(true) {

View File

@ -21,5 +21,4 @@ package org.apache.lucene.util;
* Base interface for attributes.
*/
public interface Attribute {
public void clear();
}

View File

@ -30,8 +30,9 @@ import java.lang.reflect.Modifier;
*/
public abstract class AttributeImpl implements Cloneable, Serializable {
/**
* Clears the values in this Attribute and resets it to its
* default value.
* Clears the values in this AttributeImpl and resets it to its
* default values. If this implementation implements more than one
* Attribute interface, it clears all of them.
*/
public abstract void clear();
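
Token itself is the motivating case for that last sentence: it is a single AttributeImpl implementing TermAttribute, OffsetAttribute, TypeAttribute and the rest, so its clear() must reset all of them, which is exactly what the Token.java hunk above now does. A minimal sketch of the same contract for a custom attribute; both interfaces and the implementation are invented:

```java
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;

// Invented attribute interfaces, solely for illustration.
interface ColorAttribute extends Attribute {
  void setColor(String color);
  String getColor();
}

interface WeightAttribute extends Attribute {
  void setWeight(float weight);
  float getWeight();
}

// One implementation backing both interfaces: clear() must reset
// every value, not just some of them.
public class ColorWeightAttributeImpl extends AttributeImpl
    implements ColorAttribute, WeightAttribute {
  private String color = null;
  private float weight = 1.0f;

  public void setColor(String color) { this.color = color; }
  public String getColor() { return color; }
  public void setWeight(float weight) { this.weight = weight; }
  public float getWeight() { return weight; }

  public void clear() {
    color = null;   // reset *all* values to defaults,
    weight = 1.0f;  // mirroring Token.clear() after this commit
  }

  public boolean equals(Object other) {
    if (this == other) return true;
    if (!(other instanceof ColorWeightAttributeImpl)) return false;
    ColorWeightAttributeImpl o = (ColorWeightAttributeImpl) other;
    return weight == o.weight
        && (color == null ? o.color == null : color.equals(o.color));
  }

  public int hashCode() {
    return (color == null ? 0 : color.hashCode()) * 31
        + Float.floatToIntBits(weight);
  }

  public void copyTo(AttributeImpl target) {
    ColorWeightAttributeImpl t = (ColorWeightAttributeImpl) target;
    t.color = color;
    t.weight = weight;
  }
}
```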