mirror of https://github.com/apache/lucene.git
LUCENE-1801: All Tokenizers/TokenStreams that are source of tokens call AttributeSource.clearAttributes() first. Made Token.clear() consistent to AttributeImpl (clear everything)
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@804392 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e3db2b9906
commit
b16e0aa31b
11
CHANGES.txt
11
CHANGES.txt
|
@ -171,6 +171,13 @@ Changes in runtime behavior
|
|||
reusableTokenStream. This is now fixed, such that if
|
||||
reusableTokenStream is invoked on such a subclass, that method
|
||||
will forcefully fallback to tokenStream. (Mike McCandless)
|
||||
|
||||
12. LUCENE-1801: Token.clear() and Token.clearNoTermBuffer() now also clear
|
||||
startOffset, endOffset and type. This should normally affect no
|
||||
Tokenizer chains, as Tokenizers normally set these three values.
|
||||
This change was made to conform to the new AttributeImpl.clear() and
|
||||
AttributeSource.clearAttributes(), so that clearing works identically for Token as the one-for-all
|
||||
AttributeImpl and for the 6 separate AttributeImpls. (Uwe Schindler, Michael Busch)
|
||||
|
||||
API Changes
|
||||
|
||||
|
@ -468,6 +475,10 @@ Bug fixes
|
|||
22. LUCENE-1805: CloseableThreadLocal did not allow a null Object in get(),
|
||||
although it does allow it in set(Object). Fix get() to not assert the object
|
||||
is not null. (Shai Erera via Mike McCandless)
|
||||
|
||||
23. LUCENE-1801: Changed all Tokenizers or TokenStreams (in core/contrib)
|
||||
that are the source of Tokens to always call
|
||||
AttributeSource.clearAttributes() first. (Uwe Schindler)
|
||||
|
||||
New features
|
||||
|
||||
|
|
|
@ -123,6 +123,7 @@ public final class CJKTokenizer extends Tokenizer {
|
|||
*
|
||||
*/
|
||||
public boolean incrementToken() throws IOException {
|
||||
clearAttributes();
|
||||
/** how many characters have been stored in the buffer */
|
||||
|
||||
while(true) { // loop until we find a non-empty token
|
||||
|
|
|
@ -96,6 +96,7 @@ public final class ChineseTokenizer extends Tokenizer {
|
|||
}
|
||||
|
||||
public boolean incrementToken() throws IOException {
|
||||
clearAttributes();
|
||||
|
||||
length = 0;
|
||||
start = offset;
|
||||
|
|
|
@ -64,6 +64,7 @@ public class SingleTokenTokenStream extends TokenStream {
|
|||
|
||||
Token clone = (Token) singleToken.clone();
|
||||
|
||||
clearAttributes();
|
||||
termAtt.setTermBuffer(clone.termBuffer(), 0, clone.termLength());
|
||||
offsetAtt.setOffset(clone.startOffset(), clone.endOffset());
|
||||
flagsAtt.setFlags(clone.getFlags());
|
||||
|
|
|
@ -123,6 +123,7 @@ public class EdgeNGramTokenizer extends Tokenizer {
|
|||
|
||||
/** Advances to the next token in the stream; returns false at EOS. */
|
||||
public final boolean incrementToken() throws IOException {
|
||||
clearAttributes();
|
||||
// if we are just starting, read the whole input
|
||||
if (!started) {
|
||||
started = true;
|
||||
|
|
|
@ -72,6 +72,7 @@ public class NGramTokenizer extends Tokenizer {
|
|||
|
||||
/** Advances to the next token in the stream; returns false at EOS. */
|
||||
public final boolean incrementToken() throws IOException {
|
||||
clearAttributes();
|
||||
if (!started) {
|
||||
started = true;
|
||||
gramSize = minGram;
|
||||
|
|
|
@ -54,6 +54,7 @@ public final class SentenceTokenizer extends Tokenizer {
|
|||
}
|
||||
|
||||
public boolean incrementToken() throws IOException {
|
||||
clearAttributes();
|
||||
buffer.setLength(0);
|
||||
int ci;
|
||||
char ch, pch;
|
||||
|
|
|
@ -343,7 +343,7 @@ public class PatternAnalyzer extends Analyzer {
|
|||
|
||||
public final boolean incrementToken() {
|
||||
if (matcher == null) return false;
|
||||
|
||||
clearAttributes();
|
||||
while (true) { // loop takes care of leading and trailing boundary cases
|
||||
int start = pos;
|
||||
int end;
|
||||
|
@ -401,6 +401,7 @@ public class PatternAnalyzer extends Analyzer {
|
|||
}
|
||||
|
||||
public boolean incrementToken() {
|
||||
clearAttributes();
|
||||
// cache loop instance vars (performance)
|
||||
String s = str;
|
||||
int len = s.length();
|
||||
|
|
|
@ -184,6 +184,7 @@ public class WikipediaTokenizer extends Tokenizer {
|
|||
restoreState(state);
|
||||
return true;
|
||||
}
|
||||
clearAttributes();
|
||||
int tokenType = scanner.getNextToken();
|
||||
|
||||
if (tokenType == WikipediaTokenizerImpl.YYEOF) {
|
||||
|
|
|
@ -53,9 +53,9 @@ public abstract class CharTokenizer extends Tokenizer {
|
|||
}
|
||||
|
||||
public final boolean incrementToken() throws IOException {
|
||||
clearAttributes();
|
||||
int length = 0;
|
||||
int start = bufferIndex;
|
||||
termAtt.clear();
|
||||
char[] buffer = termAtt.termBuffer();
|
||||
while (true) {
|
||||
|
||||
|
|
|
@ -49,6 +49,7 @@ public class KeywordTokenizer extends Tokenizer {
|
|||
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if (!done) {
|
||||
clearAttributes();
|
||||
done = true;
|
||||
int upto = 0;
|
||||
char[] buffer = termAtt.termBuffer();
|
||||
|
|
|
@ -184,6 +184,7 @@ public final class NumericTokenStream extends TokenStream {
|
|||
if (shift >= valSize)
|
||||
return false;
|
||||
|
||||
clearAttributes();
|
||||
final char[] buffer;
|
||||
switch (valSize) {
|
||||
case 64:
|
||||
|
|
|
@ -117,7 +117,7 @@ import org.apache.lucene.util.AttributeImpl;
|
|||
</ul>
|
||||
A few things to note:
|
||||
<ul>
|
||||
<li>clear() initializes most of the fields to default values, but not startOffset, endOffset and type.</li>
|
||||
<li>clear() initializes all of the fields to default values. This is a change from Lucene 2.4, but it should affect no one.</li>
|
||||
<li>Because <code>TokenStreams</code> can be chained, one cannot assume that the <code>Token's</code> current type is correct.</li>
|
||||
<li>The startOffset and endOffset represent the start and end offsets in the source text. So be careful in adjusting them.</li>
|
||||
<li>When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.</li>
|
||||
|
@ -622,9 +622,9 @@ public class Token extends AttributeImpl
|
|||
return sb.toString();
|
||||
}
|
||||
|
||||
/** Resets the term text, payload, flags, and positionIncrement to default.
|
||||
* Other fields such as startOffset, endOffset and the token type are
|
||||
* not reset since they are normally overwritten by the tokenizer. */
|
||||
/** Resets the term text, payload, flags, positionIncrement,
|
||||
* startOffset, endOffset and token type to default.
|
||||
*/
|
||||
public void clear() {
|
||||
payload = null;
|
||||
// Leave termBuffer to allow re-use
|
||||
|
@ -632,8 +632,8 @@ public class Token extends AttributeImpl
|
|||
termText = null;
|
||||
positionIncrement = 1;
|
||||
flags = 0;
|
||||
// startOffset = endOffset = 0;
|
||||
// type = DEFAULT_TYPE;
|
||||
startOffset = endOffset = 0;
|
||||
type = DEFAULT_TYPE;
|
||||
}
|
||||
|
||||
public Object clone() {
|
||||
|
@ -715,6 +715,8 @@ public class Token extends AttributeImpl
|
|||
payload = null;
|
||||
positionIncrement = 1;
|
||||
flags = 0;
|
||||
startOffset = endOffset = 0;
|
||||
type = DEFAULT_TYPE;
|
||||
}
|
||||
|
||||
/** Shorthand for calling {@link #clear},
|
||||
|
|
|
@ -122,6 +122,7 @@ final class TokenWrapper extends AttributeImpl
|
|||
}
|
||||
|
||||
// PayloadAttribute
|
||||
|
||||
public Payload getPayload() {
|
||||
return delegate.getPayload();
|
||||
}
|
||||
|
@ -130,14 +131,12 @@ final class TokenWrapper extends AttributeImpl
|
|||
delegate.setPayload(payload);
|
||||
}
|
||||
|
||||
// TokenAttribute
|
||||
|
||||
// AttributeImpl
|
||||
|
||||
public void clear() {
|
||||
delegate.clear();
|
||||
}
|
||||
|
||||
// AttributeImpl
|
||||
|
||||
public String toString() {
|
||||
return delegate.toString();
|
||||
}
|
||||
|
|
|
@ -26,12 +26,16 @@ import java.io.IOException;
|
|||
<p>
|
||||
This is an abstract class.
|
||||
<p>
|
||||
NOTE: To use the old API subclasses must override {@link #next(Token)}.
|
||||
It's also OK to instead override {@link #next()} but that
|
||||
method is slower compared to {@link #next(Token)}.
|
||||
NOTE: subclasses must override
|
||||
{@link #incrementToken()} if the new TokenStream API is used
|
||||
and {@link #next(Token)} or {@link #next()} if the old
|
||||
TokenStream API is used.
|
||||
<p>
|
||||
NOTE: subclasses overriding {@link #next(Token)} must
|
||||
call {@link Token#clear()}.
|
||||
NOTE: Subclasses overriding {@link #incrementToken()} must
|
||||
call {@link AttributeSource#clearAttributes()} before
|
||||
setting attributes.
|
||||
Subclasses overriding {@link #next(Token)} must call
|
||||
{@link Token#clear()} before setting Token attributes.
|
||||
*/
|
||||
|
||||
public abstract class Tokenizer extends TokenStream {
|
||||
|
@ -85,6 +89,9 @@ public abstract class Tokenizer extends TokenStream {
|
|||
this.input = CharReader.get(input);
|
||||
}
|
||||
|
||||
/** Expert: Reset the tokenizer to a new CharStream. Typically, an
|
||||
* analyzer (in its reusableTokenStream method) will use
|
||||
* this to re-use a previously created tokenizer. */
|
||||
public void reset(CharStream input) throws IOException {
|
||||
this.input = input;
|
||||
}
|
||||
|
|
|
@ -148,6 +148,7 @@ public class StandardTokenizer extends Tokenizer {
|
|||
* @see org.apache.lucene.analysis.TokenStream#next()
|
||||
*/
|
||||
public final boolean incrementToken() throws IOException {
|
||||
clearAttributes();
|
||||
int posIncr = 1;
|
||||
|
||||
while(true) {
|
||||
|
|
|
@ -21,5 +21,4 @@ package org.apache.lucene.util;
|
|||
* Base interface for attributes.
|
||||
*/
|
||||
public interface Attribute {
|
||||
public void clear();
|
||||
}
|
||||
|
|
|
@ -30,8 +30,9 @@ import java.lang.reflect.Modifier;
|
|||
*/
|
||||
public abstract class AttributeImpl implements Cloneable, Serializable {
|
||||
/**
|
||||
* Clears the values in this Attribute and resets it to its
|
||||
* default value.
|
||||
* Clears the values in this AttributeImpl and resets it to its
|
||||
* default value. If this implementation implements more than one Attribute interface
|
||||
* it clears all.
|
||||
*/
|
||||
public abstract void clear();
|
||||
|
||||
|
|
Loading…
Reference in New Issue