LUCENE-1801: All Tokenizers/TokenStreams that are source of tokens call AttributeSource.clearAttributes() first. Made Token.clear() consistent to AttributeImpl (clear everything)

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@804392 13f79535-47bb-0310-9956-ffa450edef68
2009-08-14 22:01:42 +00:00 · 2009-08-14 22:01:42 +00:00 · b16e0aa31b
parent e3db2b9906
commit b16e0aa31b
18 changed files with 50 additions and 20 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -171,6 +171,13 @@ Changes in runtime behavior
    reusableTokenStream.  This is now fixed, such that if
    reusableTokenStream is invoked on such a subclass, that method
    will forcefully fallback to tokenStream.  (Mike McCandless)
+    
+12. LUCENE-1801: Token.clear() and Token.clearNoTermBuffer() now also clear
+    startOffset, endOffset and type. This should normally affect no
+    Tokenizer chains, as Tokenizers normally always set these three values.
+    This change was made to be conform to the new AttributeImpl.clear() and
+    AttributeSource.clearAttributes() to work identical for Token as one for all
+    AttributeImpl and the 6 separate AttributeImpls. (Uwe Schindler, Michael Busch)

 API Changes

@ -468,6 +475,10 @@ Bug fixes
 22. LUCENE-1805: CloseableThreadLocal did not allow a null Object in get(), 
    although it does allow it in set(Object). Fix get() to not assert the object
    is not null. (Shai Erera via Mike McCandless)
+    
+23. LUCENE-1801: Changed all Tokenizers or TokenStreams in core/contrib)
+    that are the source of Tokens to always call
+    AttributeSource.clearAttributes() first. (Uwe Schindler)

 New features

--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
@ -123,6 +123,7 @@ public final class CJKTokenizer extends Tokenizer {
     *
     */
    public boolean incrementToken() throws IOException {
+        clearAttributes();
        /** how many character(s) has been stored in buffer */

        while(true) { // loop until we find a non-empty token
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
@ -96,6 +96,7 @@ public final class ChineseTokenizer extends Tokenizer {
    }

    public boolean incrementToken() throws IOException {
+        clearAttributes();

        length = 0;
        start = offset;
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java
@ -64,6 +64,7 @@ public class SingleTokenTokenStream extends TokenStream {
    
    Token clone = (Token) singleToken.clone();
    
+    clearAttributes();
    termAtt.setTermBuffer(clone.termBuffer(), 0, clone.termLength());
    offsetAtt.setOffset(clone.startOffset(), clone.endOffset());
    flagsAtt.setFlags(clone.getFlags());
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
@ -123,6 +123,7 @@ public class EdgeNGramTokenizer extends Tokenizer {

  /** Returns the next token in the stream, or null at EOS. */
  public final boolean incrementToken() throws IOException {
+    clearAttributes();
    // if we are just starting, read the whole input
    if (!started) {
      started = true;
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
@ -72,6 +72,7 @@ public class NGramTokenizer extends Tokenizer {

  /** Returns the next token in the stream, or null at EOS. */
  public final boolean incrementToken() throws IOException {
+    clearAttributes();
    if (!started) {
      started = true;
      gramSize = minGram;
--- a/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
+++ b/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
@ -54,6 +54,7 @@ public final class SentenceTokenizer extends Tokenizer {
  }

  public boolean incrementToken() throws IOException {
+    clearAttributes();
    buffer.setLength(0);
    int ci;
    char ch, pch;
--- a/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java
+++ b/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java
@ -343,7 +343,7 @@ public class PatternAnalyzer extends Analyzer {

    public final boolean incrementToken() {
      if (matcher == null) return false;
-      
+      clearAttributes();
      while (true) { // loop takes care of leading and trailing boundary cases
        int start = pos;
        int end;
@ -401,6 +401,7 @@ public class PatternAnalyzer extends Analyzer {
    }

    public boolean incrementToken() {
+      clearAttributes();
      // cache loop instance vars (performance)
      String s = str;
      int len = s.length();
--- a/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
+++ b/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
@ -184,6 +184,7 @@ public class WikipediaTokenizer extends Tokenizer {
      restoreState(state);
      return true;
    }
+    clearAttributes();
    int tokenType = scanner.getNextToken();

    if (tokenType == WikipediaTokenizerImpl.YYEOF) {
--- a/src/java/org/apache/lucene/analysis/CharTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/CharTokenizer.java
@ -53,9 +53,9 @@ public abstract class CharTokenizer extends Tokenizer {
  }

  public final boolean incrementToken() throws IOException {
+    clearAttributes();
    int length = 0;
    int start = bufferIndex;
-    termAtt.clear();
    char[] buffer = termAtt.termBuffer();
    while (true) {

--- a/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
@ -49,6 +49,7 @@ public class KeywordTokenizer extends Tokenizer {
  
  public final boolean incrementToken() throws IOException {
    if (!done) {
+      clearAttributes();
      done = true;
      int upto = 0;
      char[] buffer = termAtt.termBuffer();
--- a/src/java/org/apache/lucene/analysis/NumericTokenStream.java
+++ b/src/java/org/apache/lucene/analysis/NumericTokenStream.java
@ -184,6 +184,7 @@ public final class NumericTokenStream extends TokenStream {
    if (shift >= valSize)
      return false;

+    clearAttributes();
    final char[] buffer;
    switch (valSize) {
      case 64:
--- a/src/java/org/apache/lucene/analysis/Token.java
+++ b/src/java/org/apache/lucene/analysis/Token.java
@ -117,7 +117,7 @@ import org.apache.lucene.util.AttributeImpl;
  </ul>
  A few things to note:
  <ul>
-  <li>clear() initializes most of the fields to default values, but not startOffset, endOffset and type.</li>
+  <li>clear() initializes all of the fields to default values. This was changed in contrast to Lucene 2.4, but should affect no one.</li>
  <li>Because <code>TokenStreams</code> can be chained, one cannot assume that the <code>Token's</code> current type is correct.</li>
  <li>The startOffset and endOffset represent the start and offset in the source text. So be careful in adjusting them.</li>
  <li>When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.</li>
@ -622,9 +622,9 @@ public class Token extends AttributeImpl
    return sb.toString();
  }

-  /** Resets the term text, payload, flags, and positionIncrement to default.
-   * Other fields such as startOffset, endOffset and the token type are
-   * not reset since they are normally overwritten by the tokenizer. */
+  /** Resets the term text, payload, flags, and positionIncrement,
+   * startOffset, endOffset and token type to default.
+   */
  public void clear() {
    payload = null;
    // Leave termBuffer to allow re-use
@ -632,8 +632,8 @@ public class Token extends AttributeImpl
    termText = null;
    positionIncrement = 1;
    flags = 0;
-    // startOffset = endOffset = 0;
-    // type = DEFAULT_TYPE;
+    startOffset = endOffset = 0;
+    type = DEFAULT_TYPE;
  }

  public Object clone() {
@ -715,6 +715,8 @@ public class Token extends AttributeImpl
    payload = null;
    positionIncrement = 1;
    flags = 0;
+    startOffset = endOffset = 0;
+    type = DEFAULT_TYPE;
  }

  /** Shorthand for calling {@link #clear},
--- a/src/java/org/apache/lucene/analysis/TokenWrapper.java
+++ b/src/java/org/apache/lucene/analysis/TokenWrapper.java
@ -122,6 +122,7 @@ final class TokenWrapper extends AttributeImpl
  }
  
  // PayloadAttribute
+  
  public Payload getPayload() {
    return delegate.getPayload();
  }
@ -130,14 +131,12 @@ final class TokenWrapper extends AttributeImpl
    delegate.setPayload(payload);
  }
  
-  // TokenAttribute
-  
+  // AttributeImpl
+
  public void clear() {
    delegate.clear();
  }

-  // AttributeImpl
-
  public String toString() {
    return delegate.toString();
  }
--- a/src/java/org/apache/lucene/analysis/Tokenizer.java
+++ b/src/java/org/apache/lucene/analysis/Tokenizer.java
@ -26,12 +26,16 @@ import java.io.IOException;
  <p>
  This is an abstract class.
  <p>
-  NOTE: To use the old API subclasses must override {@link #next(Token)}.
-  It's also OK to instead override {@link #next()} but that
-  method is slower compared to {@link #next(Token)}.
+  NOTE: subclasses must override 
+  {@link #incrementToken()} if the new TokenStream API is used
+  and {@link #next(Token)} or {@link #next()} if the old
+  TokenStream API is used.
  <p>
-  NOTE: subclasses overriding {@link #next(Token)} must  
-  call {@link Token#clear()}.
+  NOTE: Subclasses overriding {@link #incrementToken()} must
+  call {@link AttributeSource#clearAttributes()} before
+  setting attributes.
+  Subclasses overriding {@link #next(Token)} must call
+  {@link Token#clear()} before setting Token attributes. 
 */

 public abstract class Tokenizer extends TokenStream {
@ -85,6 +89,9 @@ public abstract class Tokenizer extends TokenStream {
    this.input = CharReader.get(input);
  }

+  /** Expert: Reset the tokenizer to a new CharStream.  Typically, an
+   *  analyzer (in its reusableTokenStream method) will use
+   *  this to re-use a previously created tokenizer. */
  public void reset(CharStream input) throws IOException {
    this.input = input;
  }
--- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
@ -148,6 +148,7 @@ public class StandardTokenizer extends Tokenizer {
   * @see org.apache.lucene.analysis.TokenStream#next()
   */
  public final boolean incrementToken() throws IOException {
+    clearAttributes();
    int posIncr = 1;

    while(true) {
--- a/src/java/org/apache/lucene/util/Attribute.java
+++ b/src/java/org/apache/lucene/util/Attribute.java
@ -21,5 +21,4 @@ package org.apache.lucene.util;
 * Base interface for attributes.
 */
 public interface Attribute {
-  public void clear();
 }
--- a/src/java/org/apache/lucene/util/AttributeImpl.java
+++ b/src/java/org/apache/lucene/util/AttributeImpl.java
@ -30,8 +30,9 @@ import java.lang.reflect.Modifier;
 */
 public abstract class AttributeImpl implements Cloneable, Serializable {  
  /**
-   * Clears the values in this Attribute and resets it to its 
-   * default value.
+   * Clears the values in this AttributeImpl and resets it to its 
+   * default value. If this implementation implements more than one Attribute interface
+   * it clears all.
   */
  public abstract void clear();