mirror of https://github.com/apache/lucene.git
LUCENE-1057: call clear when reusing token, change clear to only reset essential fields, re-add Token.clone()
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@596398 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 31f50ad41d
commit e9a5866777
@@ -361,14 +361,29 @@ public class Token implements Cloneable {
     return sb.toString();
   }
 
-  /** Reset all state for this token back to defaults. */
+  /** Resets the term text, payload, and positionIncrement to default.
+   * Other fields such as startOffset, endOffset and the token type are
+   * not reset since they are normally overwritten by the tokenizer. */
   public void clear() {
     payload = null;
     // Leave termBuffer to allow re-use
     termLength = 0;
     termText = null;
     positionIncrement = 1;
-    startOffset = endOffset = 0;
-    type = DEFAULT_TYPE;
+    // startOffset = endOffset = 0;
+    // type = DEFAULT_TYPE;
+  }
+
+  public Object clone() {
+    try {
+      Token t = (Token)super.clone();
+      if (termBuffer != null) {
+        t.termBuffer = null;
+        t.setTermBuffer(termBuffer, 0, termLength);
+      }
+      return t;
+    } catch (CloneNotSupportedException e) {
+      throw new RuntimeException(e); // shouldn't happen
+    }
   }
 }
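For reference, the contract the hunk above establishes: clear() now resets only the fields a tokenizer would not overwrite anyway (term text/length, payload, positionIncrement), while the re-added clone() deep-copies the termBuffer so a cloned token survives later reuse of the original. A minimal usage sketch, hypothetical and not part of this commit, assuming the 2.3-era Token API used elsewhere in this diff (setTermBuffer, setStartOffset, clear, clone):

    import org.apache.lucene.analysis.Token;

    public class TokenClearCloneSketch {
      public static void main(String[] args) {
        Token t = new Token();  // assumes the no-arg reuse constructor
        t.setTermBuffer(new char[] {'a', 'b', 'c'}, 0, 3);
        t.setStartOffset(10);
        t.setEndOffset(13);
        t.setPositionIncrement(2);

        // clone() deep-copies the term buffer, so the copy is unaffected
        // when the original is recycled below.
        Token copy = (Token) t.clone();

        t.clear();
        System.out.println(t.getPositionIncrement()); // 1: reset to default
        System.out.println(t.startOffset());          // 10: offsets are NOT reset
        System.out.println(new String(copy.termBuffer(), 0, copy.termLength())); // "abc"
      }
    }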
@@ -1281,6 +1281,7 @@ final class DocumentsWriter {
         if (!field.isTokenized()) { // un-tokenized field
           String stringValue = field.stringValue();
           Token token = localToken;
+          token.clear();
           token.setTermText(stringValue);
           token.setStartOffset(offset);
           token.setEndOffset(offset + stringValue.length());
@@ -1319,7 +1320,10 @@ final class DocumentsWriter {
           try {
             offsetEnd = offset-1;
             Token token;
-            while((token = stream.next(localToken)) != null) {
+            for(;;) {
+              localToken.clear();
+              token = stream.next(localToken);
+              if (token == null) break;
               position += (token.getPositionIncrement() - 1);
               addPosition(token);
               if (++length >= maxFieldLength) {
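The two DocumentsWriter hunks above fix the calling convention for the token-reuse API: the consumer clears the shared Token before each next(Token) call, so producers can rely on a token with default positionIncrement and no stale payload. A sketch of the same pattern for an arbitrary TokenStream consumer, hypothetical and not part of this commit:

    import java.io.IOException;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;

    public class ReusingConsumer {
      public static int countTokens(TokenStream stream) throws IOException {
        final Token localToken = new Token();  // single instance, recycled every call
        int count = 0;
        for (;;) {
          localToken.clear();                  // the consumer's job as of this commit
          Token token = stream.next(localToken);
          if (token == null) break;            // end of stream
          // 'token' may be localToken itself or a different instance the
          // producer chose to return; only read it before the next call.
          count++;
        }
        return count;
      }
    }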
@@ -17,22 +17,17 @@ package org.apache.lucene.index;
  * limitations under the License.
  */
 
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.SimpleAnalyzer;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.document.*;
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Field.TermVector;
+import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.search.Similarity;
 import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;
 
-import java.io.Reader;
 import java.io.IOException;
-import java.util.Arrays;
+import java.io.Reader;
 
 public class TestDocumentWriter extends LuceneTestCase {
   private RAMDirectory dir;
@@ -131,6 +126,70 @@ public class TestDocumentWriter extends LuceneTestCase {
     assertEquals(502, termPositions.nextPosition());
   }
 
+  public void testTokenReuse() throws IOException {
+    Analyzer analyzer = new Analyzer() {
+      public TokenStream tokenStream(String fieldName, Reader reader) {
+        return new TokenFilter(new WhitespaceTokenizer(reader)) {
+          boolean first=true;
+          Token buffered;
+
+          public Token next() throws IOException {
+            return input.next();
+          }
+
+          public Token next(Token result) throws IOException {
+            if (buffered != null) {
+              Token t = buffered;
+              buffered=null;
+              return t;
+            }
+            Token t = input.next(result);
+            if (t==null) return null;
+            if (Character.isDigit(t.termBuffer()[0])) {
+              t.setPositionIncrement(t.termBuffer()[0] - '0');
+            }
+            if (first) {
+              // set payload on first position only
+              t.setPayload(new Payload(new byte[]{100}));
+              first = false;
+            }
+
+            // index a "synonym" for every token
+            buffered = (Token)t.clone();
+            buffered.setPayload(null);
+            buffered.setPositionIncrement(0);
+            buffered.setTermBuffer(new char[]{'b'}, 0, 1);
+
+            return t;
+          }
+        };
+      }
+    };
+
+    IndexWriter writer = new IndexWriter(dir, analyzer, true);
+
+    Document doc = new Document();
+    doc.add(new Field("f1", "a 5 a a", Field.Store.YES, Field.Index.TOKENIZED));
+
+    writer.addDocument(doc);
+    writer.flush();
+    SegmentInfo info = writer.newestSegment();
+    writer.close();
+    SegmentReader reader = SegmentReader.get(info);
+
+    TermPositions termPositions = reader.termPositions(new Term("f1", "a"));
+    assertTrue(termPositions.next());
+    int freq = termPositions.freq();
+    assertEquals(3, freq);
+    assertEquals(0, termPositions.nextPosition());
+    assertEquals(true, termPositions.isPayloadAvailable());
+    assertEquals(6, termPositions.nextPosition());
+    assertEquals(false, termPositions.isPayloadAvailable());
+    assertEquals(7, termPositions.nextPosition());
+    assertEquals(false, termPositions.isPayloadAvailable());
+  }
+
+
   public void testPreAnalyzedField() throws IOException {
     Similarity similarity = Similarity.getDefault();
     IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
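Tracing the test's expected values: WhitespaceTokenizer splits "a 5 a a" into "a", "5", "a", "a"; the filter gives the digit token "5" a position increment of '5' - '0' = 5 and buffers a zero-increment synonym "b" after every token. The positions therefore come out as "a" at 0 (carrying the payload set on the first token only), "5" at 0 + 5 = 5, then "a" at 6 and "a" at 7. Hence freq("a") == 3, positions 0/6/7, and a payload available only at position 0. The synonym is a clone() of the current token precisely so that recycling the shared result instance on the next next(Token) call cannot corrupt the buffered token, which is what this test exercises.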