LUCENE-580:

- Added the public method reset() to TokenStream.
- Added a new constructor to Field that takes a TokenStream as argument, useful for pre-analyzed fields.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@533549 13f79535-47bb-0310-9956-ffa450edef68
Michael Busch 2007-04-29 19:26:11 +00:00
parent 3c60a00b69
commit 80c0f267f6
9 changed files with 364 additions and 51 deletions
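
Taken together, these changes let an application hand Lucene a ready-made token
sequence instead of a String or Reader. A minimal usage sketch, assuming the
post-commit API and an open IndexWriter named writer (the field name "contents"
and the WhitespaceTokenizer input are illustrative, not part of this commit):

import java.io.StringReader;

import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

// Wrap the tokenizer in CachingTokenFilter so the tokens can be
// consumed more than once; its reset() rewinds to the first Token.
TokenStream stream = new CachingTokenFilter(
    new WhitespaceTokenizer(new StringReader("some pre-analyzed text")));

Document doc = new Document();
doc.add(new Field("contents", stream));  // the new TokenStream-based constructor
writer.addDocument(doc);                 // the stream is read here, not before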

CHANGES.txt

@@ -47,7 +47,16 @@ API Changes

    (Chris Hostetter, Otis Gospodnetic)

 8. LUCENE-869: Changed FSIndexInput and FSIndexOutput to inner classes of FSDirectory
    to enable extensibility of these classes. (Michael Busch)

 9. LUCENE-580: Added the public method reset() to TokenStream. This method does
    nothing by default, but may be overridden by subclasses to support consuming
    the TokenStream more than once. (Michael Busch)

10. LUCENE-580: Added a new constructor to Field that takes a TokenStream as
    argument, available as tokenStreamValue(). This is useful to avoid the need
    for "dummy analyzers" for pre-analyzed fields. (Karl Wettin, Michael Busch)

Bug fixes

src/java/org/apache/lucene/analysis/CachingTokenFilter.java

@@ -0,0 +1,68 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

/**
 * This class can be used if the Tokens of a TokenStream
 * are intended to be consumed more than once. It caches
 * all Tokens locally in a List.
 *
 * CachingTokenFilter implements the optional method
 * {@link TokenStream#reset()}, which repositions the
 * stream to the first Token.
 */
public class CachingTokenFilter extends TokenFilter {
  private List cache;
  private int index;

  public CachingTokenFilter(TokenStream input) {
    super(input);
  }

  public Token next() throws IOException {
    if (cache == null) {
      // fill cache lazily
      cache = new LinkedList();
      fillCache();
    }

    if (index == cache.size()) {
      // the cache is exhausted, return null
      return null;
    }

    return (Token) cache.get(index++);
  }

  public void reset() throws IOException {
    index = 0;
  }

  private void fillCache() throws IOException {
    Token token;
    while ( (token = input.next()) != null) {
      cache.add(token);
    }
  }
}
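
A hedged sketch of the two-pass behavior this enables; LowerCaseTokenizer and
the input text are arbitrary examples, not part of this commit:

import java.io.StringReader;

// First pass fills the cache from the wrapped stream; reset() then
// replays the cached Tokens without touching the input again.
TokenStream cached = new CachingTokenFilter(
    new LowerCaseTokenizer(new StringReader("Some Text")));

for (Token t = cached.next(); t != null; t = cached.next()) {
  System.out.println(t.termText());   // first pass: caches as it goes
}
cached.reset();                        // reposition to the first cached Token
for (Token t = cached.next(); t != null; t = cached.next()) {
  System.out.println(t.termText());   // second pass: served from the cache
}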

src/java/org/apache/lucene/analysis/TokenStream.java

@@ -35,6 +35,15 @@ public abstract class TokenStream {

  /** Returns the next token in the stream, or null at EOS. */
  public abstract Token next() throws IOException;

  /** Resets this stream to the beginning. This is an
   * optional operation, so subclasses may or may not
   * implement this method. reset() is not needed for
   * the standard indexing process. However, if the Tokens
   * of a TokenStream are intended to be consumed more than
   * once, it is necessary to implement reset().
   */
  public void reset() throws IOException {}

  /** Releases resources associated with this stream. */
  public void close() throws IOException {}
}
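
To make the contract concrete, a hedged sketch (not part of the patch) of a
subclass that supports reset() by rewinding an in-memory position:

import java.io.IOException;

// Illustrative only: a resettable stream over pre-computed terms.
class ArrayTokenStream extends TokenStream {
  private final String[] terms;
  private int index = 0;

  ArrayTokenStream(String[] terms) {
    this.terms = terms;
  }

  public Token next() throws IOException {
    if (index == terms.length)
      return null;                     // EOS
    return new Token(terms[index++], 0, 0);
  }

  public void reset() throws IOException {
    index = 0;                         // rewind to the first Token
  }
}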

src/java/org/apache/lucene/document/Field.java

@@ -17,6 +17,7 @@ package org.apache.lucene.document;
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexWriter;   // for javadoc
import org.apache.lucene.util.Parameter;

@@ -134,21 +135,26 @@ public final class Field extends AbstractField implements Fieldable, Serializable
  }

  /** The value of the field as a String, or null. If null, the Reader value,
   * binary value, or TokenStream value is used. Exactly one of stringValue(),
   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
  public String stringValue() { return fieldsData instanceof String ? (String)fieldsData : null; }

  /** The value of the field as a Reader, or null. If null, the String value,
   * binary value, or TokenStream value is used. Exactly one of stringValue(),
   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
  public Reader readerValue() { return fieldsData instanceof Reader ? (Reader)fieldsData : null; }

  /** The value of the field in Binary, or null. If null, the Reader value,
   * String value, or TokenStream value is used. Exactly one of stringValue(),
   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
  public byte[] binaryValue() { return fieldsData instanceof byte[] ? (byte[])fieldsData : null; }

  /** The value of the field as a TokenStream, or null. If null, the Reader value,
   * String value, or binary value is used. Exactly one of stringValue(),
   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
  public TokenStream tokenStreamValue() { return fieldsData instanceof TokenStream ? (TokenStream)fieldsData : null; }

  /**
   * Create a field by specifying its name, value and how it will
   * be saved in the index. Term vectors will not be stored in the index.

@@ -280,6 +286,54 @@
    setStoreTermVector(termVector);
  }
  /**
   * Create a tokenized and indexed field that is not stored. Term vectors will
   * not be stored. This is useful for pre-analyzed fields.
   * The TokenStream is read only when the Document is added to the index,
   * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
   * has been called.
   *
   * @param name The name of the field
   * @param tokenStream The TokenStream with the content
   * @throws NullPointerException if name or tokenStream is <code>null</code>
   */
  public Field(String name, TokenStream tokenStream) {
    this(name, tokenStream, TermVector.NO);
  }

  /**
   * Create a tokenized and indexed field that is not stored, optionally with
   * storing term vectors. This is useful for pre-analyzed fields.
   * The TokenStream is read only when the Document is added to the index,
   * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
   * has been called.
   *
   * @param name The name of the field
   * @param tokenStream The TokenStream with the content
   * @param termVector Whether term vector should be stored
   * @throws NullPointerException if name or tokenStream is <code>null</code>
   */
  public Field(String name, TokenStream tokenStream, TermVector termVector) {
    if (name == null)
      throw new NullPointerException("name cannot be null");
    if (tokenStream == null)
      throw new NullPointerException("tokenStream cannot be null");

    this.name = name.intern();        // field names are interned
    this.fieldsData = tokenStream;

    this.isStored = false;
    this.isCompressed = false;
    this.isIndexed = true;
    this.isTokenized = true;
    this.isBinary = false;

    setStoreTermVector(termVector);
  }

  /**
   * Create a stored field with binary value. Optionally the value may be compressed.
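
A usage sketch for the TermVector-taking constructor, assuming an open
IndexWriter named writer; the field name and input text are illustrative:

TokenStream stream = new CachingTokenFilter(
    new WhitespaceTokenizer(new StringReader("quick brown fox")));

Document doc = new Document();
// Index the pre-analyzed tokens and additionally store term vectors
// with positions and offsets for this field.
doc.add(new Field("body", stream, Field.TermVector.WITH_POSITIONS_OFFSETS));
writer.addDocument(doc);   // don't close the stream before this call returns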

src/java/org/apache/lucene/document/Fieldable.java

@@ -19,6 +19,8 @@ package org.apache.lucene.document;

import java.io.Reader;
import java.io.Serializable;

import org.apache.lucene.analysis.TokenStream;

/**
 * Synonymous with {@link Field}.
 *

@@ -60,20 +62,25 @@ public interface Fieldable extends Serializable {
   */
  String name();

  /** The value of the field as a String, or null. If null, the Reader value,
   * binary value, or TokenStream value is used. Exactly one of stringValue(),
   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
  public String stringValue();

  /** The value of the field as a Reader, or null. If null, the String value,
   * binary value, or TokenStream value is used. Exactly one of stringValue(),
   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
  public Reader readerValue();

  /** The value of the field in Binary, or null. If null, the Reader value,
   * String value, or TokenStream value is used. Exactly one of stringValue(),
   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
  public byte[] binaryValue();

  /** The value of the field as a TokenStream, or null. If null, the Reader value,
   * String value, or binary value is used. Exactly one of stringValue(),
   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
  public TokenStream tokenStreamValue();

  /** True iff the value of the field is to be stored in the index for return
      with search hits. It is an error for this to be true if a field is

src/java/org/apache/lucene/index/DocumentWriter.java

@@ -162,18 +162,28 @@ final class DocumentWriter {
        offset += stringValue.length();
        length++;
      } else
      { // tokenized field
        TokenStream stream = field.tokenStreamValue();

        // the field does not have a TokenStream,
        // so we have to obtain one from the analyzer
        if (stream == null) {
          Reader reader;          // find or make Reader
          if (field.readerValue() != null)
            reader = field.readerValue();
          else if (field.stringValue() != null)
            reader = new StringReader(field.stringValue());
          else
            throw new IllegalArgumentException
              ("field must have either String or Reader value");

          // Tokenize field and add to postingTable
          stream = analyzer.tokenStream(fieldName, reader);
        }

        // reset the TokenStream to the first token
        stream.reset();

        try {
          Token lastToken = null;
          for (Token t = stream.next(); t != null; t = stream.next()) {

src/java/org/apache/lucene/index/FieldsReader.java

@@ -17,6 +17,7 @@ package org.apache.lucene.index;
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;

@@ -331,11 +332,9 @@ final class FieldsReader {
      return localFieldsStream;
    }

    /** The value of the field in Binary, or null. If null, the Reader value,
     * String value, or TokenStream value is used. Exactly one of stringValue(),
     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
    public byte[] binaryValue() {
      ensureOpen();
      if (fieldsData == null) {

@@ -358,21 +357,26 @@ final class FieldsReader {
      return fieldsData instanceof byte[] ? (byte[]) fieldsData : null;
    }

    /** The value of the field as a Reader, or null. If null, the String value,
     * binary value, or TokenStream value is used. Exactly one of stringValue(),
     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
    public Reader readerValue() {
      ensureOpen();
      return fieldsData instanceof Reader ? (Reader) fieldsData : null;
    }

    /** The value of the field as a TokenStream, or null. If null, the Reader value,
     * String value, or binary value is used. Exactly one of stringValue(),
     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
    public TokenStream tokenStreamValue() {
      ensureOpen();
      return fieldsData instanceof TokenStream ? (TokenStream) fieldsData : null;
    }

    /** The value of the field as a String, or null. If null, the Reader value,
     * binary value, or TokenStream value is used. Exactly one of stringValue(),
     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
    public String stringValue() {
      ensureOpen();
      if (fieldsData == null) {

@@ -462,6 +466,11 @@ final class FieldsReader {
      public byte[] binaryValue() {
        return (byte[]) this.fieldsData;
      }

      public TokenStream tokenStreamValue() {
        // not needed for merge
        return null;
      }

      public FieldForMerge(Object value, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) {
        this.isStored = true;

src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java

@@ -0,0 +1,103 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import junit.framework.TestCase;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class TestCachingTokenFilter extends TestCase {
  private String[] tokens = new String[] {"term1", "term2", "term3", "term2"};

  public void testCaching() throws IOException {
    Directory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer());
    Document doc = new Document();
    TokenStream stream = new TokenStream() {
      private int index = 0;

      public Token next() throws IOException {
        if (index == tokens.length) {
          return null;
        } else {
          return new Token(tokens[index++], 0, 0);
        }
      }
    };

    stream = new CachingTokenFilter(stream);

    doc.add(new Field("preanalyzed", stream, TermVector.NO));

    // 1) we consume all tokens twice before we add the doc to the index
    checkTokens(stream);
    stream.reset();
    checkTokens(stream);

    // 2) now add the document to the index and verify if all tokens are indexed
    //    don't reset the stream here, the DocumentWriter should do that implicitly
    writer.addDocument(doc);
    writer.close();

    IndexReader reader = IndexReader.open(dir);
    TermPositions termPositions = reader.termPositions(new Term("preanalyzed", "term1"));
    assertTrue(termPositions.next());
    assertEquals(1, termPositions.freq());
    assertEquals(0, termPositions.nextPosition());

    termPositions.seek(new Term("preanalyzed", "term2"));
    assertTrue(termPositions.next());
    assertEquals(2, termPositions.freq());
    assertEquals(1, termPositions.nextPosition());
    assertEquals(3, termPositions.nextPosition());

    termPositions.seek(new Term("preanalyzed", "term3"));
    assertTrue(termPositions.next());
    assertEquals(1, termPositions.freq());
    assertEquals(2, termPositions.nextPosition());

    reader.close();

    // 3) reset stream and consume tokens again
    stream.reset();
    checkTokens(stream);
  }

  private void checkTokens(TokenStream stream) throws IOException {
    int count = 0;
    Token token;
    while ((token = stream.next()) != null) {
      assertTrue(count < tokens.length);
      assertEquals(tokens[count], token.termText);
      count++;
    }
    assertEquals(tokens.length, count);
  }
}

src/test/org/apache/lucene/index/TestDocumentWriter.java

@@ -19,10 +19,13 @@ package org.apache.lucene.index;

import junit.framework.TestCase;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.document.*;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.RAMDirectory;

@@ -124,4 +127,45 @@ public class TestDocumentWriter extends TestCase {
    assertEquals(0, termPositions.nextPosition());
    assertEquals(502, termPositions.nextPosition());
  }

  public void testPreAnalyzedField() throws IOException {
    Similarity similarity = Similarity.getDefault();
    DocumentWriter writer = new DocumentWriter(dir, new SimpleAnalyzer(), similarity, 50);
    Document doc = new Document();

    doc.add(new Field("preanalyzed", new TokenStream() {
      private String[] tokens = new String[] {"term1", "term2", "term3", "term2"};
      private int index = 0;

      public Token next() throws IOException {
        if (index == tokens.length) {
          return null;
        } else {
          return new Token(tokens[index++], 0, 0);
        }
      }
    }, TermVector.NO));

    String segName = "test";
    writer.addDocument(segName, doc);

    SegmentReader reader = SegmentReader.get(new SegmentInfo(segName, 1, dir));

    TermPositions termPositions = reader.termPositions(new Term("preanalyzed", "term1"));
    assertTrue(termPositions.next());
    assertEquals(1, termPositions.freq());
    assertEquals(0, termPositions.nextPosition());

    termPositions.seek(new Term("preanalyzed", "term2"));
    assertTrue(termPositions.next());
    assertEquals(2, termPositions.freq());
    assertEquals(1, termPositions.nextPosition());
    assertEquals(3, termPositions.nextPosition());

    termPositions.seek(new Term("preanalyzed", "term3"));
    assertTrue(termPositions.next());
    assertEquals(1, termPositions.freq());
    assertEquals(2, termPositions.nextPosition());
  }
}