mirror of https://github.com/apache/lucene.git
LUCENE-580:
- Added the public method reset() to TokenStream.
- Added a new constructor to Field that takes a TokenStream as argument, useful for pre-analyzed fields.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@533549 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 3c60a00b69
commit 80c0f267f6
CHANGES.txt
@@ -47,7 +47,16 @@ API Changes
     (Chris Hostetter, Otis Gospodnetic)

  8. LUCENE-869: Changed FSIndexInput and FSIndexOutput to inner classes of FSDirectory
-    to enable extensibility of these classes.
+    to enable extensibility of these classes. (Michael Busch)
+
+ 9. LUCENE-580: Added the public method reset() to TokenStream. This method does
+    nothing by default, but may be overridden by subclasses to support consuming
+    the TokenStream more than once. (Michael Busch)
+
+10. LUCENE-580: Added a new constructor to Field that takes a TokenStream as
+    argument, available as tokenStreamValue(). This is useful to avoid the need
+    for "dummy analyzers" for pre-analyzed fields. (Karl Wettin, Michael Busch)

 Bug fixes
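Taken together, entries 9 and 10 let a caller index tokens produced outside of an Analyzer. The following is an editor's sketch, not part of the commit: the field name, terms, and in-memory directory are invented for illustration; the IndexWriter and Token usage mirrors the tests added further down.

// Editor's sketch: indexing a pre-analyzed field with the new
// Field(String, TokenStream) constructor. Names are illustrative.
import java.io.IOException;

import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.RAMDirectory;

public class PreAnalyzedFieldExample {
  public static void main(String[] args) throws IOException {
    RAMDirectory dir = new RAMDirectory();
    // an Analyzer is still required by IndexWriter, but it is bypassed
    // for the pre-analyzed field below
    IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer());

    // tokens produced by some external analysis step
    final String[] terms = {"hello", "world"};
    TokenStream stream = new TokenStream() {
      private int index = 0;
      public Token next() {
        return index < terms.length ? new Token(terms[index++], 0, 0) : null;
      }
    };

    Document doc = new Document();
    doc.add(new Field("body", stream)); // no "dummy analyzer" needed
    writer.addDocument(doc);            // the stream is consumed here
    writer.close();
  }
}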
org/apache/lucene/analysis/CachingTokenFilter.java (new file)
@@ -0,0 +1,68 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

/**
 * This class can be used if the Tokens of a TokenStream
 * are intended to be consumed more than once. It caches
 * all Tokens locally in a List.
 *
 * CachingTokenFilter implements the optional method
 * {@link TokenStream#reset()}, which repositions the
 * stream to the first Token.
 */
public class CachingTokenFilter extends TokenFilter {
  private List cache;
  private int index;

  public CachingTokenFilter(TokenStream input) {
    super(input);
  }

  public Token next() throws IOException {
    if (cache == null) {
      // fill cache lazily
      cache = new LinkedList();
      fillCache();
    }

    if (index == cache.size()) {
      // the cache is exhausted, return null
      return null;
    }

    return (Token) cache.get(index++);
  }

  public void reset() throws IOException {
    index = 0;
  }

  private void fillCache() throws IOException {
    Token token;
    while ((token = input.next()) != null) {
      cache.add(token);
    }
  }
}
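A minimal usage sketch (editor's illustration, not part of the commit; the tokenizer and input text are arbitrary): wrapping a stream in CachingTokenFilter lets the same tokens be consumed twice, with reset() replaying from the cache instead of re-tokenizing.

// Editor's sketch: consuming a stream twice via CachingTokenFilter.
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

public class CachingTokenFilterExample {
  public static void main(String[] args) throws IOException {
    TokenStream stream =
        new CachingTokenFilter(new WhitespaceTokenizer(new StringReader("a b c")));

    // first pass lazily fills the cache from the underlying tokenizer
    for (Token t = stream.next(); t != null; t = stream.next()) {
      System.out.println("pass 1: " + t.termText());
    }

    // reset() repositions to the first cached Token; the underlying
    // tokenizer is not consulted again
    stream.reset();
    for (Token t = stream.next(); t != null; t = stream.next()) {
      System.out.println("pass 2: " + t.termText());
    }
  }
}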
org/apache/lucene/analysis/TokenStream.java
@@ -35,6 +35,15 @@ public abstract class TokenStream {
   /** Returns the next token in the stream, or null at EOS. */
   public abstract Token next() throws IOException;

+  /** Resets this stream to the beginning. This is an
+   * optional operation, so subclasses may or may not
+   * implement this method. reset() is not needed for
+   * the standard indexing process. However, if the Tokens
+   * of a TokenStream are intended to be consumed more than
+   * once, it is necessary to implement reset().
+   */
+  public void reset() throws IOException {}
+
   /** Releases resources associated with this stream. */
   public void close() throws IOException {}
 }
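For a subclass that opts into the optional operation, reset() only needs to rewind whatever state next() advances. An editor's sketch over a fixed term array (the class and its names are invented for illustration, in the same spirit as the anonymous streams in the tests below):

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

// Editor's sketch of a resettable TokenStream; illustrative only.
public class ReplayTokenStream extends TokenStream {
  private final String[] terms;
  private int index = 0;

  public ReplayTokenStream(String[] terms) {
    this.terms = terms;
  }

  public Token next() throws IOException {
    if (index == terms.length) {
      return null; // EOS
    }
    return new Token(terms[index++], 0, 0);
  }

  // the optional operation: rewind to the first token
  public void reset() throws IOException {
    index = 0;
  }
}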
org/apache/lucene/document/Field.java
@@ -17,6 +17,7 @@ package org.apache.lucene.document;
  * limitations under the License.
  */

+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.index.IndexWriter;   // for javadoc
 import org.apache.lucene.util.Parameter;

@@ -134,21 +135,26 @@ public final class Field extends AbstractField implements Fieldable, Serializable
   }

-  /** The value of the field as a String, or null. If null, the Reader value
-   * or binary value is used. Exactly one of stringValue(), readerValue(), and
-   * binaryValue() must be set. */
+  /** The value of the field as a String, or null. If null, the Reader value,
+   * binary value, or TokenStream value is used. Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
   public String stringValue()   { return fieldsData instanceof String ? (String)fieldsData : null; }

-  /** The value of the field as a Reader, or null. If null, the String value
-   * or binary value is used. Exactly one of stringValue(), readerValue(),
-   * and binaryValue() must be set. */
+  /** The value of the field as a Reader, or null. If null, the String value,
+   * binary value, or TokenStream value is used. Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
   public Reader readerValue()   { return fieldsData instanceof Reader ? (Reader)fieldsData : null; }

-  /** The value of the field in Binary, or null. If null, the Reader or
-   * String value is used. Exactly one of stringValue(), readerValue() and
-   * binaryValue() must be set. */
+  /** The value of the field in Binary, or null. If null, the Reader value,
+   * String value, or TokenStream value is used. Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
   public byte[] binaryValue()   { return fieldsData instanceof byte[] ? (byte[])fieldsData : null; }

+  /** The value of the field as a TokenStream, or null. If null, the Reader value,
+   * String value, or binary value is used. Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  public TokenStream tokenStreamValue()   { return fieldsData instanceof TokenStream ? (TokenStream)fieldsData : null; }
+
   /**
    * Create a field by specifying its name, value and how it will
    * be saved in the index. Term vectors will not be stored in the index.

@@ -280,6 +286,54 @@ public final class Field extends AbstractField implements Fieldable, Serializable

     setStoreTermVector(termVector);
   }

+  /**
+   * Create a tokenized and indexed field that is not stored. Term vectors will
+   * not be stored. This is useful for pre-analyzed fields.
+   * The TokenStream is read only when the Document is added to the index,
+   * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
+   * has been called.
+   *
+   * @param name The name of the field
+   * @param tokenStream The TokenStream with the content
+   * @throws NullPointerException if name or tokenStream is <code>null</code>
+   */
+  public Field(String name, TokenStream tokenStream) {
+    this(name, tokenStream, TermVector.NO);
+  }
+
+  /**
+   * Create a tokenized and indexed field that is not stored, optionally with
+   * storing term vectors. This is useful for pre-analyzed fields.
+   * The TokenStream is read only when the Document is added to the index,
+   * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
+   * has been called.
+   *
+   * @param name The name of the field
+   * @param tokenStream The TokenStream with the content
+   * @param termVector Whether term vector should be stored
+   * @throws NullPointerException if name or tokenStream is <code>null</code>
+   */
+  public Field(String name, TokenStream tokenStream, TermVector termVector) {
+    if (name == null)
+      throw new NullPointerException("name cannot be null");
+    if (tokenStream == null)
+      throw new NullPointerException("tokenStream cannot be null");
+
+    this.name = name.intern();        // field names are interned
+    this.fieldsData = tokenStream;
+
+    this.isStored = false;
+    this.isCompressed = false;
+
+    this.isIndexed = true;
+    this.isTokenized = true;
+
+    this.isBinary = false;
+
+    setStoreTermVector(termVector);
+  }
+
   /**
    * Create a stored field with binary value. Optionally the value may be compressed.
org/apache/lucene/document/Fieldable.java
@@ -19,6 +19,8 @@ package org.apache.lucene.document;
 import java.io.Reader;
 import java.io.Serializable;

+import org.apache.lucene.analysis.TokenStream;
+
 /**
  * Synonymous with {@link Field}.
  *

@@ -60,20 +62,25 @@ public interface Fieldable extends Serializable {
    */
   String name();

-  /** The value of the field as a String, or null. If null, the Reader value
-   * or binary value is used. Exactly one of stringValue(), readerValue(), and
-   * binaryValue() must be set. */
-  String stringValue();
-
-  /** The value of the field as a Reader, or null. If null, the String value
-   * or binary value is used. Exactly one of stringValue(), readerValue(),
-   * and binaryValue() must be set. */
-  Reader readerValue();
-
-  /** The value of the field in Binary, or null. If null, the Reader or
-   * String value is used. Exactly one of stringValue(), readerValue() and
-   * binaryValue() must be set. */
-  byte[] binaryValue();
+  /** The value of the field as a String, or null. If null, the Reader value,
+   * binary value, or TokenStream value is used. Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  public String stringValue();
+
+  /** The value of the field as a Reader, or null. If null, the String value,
+   * binary value, or TokenStream value is used. Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  public Reader readerValue();
+
+  /** The value of the field in Binary, or null. If null, the Reader value,
+   * String value, or TokenStream value is used. Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  public byte[] binaryValue();
+
+  /** The value of the field as a TokenStream, or null. If null, the Reader value,
+   * String value, or binary value is used. Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  public TokenStream tokenStreamValue();

   /** True iff the value of the field is to be stored in the index for return
       with search hits. It is an error for this to be true if a field is
org/apache/lucene/index/DocumentWriter.java
@@ -162,18 +162,28 @@ final class DocumentWriter {
       offset += stringValue.length();
       length++;
     } else
-    {
-      Reader reader;                        // find or make Reader
-      if (field.readerValue() != null)
-        reader = field.readerValue();
-      else if (field.stringValue() != null)
-        reader = new StringReader(field.stringValue());
-      else
-        throw new IllegalArgumentException
-          ("field must have either String or Reader value");
-
-      // Tokenize field and add to postingTable
-      TokenStream stream = analyzer.tokenStream(fieldName, reader);
+    { // tokenized field
+      TokenStream stream = field.tokenStreamValue();
+
+      // the field does not have a TokenStream,
+      // so we have to obtain one from the analyzer
+      if (stream == null) {
+        Reader reader;                      // find or make Reader
+        if (field.readerValue() != null)
+          reader = field.readerValue();
+        else if (field.stringValue() != null)
+          reader = new StringReader(field.stringValue());
+        else
+          throw new IllegalArgumentException
+            ("field must have either String or Reader value");
+
+        // Tokenize field and add to postingTable
+        stream = analyzer.tokenStream(fieldName, reader);
+      }
+
+      // reset the TokenStream to the first token
+      stream.reset();
+
       try {
         Token lastToken = null;
         for (Token t = stream.next(); t != null; t = stream.next()) {
org/apache/lucene/index/FieldsReader.java
@@ -17,6 +17,7 @@ package org.apache.lucene.index;
  * limitations under the License.
  */

+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.*;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;

@@ -331,11 +332,9 @@ final class FieldsReader {
       return localFieldsStream;
     }

-    /**
-     * The value of the field in Binary, or null. If null, the Reader or
-     * String value is used. Exactly one of stringValue(), readerValue() and
-     * binaryValue() must be set.
-     */
+    /** The value of the field in Binary, or null. If null, the Reader value,
+     * String value, or TokenStream value is used. Exactly one of stringValue(),
+     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
     public byte[] binaryValue() {
       ensureOpen();
       if (fieldsData == null) {

@@ -358,21 +357,26 @@ final class FieldsReader {
       return fieldsData instanceof byte[] ? (byte[]) fieldsData : null;
     }

-    /**
-     * The value of the field as a Reader, or null. If null, the String value
-     * or binary value is used. Exactly one of stringValue(), readerValue(),
-     * and binaryValue() must be set.
-     */
+    /** The value of the field as a Reader, or null. If null, the String value,
+     * binary value, or TokenStream value is used. Exactly one of stringValue(),
+     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
     public Reader readerValue() {
       ensureOpen();
       return fieldsData instanceof Reader ? (Reader) fieldsData : null;
     }

-    /**
-     * The value of the field as a String, or null. If null, the Reader value
-     * or binary value is used. Exactly one of stringValue(), readerValue(), and
-     * binaryValue() must be set.
-     */
+    /** The value of the field as a TokenStream, or null. If null, the Reader value,
+     * String value, or binary value is used. Exactly one of stringValue(),
+     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+    public TokenStream tokenStreamValue() {
+      ensureOpen();
+      return fieldsData instanceof TokenStream ? (TokenStream) fieldsData : null;
+    }
+
+    /** The value of the field as a String, or null. If null, the Reader value,
+     * binary value, or TokenStream value is used. Exactly one of stringValue(),
+     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
     public String stringValue() {
       ensureOpen();
       if (fieldsData == null) {

@@ -462,6 +466,11 @@ final class FieldsReader {
     public byte[] binaryValue() {
       return (byte[]) this.fieldsData;
     }

+    public TokenStream tokenStreamValue() {
+      // not needed for merge
+      return null;
+    }
+
     public FieldForMerge(Object value, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) {
       this.isStored = true;
org/apache/lucene/analysis/TestCachingTokenFilter.java (new file)
@@ -0,0 +1,103 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import junit.framework.TestCase;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class TestCachingTokenFilter extends TestCase {
  private String[] tokens = new String[] {"term1", "term2", "term3", "term2"};

  public void testCaching() throws IOException {
    Directory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer());
    Document doc = new Document();
    TokenStream stream = new TokenStream() {
      private int index = 0;

      public Token next() throws IOException {
        if (index == tokens.length) {
          return null;
        } else {
          return new Token(tokens[index++], 0, 0);
        }
      }
    };

    stream = new CachingTokenFilter(stream);

    doc.add(new Field("preanalyzed", stream, TermVector.NO));

    // 1) we consume all tokens twice before we add the doc to the index
    checkTokens(stream);
    stream.reset();
    checkTokens(stream);

    // 2) now add the document to the index and verify if all tokens are indexed
    //    don't reset the stream here, the DocumentWriter should do that implicitly
    writer.addDocument(doc);
    writer.close();

    IndexReader reader = IndexReader.open(dir);
    TermPositions termPositions = reader.termPositions(new Term("preanalyzed", "term1"));
    assertTrue(termPositions.next());
    assertEquals(1, termPositions.freq());
    assertEquals(0, termPositions.nextPosition());

    termPositions.seek(new Term("preanalyzed", "term2"));
    assertTrue(termPositions.next());
    assertEquals(2, termPositions.freq());
    assertEquals(1, termPositions.nextPosition());
    assertEquals(3, termPositions.nextPosition());

    termPositions.seek(new Term("preanalyzed", "term3"));
    assertTrue(termPositions.next());
    assertEquals(1, termPositions.freq());
    assertEquals(2, termPositions.nextPosition());
    reader.close();

    // 3) reset stream and consume tokens again
    stream.reset();
    checkTokens(stream);
  }

  private void checkTokens(TokenStream stream) throws IOException {
    int count = 0;
    Token token;
    while ((token = stream.next()) != null) {
      assertTrue(count < tokens.length);
      assertEquals(tokens[count], token.termText);
      count++;
    }

    assertEquals(tokens.length, count);
  }
}
org/apache/lucene/index/TestDocumentWriter.java
@@ -19,10 +19,13 @@ package org.apache.lucene.index;

 import junit.framework.TestCase;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.document.*;
+import org.apache.lucene.document.Field.TermVector;
 import org.apache.lucene.search.Similarity;
 import org.apache.lucene.store.RAMDirectory;

@@ -124,4 +127,45 @@ public class TestDocumentWriter extends TestCase {
     assertEquals(0, termPositions.nextPosition());
     assertEquals(502, termPositions.nextPosition());
   }
+
+  public void testPreAnalyzedField() throws IOException {
+    Similarity similarity = Similarity.getDefault();
+    DocumentWriter writer = new DocumentWriter(dir, new SimpleAnalyzer(), similarity, 50);
+    Document doc = new Document();
+
+    doc.add(new Field("preanalyzed", new TokenStream() {
+      private String[] tokens = new String[] {"term1", "term2", "term3", "term2"};
+      private int index = 0;
+
+      public Token next() throws IOException {
+        if (index == tokens.length) {
+          return null;
+        } else {
+          return new Token(tokens[index++], 0, 0);
+        }
+      }
+
+    }, TermVector.NO));
+
+    String segName = "test";
+    writer.addDocument(segName, doc);
+    SegmentReader reader = SegmentReader.get(new SegmentInfo(segName, 1, dir));
+
+    TermPositions termPositions = reader.termPositions(new Term("preanalyzed", "term1"));
+    assertTrue(termPositions.next());
+    assertEquals(1, termPositions.freq());
+    assertEquals(0, termPositions.nextPosition());
+
+    termPositions.seek(new Term("preanalyzed", "term2"));
+    assertTrue(termPositions.next());
+    assertEquals(2, termPositions.freq());
+    assertEquals(1, termPositions.nextPosition());
+    assertEquals(3, termPositions.nextPosition());
+
+    termPositions.seek(new Term("preanalyzed", "term3"));
+    assertTrue(termPositions.next());
+    assertEquals(1, termPositions.freq());
+    assertEquals(2, termPositions.nextPosition());
+  }
 }