From 80c0f267f6ea53820392ef658024dea4b0ffe84e Mon Sep 17 00:00:00 2001
From: Michael Busch
Date: Sun, 29 Apr 2007 19:26:11 +0000
Subject: [PATCH] LUCENE-580:
- Added the public method reset() to TokenStream.
- Added a new constructor to Field that takes a TokenStream as argument,
  useful for pre-analyzed fields.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@533549 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                                   |  11 +-
 .../lucene/analysis/CachingTokenFilter.java   |  68 ++++++++++++
 .../apache/lucene/analysis/TokenStream.java   |   9 ++
 .../org/apache/lucene/document/Field.java     |  72 ++++++++++--
 .../org/apache/lucene/document/Fieldable.java |  35 +++---
 .../apache/lucene/index/DocumentWriter.java   |  34 ++++--
 .../org/apache/lucene/index/FieldsReader.java |  39 ++++---
 .../analysis/TestCachingTokenFilter.java      | 103 ++++++++++++++++++
 .../lucene/index/TestDocumentWriter.java      |  44 ++++++++
 9 files changed, 364 insertions(+), 51 deletions(-)
 create mode 100644 src/java/org/apache/lucene/analysis/CachingTokenFilter.java
 create mode 100644 src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java

diff --git a/CHANGES.txt b/CHANGES.txt
index 0bb9ffe099d..6100929aacd 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -47,7 +47,16 @@ API Changes
     (Chris Hostetter, Otis Gospodnetic)
 
  8. LUCENE-869: Changed FSIndexInput and FSIndexOutput to inner classes of FSDirectory
-    to enable extensibility of these classes.
+    to enable extensibility of these classes. (Michael Busch)
+
+ 9. LUCENE-580: Added the public method reset() to TokenStream. This method does
+    nothing by default, but may be overridden by subclasses to support consuming
+    the TokenStream more than once. (Michael Busch)
+
+10. LUCENE-580: Added a new constructor to Field that takes a TokenStream as
+    argument, available as tokenStreamValue(). This is useful to avoid the need
+    for "dummy analyzers" for pre-analyzed fields. (Karl Wettin, Michael Busch)
+
 
 Bug fixes
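With the new constructor a field can be indexed from an already-built TokenStream, bypassing the Analyzer entirely. A minimal usage sketch of the API this patch adds (the class name and field name are illustrative, not part of the patch; the analyzer passed to IndexWriter is not consulted for the pre-analyzed field):

    import java.io.StringReader;
    import org.apache.lucene.analysis.SimpleAnalyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.RAMDirectory;

    public class PreAnalyzedFieldExample {
      public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer());

        // any pre-built TokenStream works; a tokenizer is used here for brevity
        TokenStream stream =
            new WhitespaceTokenizer(new StringReader("term1 term2 term3"));

        Document doc = new Document();
        doc.add(new Field("preanalyzed", stream));  // indexed, tokenized, not stored

        writer.addDocument(doc);  // the stream is consumed here; don't close it earlier
        writer.close();
      }
    }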
diff --git a/src/java/org/apache/lucene/analysis/CachingTokenFilter.java b/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
new file mode 100644
index 00000000000..c49729eca22
--- /dev/null
+++ b/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
@@ -0,0 +1,68 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * This class can be used if the Tokens of a TokenStream
+ * are intended to be consumed more than once. It caches
+ * all Tokens locally in a List.
+ *
+ * CachingTokenFilter implements the optional method
+ * {@link TokenStream#reset()}, which repositions the
+ * stream to the first Token.
+ */
+public class CachingTokenFilter extends TokenFilter {
+  private List cache;
+  private int index;
+
+  public CachingTokenFilter(TokenStream input) {
+    super(input);
+  }
+
+  public Token next() throws IOException {
+    if (cache == null) {
+      // fill cache lazily
+      cache = new LinkedList();
+      fillCache();
+    }
+
+    if (index == cache.size()) {
+      // the cache is exhausted, return null
+      return null;
+    }
+
+    return (Token) cache.get(index++);
+  }
+
+  public void reset() throws IOException {
+    index = 0;
+  }
+
+  private void fillCache() throws IOException {
+    Token token;
+    while ( (token = input.next()) != null) {
+      cache.add(token);
+    }
+  }
+
+}
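Because most TokenStreams are one-shot, CachingTokenFilter is the general way to obtain a replayable stream. A small sketch of the replay pattern, mirroring what the test below does (names are illustrative; termText() is the Token accessor in this code base):

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.CachingTokenFilter;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;

    public class CachingExample {
      public static void main(String[] args) throws IOException {
        // wrap a one-shot stream so it can be replayed
        TokenStream stream = new CachingTokenFilter(
            new WhitespaceTokenizer(new StringReader("a b a")));

        for (Token t = stream.next(); t != null; t = stream.next()) {
          System.out.println(t.termText());  // first pass fills the cache
        }
        stream.reset();                      // rewind to the first cached Token
        for (Token t = stream.next(); t != null; t = stream.next()) {
          System.out.println(t.termText());  // second pass is served from the cache
        }
      }
    }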
diff --git a/src/java/org/apache/lucene/analysis/TokenStream.java b/src/java/org/apache/lucene/analysis/TokenStream.java
index a1f97b0ecbf..98ba85a1a4a 100644
--- a/src/java/org/apache/lucene/analysis/TokenStream.java
+++ b/src/java/org/apache/lucene/analysis/TokenStream.java
@@ -35,6 +35,15 @@ public abstract class TokenStream {
   /** Returns the next token in the stream, or null at EOS. */
   public abstract Token next() throws IOException;
 
+  /** Resets this stream to the beginning. This is an
+   *  optional operation, so subclasses may or may not
+   *  implement this method. reset() is not needed for
+   *  the standard indexing process. However, if the Tokens
+   *  of a TokenStream are intended to be consumed more than
+   *  once, it is necessary to implement reset().
+   */
+  public void reset() throws IOException {}
+
   /** Releases resources associated with this stream. */
   public void close() throws IOException {}
 }
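A stream that owns its source can support the new optional operation directly by rewinding an index. A hypothetical subclass, not part of this patch:

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;

    // hypothetical example: a replayable stream over a fixed Token array
    public class ArrayTokenStream extends TokenStream {
      private final Token[] tokens;
      private int index = 0;

      public ArrayTokenStream(Token[] tokens) {
        this.tokens = tokens;
      }

      public Token next() {
        return index < tokens.length ? tokens[index++] : null;
      }

      public void reset() {  // the optional operation, trivially supported here
        index = 0;
      }
    }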
diff --git a/src/java/org/apache/lucene/document/Field.java b/src/java/org/apache/lucene/document/Field.java
index 840d66e8b79..5a1b4c3e581 100644
--- a/src/java/org/apache/lucene/document/Field.java
+++ b/src/java/org/apache/lucene/document/Field.java
@@ -17,6 +17,7 @@ package org.apache.lucene.document;
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.index.IndexWriter;   // for javadoc
 import org.apache.lucene.util.Parameter;
 
@@ -134,21 +135,26 @@ public final class Field extends AbstractField implements Fieldable, Serializable {
   }
 
-  /** The value of the field as a String, or null.  If null, the Reader value
-   * or binary value is used.  Exactly one of stringValue(), readerValue(), and
-   * binaryValue() must be set. */
+  /** The value of the field as a String, or null.  If null, the Reader value,
+   * binary value, or TokenStream value is used.  Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
   public String stringValue()   { return fieldsData instanceof String ? (String)fieldsData : null; }
 
-  /** The value of the field as a Reader, or null.  If null, the String value
-   * or binary value is used.  Exactly one of stringValue(), readerValue(),
-   * and binaryValue() must be set. */
+  /** The value of the field as a Reader, or null.  If null, the String value,
+   * binary value, or TokenStream value is used.  Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
   public Reader readerValue()   { return fieldsData instanceof Reader ? (Reader)fieldsData : null; }
 
-  /** The value of the field in Binary, or null.  If null, the Reader or
-   * String value is used.  Exactly one of stringValue(), readerValue() and
-   * binaryValue() must be set. */
+  /** The value of the field in Binary, or null.  If null, the Reader value,
+   * String value, or TokenStream value is used.  Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
   public byte[] binaryValue()   { return fieldsData instanceof byte[] ? (byte[])fieldsData : null; }
 
+  /** The value of the field as a TokenStream, or null.  If null, the Reader value,
+   * String value, or binary value is used.  Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  public TokenStream tokenStreamValue()   { return fieldsData instanceof TokenStream ? (TokenStream)fieldsData : null; }
+
   /**
    * Create a field by specifying its name, value and how it will
    * be saved in the index. Term vectors will not be stored in the index.
@@ -280,6 +286,54 @@ public final class Field extends AbstractField implements Fieldable, Serializable {
 
     setStoreTermVector(termVector);
   }
+
+  /**
+   * Create a tokenized and indexed field that is not stored. Term vectors will
+   * not be stored. This is useful for pre-analyzed fields.
+   * The TokenStream is read only when the Document is added to the index,
+   * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
+   * has been called.
+   *
+   * @param name The name of the field
+   * @param tokenStream The TokenStream with the content
+   * @throws NullPointerException if name or tokenStream is null
+   */
+  public Field(String name, TokenStream tokenStream) {
+    this(name, tokenStream, TermVector.NO);
+  }
+
+  /**
+   * Create a tokenized and indexed field that is not stored, optionally with
+   * storing term vectors. This is useful for pre-analyzed fields.
+   * The TokenStream is read only when the Document is added to the index,
+   * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
+   * has been called.
+   *
+   * @param name The name of the field
+   * @param tokenStream The TokenStream with the content
+   * @param termVector Whether term vectors should be stored
+   * @throws NullPointerException if name or tokenStream is null
+   */
+  public Field(String name, TokenStream tokenStream, TermVector termVector) {
+    if (name == null)
+      throw new NullPointerException("name cannot be null");
+    if (tokenStream == null)
+      throw new NullPointerException("tokenStream cannot be null");
+
+    this.name = name.intern();        // field names are interned
+    this.fieldsData = tokenStream;
+
+    this.isStored = false;
+    this.isCompressed = false;
+
+    this.isIndexed = true;
+    this.isTokenized = true;
+
+    this.isBinary = false;
+
+    setStoreTermVector(termVector);
+  }
 
   /**
    * Create a stored field with binary value. Optionally the value may be compressed.
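The two-argument constructor defaults to TermVector.NO; term vectors for a pre-analyzed field are requested through the three-argument form. A hypothetical factory method showing that variant (the class, method, and field names are illustrative):

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.document.Field;

    public class TermVectorFieldExample {
      // hypothetical factory: a pre-analyzed field that also stores term vectors
      public static Field bodyField(String text) {
        TokenStream stream = new WhitespaceTokenizer(new StringReader(text));
        return new Field("body", stream, Field.TermVector.WITH_POSITIONS_OFFSETS);
      }
    }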
diff --git a/src/java/org/apache/lucene/document/Fieldable.java b/src/java/org/apache/lucene/document/Fieldable.java
index 58494382b26..dba02262b69 100755
--- a/src/java/org/apache/lucene/document/Fieldable.java
+++ b/src/java/org/apache/lucene/document/Fieldable.java
@@ -19,6 +19,8 @@ package org.apache.lucene.document;
 import java.io.Reader;
 import java.io.Serializable;
 
+import org.apache.lucene.analysis.TokenStream;
+
 /**
  * Synonymous with {@link Field}.
  *
@@ -60,20 +62,25 @@ public interface Fieldable extends Serializable {
    */
   String name();
 
-  /** The value of the field as a String, or null.  If null, the Reader value
-   * or binary value is used.  Exactly one of stringValue(), readerValue(), and
-   * binaryValue() must be set. */
-  String stringValue();
-
-  /** The value of the field as a Reader, or null.  If null, the String value
-   * or binary value is used.  Exactly one of stringValue(), readerValue(),
-   * and binaryValue() must be set. */
-  Reader readerValue();
-
-  /** The value of the field in Binary, or null.  If null, the Reader or
-   * String value is used.  Exactly one of stringValue(), readerValue() and
-   * binaryValue() must be set. */
-  byte[] binaryValue();
+  /** The value of the field as a String, or null.  If null, the Reader value,
+   * binary value, or TokenStream value is used.  Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  public String stringValue();
+
+  /** The value of the field as a Reader, or null.  If null, the String value,
+   * binary value, or TokenStream value is used.  Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  public Reader readerValue();
+
+  /** The value of the field in Binary, or null.  If null, the Reader value,
+   * String value, or TokenStream value is used.  Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  public byte[] binaryValue();
+
+  /** The value of the field as a TokenStream, or null.  If null, the Reader value,
+   * String value, or binary value is used.  Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  public TokenStream tokenStreamValue();
 
   /** True iff the value of the field is to be stored in the index for return
     with search hits.  It is an error for this to be true if a field is
diff --git a/src/java/org/apache/lucene/index/DocumentWriter.java b/src/java/org/apache/lucene/index/DocumentWriter.java
index 6d482dc5c7c..45ed02f7170 100644
--- a/src/java/org/apache/lucene/index/DocumentWriter.java
+++ b/src/java/org/apache/lucene/index/DocumentWriter.java
@@ -162,18 +162,28 @@ final class DocumentWriter {
           offset += stringValue.length();
           length++;
         } else 
-        {
-          Reader reader;			  // find or make Reader
-          if (field.readerValue() != null)
-            reader = field.readerValue();
-          else if (field.stringValue() != null)
-            reader = new StringReader(field.stringValue());
-          else
-            throw new IllegalArgumentException
-              ("field must have either String or Reader value");
-
-          // Tokenize field and add to postingTable
-          TokenStream stream = analyzer.tokenStream(fieldName, reader);
+        { // tokenized field
+          TokenStream stream = field.tokenStreamValue();
+
+          // the field does not have a TokenStream,
+          // so we have to obtain one from the analyzer
+          if (stream == null) {
+            Reader reader;			  // find or make Reader
+            if (field.readerValue() != null)
+              reader = field.readerValue();
+            else if (field.stringValue() != null)
+              reader = new StringReader(field.stringValue());
+            else
+              throw new IllegalArgumentException
+                ("field must have either String or Reader value");
+
+            // Tokenize field and add to postingTable
+            stream = analyzer.tokenStream(fieldName, reader);
+          }
+
+          // reset the TokenStream to the first token
+          stream.reset();
+
           try {
             Token lastToken = null;
             for (Token t = stream.next(); t != null; t = stream.next()) {
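The new dispatch order in DocumentWriter is the core behavioral change: an attached TokenStream wins, otherwise a stream is built from the Reader or String value, and the chosen stream is reset before consumption. A standalone sketch of that decision (the helper class and method names are hypothetical):

    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.document.Fieldable;

    // hypothetical helper mirroring the fallback order DocumentWriter now uses
    final class FieldStreams {
      static TokenStream streamFor(Fieldable field, Analyzer analyzer) {
        TokenStream stream = field.tokenStreamValue();
        if (stream != null)
          return stream;  // a pre-analyzed field takes precedence
        if (field.readerValue() != null)
          return analyzer.tokenStream(field.name(), field.readerValue());
        if (field.stringValue() != null)
          return analyzer.tokenStream(field.name(),
                                      new StringReader(field.stringValue()));
        throw new IllegalArgumentException(
            "field must have either TokenStream, String or Reader value");
      }
    }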
diff --git a/src/java/org/apache/lucene/index/FieldsReader.java b/src/java/org/apache/lucene/index/FieldsReader.java
index 6a56883c38a..6cd9667b669 100644
--- a/src/java/org/apache/lucene/index/FieldsReader.java
+++ b/src/java/org/apache/lucene/index/FieldsReader.java
@@ -17,6 +17,7 @@ package org.apache.lucene.index;
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.*;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
@@ -331,11 +332,9 @@ final class FieldsReader {
       return localFieldsStream;
     }
 
-    /**
-     * The value of the field in Binary, or null.  If null, the Reader or
-     * String value is used.  Exactly one of stringValue(), readerValue() and
-     * binaryValue() must be set.
-     */
+    /** The value of the field in Binary, or null.  If null, the Reader value,
+     * String value, or TokenStream value is used.  Exactly one of stringValue(),
+     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
     public byte[] binaryValue() {
       ensureOpen();
       if (fieldsData == null) {
@@ -358,21 +357,26 @@ final class FieldsReader {
       return fieldsData instanceof byte[] ? (byte[]) fieldsData : null;
     }
 
-    /**
-     * The value of the field as a Reader, or null.  If null, the String value
-     * or binary value is used.  Exactly one of stringValue(), readerValue(),
-     * and binaryValue() must be set.
-     */
+    /** The value of the field as a Reader, or null.  If null, the String value,
+     * binary value, or TokenStream value is used.  Exactly one of stringValue(),
+     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
     public Reader readerValue() {
       ensureOpen();
       return fieldsData instanceof Reader ? (Reader) fieldsData : null;
     }
 
-    /**
-     * The value of the field as a String, or null.  If null, the Reader value
-     * or binary value is used.  Exactly one of stringValue(), readerValue(), and
-     * binaryValue() must be set.
-     */
+    /** The value of the field as a TokenStream, or null.  If null, the Reader value,
+     * String value, or binary value is used.  Exactly one of stringValue(),
+     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+    public TokenStream tokenStreamValue() {
+      ensureOpen();
+      return fieldsData instanceof TokenStream ? (TokenStream) fieldsData : null;
+    }
+
+
+    /** The value of the field as a String, or null.  If null, the Reader value,
+     * binary value, or TokenStream value is used.  Exactly one of stringValue(),
+     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
     public String stringValue() {
       ensureOpen();
       if (fieldsData == null) {
@@ -462,6 +466,11 @@ final class FieldsReader {
     public byte[] binaryValue() {
       return (byte[]) this.fieldsData;
     }
+
+    public TokenStream tokenStreamValue() {
+      // not needed for merge
+      return null;
+    }
 
     public FieldForMerge(Object value, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) {
       this.isStored = true;
diff --git a/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java b/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java
new file mode 100644
index 00000000000..94a17313343
--- /dev/null
+++ b/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java
@@ -0,0 +1,103 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.TermVector;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermPositions;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+
+public class TestCachingTokenFilter extends TestCase {
+  private String[] tokens = new String[] {"term1", "term2", "term3", "term2"};
+
+  public void testCaching() throws IOException {
+    Directory dir = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer());
+    Document doc = new Document();
+    TokenStream stream = new TokenStream() {
+      private int index = 0;
+
+      public Token next() throws IOException {
+        if (index == tokens.length) {
+          return null;
+        } else {
+          return new Token(tokens[index++], 0, 0);
+        }
+      }
+
+    };
+
+    stream = new CachingTokenFilter(stream);
+
+    doc.add(new Field("preanalyzed", stream, TermVector.NO));
+
+    // 1) we consume all tokens twice before we add the doc to the index
+    checkTokens(stream);
+    stream.reset();
+    checkTokens(stream);
+
+    // 2) now add the document to the index and verify that all tokens are indexed
+    //    don't reset the stream here, the DocumentWriter should do that implicitly
+    writer.addDocument(doc);
+    writer.close();
+
+    IndexReader reader = IndexReader.open(dir);
+    TermPositions termPositions = reader.termPositions(new Term("preanalyzed", "term1"));
+    assertTrue(termPositions.next());
+    assertEquals(1, termPositions.freq());
+    assertEquals(0, termPositions.nextPosition());
+
+    termPositions.seek(new Term("preanalyzed", "term2"));
+    assertTrue(termPositions.next());
+    assertEquals(2, termPositions.freq());
+    assertEquals(1, termPositions.nextPosition());
+    assertEquals(3, termPositions.nextPosition());
+
+    termPositions.seek(new Term("preanalyzed", "term3"));
+    assertTrue(termPositions.next());
+    assertEquals(1, termPositions.freq());
+    assertEquals(2, termPositions.nextPosition());
+    reader.close();
+
+    // 3) reset the stream and consume the tokens again
+    stream.reset();
+    checkTokens(stream);
+  }
+
+  private void checkTokens(TokenStream stream) throws IOException {
+    int count = 0;
+    Token token;
+    while ((token = stream.next()) != null) {
+      assertTrue(count < tokens.length);
+      assertEquals(tokens[count], token.termText);
+      count++;
+    }
+
+    assertEquals(tokens.length, count);
+  }
+}
diff --git a/src/test/org/apache/lucene/index/TestDocumentWriter.java b/src/test/org/apache/lucene/index/TestDocumentWriter.java
index 9279c7ff039..df3560dfa46 100644
--- a/src/test/org/apache/lucene/index/TestDocumentWriter.java
+++ b/src/test/org/apache/lucene/index/TestDocumentWriter.java
@@ -19,10 +19,13 @@ package org.apache.lucene.index;
 
 import junit.framework.TestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.document.*;
+import org.apache.lucene.document.Field.TermVector;
 import org.apache.lucene.search.Similarity;
 import org.apache.lucene.store.RAMDirectory;
 
@@ -124,4 +127,45 @@ public class TestDocumentWriter extends TestCase {
     assertEquals(0, termPositions.nextPosition());
     assertEquals(502, termPositions.nextPosition());
   }
+
+  public void testPreAnalyzedField() throws IOException {
+    Similarity similarity = Similarity.getDefault();
+    DocumentWriter writer = new DocumentWriter(dir, new SimpleAnalyzer(), similarity, 50);
+    Document doc = new Document();
+
+    doc.add(new Field("preanalyzed", new TokenStream() {
+      private String[] tokens = new String[] {"term1", "term2", "term3", "term2"};
+      private int index = 0;
+
+      public Token next() throws IOException {
+        if (index == tokens.length) {
+          return null;
+        } else {
+          return new Token(tokens[index++], 0, 0);
+        }
+      }
+
+    }, TermVector.NO));
+
+    String segName = "test";
+    writer.addDocument(segName, doc);
+    SegmentReader reader = SegmentReader.get(new SegmentInfo(segName, 1, dir));
+
+    TermPositions termPositions = reader.termPositions(new Term("preanalyzed", "term1"));
+    assertTrue(termPositions.next());
+    assertEquals(1, termPositions.freq());
+    assertEquals(0, termPositions.nextPosition());
+
+    termPositions.seek(new Term("preanalyzed", "term2"));
+    assertTrue(termPositions.next());
+    assertEquals(2, termPositions.freq());
+    assertEquals(1, termPositions.nextPosition());
+    assertEquals(3, termPositions.nextPosition());
+
+    termPositions.seek(new Term("preanalyzed", "term3"));
+    assertTrue(termPositions.next());
+    assertEquals(1, termPositions.freq());
+    assertEquals(2, termPositions.nextPosition());
+
+  }
 }