LUCENE-1699: make Field.tokenStream usable with other stored field mechanisms

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@787437 13f79535-47bb-0310-9956-ffa450edef68
2009-06-22 23:06:46 +00:00 · 2009-06-22 23:06:46 +00:00 · 8171a06632
parent ec8088654d
commit 8171a06632
5 changed files with 128 additions and 50 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -425,6 +425,11 @@ Bug fixes
 28. LUCENE-1405: Added support for Ant resource collections in contrib/ant
    <index> task.  (Przemyslaw Sztoch via Erik Hatcher)

+29. LUCENE-1699: Allow setting a TokenStream on Field/Fieldable for indexing
+    in conjunction with any other ways to specify stored field values,
+    currently binary or string values.  (yonik)
+    
+    
 Optimizations

 1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing
--- a/src/java/org/apache/lucene/document/AbstractField.java
+++ b/src/java/org/apache/lucene/document/AbstractField.java
@ -16,7 +16,8 @@ package org.apache.lucene.document;
 */

 import org.apache.lucene.search.PhraseQuery; // for javadocs
-import org.apache.lucene.search.spans.SpanQuery; // for javadocs
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.analysis.TokenStream; // for javadocs


 /**
@ -38,9 +39,11 @@ public abstract class AbstractField implements Fieldable {
  protected boolean lazy = false;
  protected boolean omitTermFreqAndPositions = false;
  protected float boost = 1.0f;
-  // the one and only data object for all different kind of field values
+  // the data object for all different kind of field values
  protected Object fieldsData = null;
-  //length/offset for all primitive types
+  // pre-analyzed tokenStream for indexed fields
+  protected TokenStream tokenStream;
+  // length/offset for all primitive types
  protected int binaryLength;
  protected int binaryOffset;

--- a/src/java/org/apache/lucene/document/Field.java
+++ b/src/java/org/apache/lucene/document/Field.java
@ -94,7 +94,7 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
    /** Expert: Index the field's value without an Analyzer,
     * and also disable the storing of norms.  Note that you
     * can also separately enable/disable norms by calling
-     * {@link #setOmitNorms}.  No norms means that
+     * {@link Field#setOmitNorms}.  No norms means that
     * index-time field and document boosting and field
     * length normalization are disabled.  The benefit is
     * less memory usage as norms take up one byte of RAM
@ -159,19 +159,19 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
  }
  
  
-  /** The value of the field as a String, or null.  If null, the Reader value,
-   * binary value, or TokenStream value is used.  Exactly one of stringValue(), 
-   * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
+  /** The value of the field as a String, or null.  If null, the Reader value or
+   * binary value is used.  Exactly one of stringValue(),
+   * readerValue(), and getBinaryValue() must be set. */
  public String stringValue()   { return fieldsData instanceof String ? (String)fieldsData : null; }
  
-  /** The value of the field as a Reader, or null.  If null, the String value,
-   * binary value, or TokenStream value is used.  Exactly one of stringValue(), 
-   * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
+  /** The value of the field as a Reader, or null.  If null, the String value or
+   * binary value is used.  Exactly one of stringValue(),
+   * readerValue(), and getBinaryValue() must be set. */
  public Reader readerValue()   { return fieldsData instanceof Reader ? (Reader)fieldsData : null; }
  
  /** The value of the field in Binary, or null.  If null, the Reader value,
-   * String value, or TokenStream value is used. Exactly one of stringValue(), 
-   * readerValue(), getBinaryValue(), and tokenStreamValue() must be set.
+   * or String value is used. Exactly one of stringValue(),
+   * readerValue(), and getBinaryValue() must be set.
   * @deprecated This method must allocate a new byte[] if
   * the {@link AbstractField#getBinaryOffset()} is non-zero
   * or {@link AbstractField#getBinaryLength()} is not the
@ -191,10 +191,9 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
    return ret;    
  }
  
-  /** The value of the field as a TokesStream, or null.  If null, the Reader value,
-   * String value, or binary value is used. Exactly one of stringValue(), 
-   * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
-  public TokenStream tokenStreamValue()   { return fieldsData instanceof TokenStream ? (TokenStream)fieldsData : null; }
+  /** The TokesStream for this field to be used when indexing, or null.  If null, the Reader value
+   * or String value is analyzed to produce the indexed tokens. */
+  public TokenStream tokenStreamValue()   { return tokenStream; }
  

  /** <p>Expert: change the value of this field.  This can
@ -204,10 +203,7 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
   *  a single {@link Document} instance is re-used as
   *  well.  This helps most on small documents.</p>
   * 
-   *  <p>Note that you should only use this method after the
-   *  Field has been consumed (ie, the {@link Document}
-   *  containing this Field has been added to the index).
-   *  Also, each Field instance should only be used once
+   *  <p>Each Field instance should only be used once
   *  within a single {@link Document} instance.  See <a
   *  href="http://wiki.apache.org/lucene-java/ImproveIndexingSpeed">ImproveIndexingSpeed</a>
   *  for details.</p> */
@ -250,7 +246,8 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
  }
  
  
-  /** Expert: change the value of this field.  See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
+  /** Expert: change the value of this field.  See <a href="#setValue(java.lang.String)">setValue(String)</a>.
+   * @deprecated use {@link #setTokenStream} */
  public void setValue(TokenStream value) {
    if (isBinary) {
      throw new IllegalArgumentException("cannot set a TokenStream value on a binary field");
@ -258,7 +255,16 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
    if (isStored) {
      throw new IllegalArgumentException("cannot set a TokenStream value on a stored field");
    }
-    fieldsData = value;
+    fieldsData = null;
+    tokenStream = value;
+  }
+
+  /** Expert: sets the token stream to be used for indexing and causes isIndexed() and isTokenized() to return true.
+   *  May be combined with stored values from stringValue() or binaryValue() */
+  public void setTokenStream(TokenStream tokenStream) {
+    this.isIndexed = true;
+    this.isTokenized = true;
+    this.tokenStream = tokenStream;
  }

  /**
@ -459,7 +465,8 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
      throw new NullPointerException("tokenStream cannot be null");
    
    this.name = name.intern();        // field names are interned
-    this.fieldsData = tokenStream;
+    this.fieldsData = null;
+    this.tokenStream = tokenStream;

    this.isStored = false;
    this.isCompressed = false;
--- a/src/java/org/apache/lucene/document/Fieldable.java
+++ b/src/java/org/apache/lucene/document/Fieldable.java
@ -74,36 +74,41 @@ public interface Fieldable extends Serializable {
   */
  String name();

-  /** The value of the field as a String, or null.  If null, the Reader value,
-   * binary value, or TokenStream value is used.  Exactly one of stringValue(), 
-   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  /** The value of the field as a String, or null.
+   * <p>
+   * For indexing, if isStored()==true, the stringValue() will be used as the stored field value
+   * unless isBinary()==true, in which case binaryValue() will be used.
+   *
+   * If isIndexed()==true and isTokenized()==false, this String value will be indexed as a single token.
+   * If isIndexed()==true and isTokenized()==true, then tokenStreamValue() will be used to generate indexed tokens if not null,
+   * else readerValue() will be used to generate indexed tokens if not null, else stringValue() will be used to generate tokens.
+   */
  public String stringValue();
  
-  /** The value of the field as a Reader, or null.  If null, the String value,
-   * binary value, or TokenStream value is used.  Exactly one of stringValue(), 
-   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  /** The value of the field as a Reader, which can be used at index time to generate indexed tokens.
+   * @see #stringValue()
+   */
  public Reader readerValue();
  
-  /** The value of the field in Binary, or null.  If null, the Reader value,
-   * String value, or TokenStream value is used. Exactly one of stringValue(), 
-   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  /** The value of the field in Binary, or null.
+   * @see #stringValue()
+   */
  public byte[] binaryValue();
  
-  /** The value of the field as a TokenStream, or null.  If null, the Reader value,
-   * String value, or binary value is used. Exactly one of stringValue(), 
-   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  /** The TokenStream for this field to be used when indexing, or null.
+   * @see #stringValue()
+   */
  public TokenStream tokenStreamValue();

-  /** True iff the value of the field is to be stored in the index for return
-    with search hits.  It is an error for this to be true if a field is
-    Reader-valued. */
+  /** True if the value of the field is to be stored in the index for return
+    with search hits. */
  boolean  isStored();

-  /** True iff the value of the field is to be indexed, so that it may be
+  /** True if the value of the field is to be indexed, so that it may be
    searched on. */
  boolean  isIndexed();

-  /** True iff the value of the field should be tokenized as text prior to
+  /** True if the value of the field should be tokenized as text prior to
    indexing.  Un-tokenized fields are indexed as a single word and may not be
    Reader-valued. */
  boolean  isTokenized();
@ -111,7 +116,7 @@ public interface Fieldable extends Serializable {
  /** True if the value of the field is stored and compressed within the index */
  boolean  isCompressed();

-  /** True iff the term or terms used to index this field are stored as a term
+  /** True if the term or terms used to index this field are stored as a term
   *  vector, available from {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}.
   *  These methods do not provide access to the original content of the field,
   *  only to terms used to index it. If the original content must be
@ -122,17 +127,17 @@ public interface Fieldable extends Serializable {
  boolean isTermVectorStored();

  /**
-   * True iff terms are stored as term vector together with their offsets 
+   * True if terms are stored as term vector together with their offsets 
   * (start and end positon in source text).
   */
  boolean isStoreOffsetWithTermVector();

  /**
-   * True iff terms are stored as term vector together with their token positions.
+   * True if terms are stored as term vector together with their token positions.
   */
  boolean isStorePositionWithTermVector();

-  /** True iff the value of the filed is stored as binary */
+  /** True if the value of the field is stored as binary */
  boolean  isBinary();

  /** True if norms are omitted for this indexed field */
--- a/src/test/org/apache/lucene/index/TestIndexWriter.java
+++ b/src/test/org/apache/lucene/index/TestIndexWriter.java
@ -17,11 +17,7 @@ package org.apache.lucene.index;
 * limitations under the License.
 */

-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.IOException;
-import java.io.PrintStream;
-import java.io.Reader;
+import java.io.*;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
@ -4350,4 +4346,66 @@ public class TestIndexWriter extends LuceneTestCase
    t.join();
    assertFalse(t.failed);
  }
+
+
+  public void testIndexStoreCombos() throws Exception {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
+    byte[] b = new byte[50];
+    for(int i=0;i<50;i++)
+      b[i] = (byte) (i+77);
+
+    Document doc = new Document();
+    Field f = new Field("binary", b, 10, 17, Field.Store.YES);
+    f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc1field1")));
+    Field f2 = new Field("string", "value", Field.Store.YES,Field.Index.ANALYZED);
+    f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc1field2")));
+    doc.add(f);
+    doc.add(f2);
+    w.addDocument(doc);
+    
+    // add 2 docs to test in-memory merging
+    f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc2field1")));
+    f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc2field2")));
+    w.addDocument(doc);
+  
+    // force segment flush so we can force a segment merge with doc3 later.
+    w.commit();
+
+    f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc3field1")));
+    f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc3field2")));
+
+    w.addDocument(doc);
+    w.commit();
+    w.optimize();   // force segment merge.
+
+    IndexReader ir = IndexReader.open(dir);
+    doc = ir.document(0);
+    f = doc.getField("binary");
+    b = f.getBinaryValue();
+    assertTrue(b != null);
+    assertEquals(17, b.length, 17);
+    assertEquals(87, b[0]);
+
+    assertTrue(ir.document(0).getFieldable("binary").isBinary());
+    assertTrue(ir.document(1).getFieldable("binary").isBinary());
+    assertTrue(ir.document(2).getFieldable("binary").isBinary());
+    
+    assertEquals("value", ir.document(0).get("string"));
+    assertEquals("value", ir.document(1).get("string"));
+    assertEquals("value", ir.document(2).get("string"));
+
+
+    // test that the terms were indexed.
+    assertTrue(ir.termDocs(new Term("binary","doc1field1")).next());
+    assertTrue(ir.termDocs(new Term("binary","doc2field1")).next());
+    assertTrue(ir.termDocs(new Term("binary","doc3field1")).next());
+    assertTrue(ir.termDocs(new Term("string","doc1field2")).next());
+    assertTrue(ir.termDocs(new Term("string","doc2field2")).next());
+    assertTrue(ir.termDocs(new Term("string","doc3field2")).next());
+
+    ir.close();
+    dir.close();
+
+  }
 }