From 8171a066324e68aee44a3b427d59587ba490575e Mon Sep 17 00:00:00 2001 From: Yonik Seeley Date: Mon, 22 Jun 2009 23:06:46 +0000 Subject: [PATCH] LUCENE-1699: make Field.tokenStream usable with other stored field mechanisms git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@787437 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 5 ++ .../apache/lucene/document/AbstractField.java | 9 ++- .../org/apache/lucene/document/Field.java | 49 +++++++------ .../org/apache/lucene/document/Fieldable.java | 47 +++++++------ .../apache/lucene/index/TestIndexWriter.java | 68 +++++++++++++++++-- 5 files changed, 128 insertions(+), 50 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 196fa0fa840..a7c5d2c87c5 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -424,6 +424,11 @@ Bug fixes 28. LUCENE-1405: Added support for Ant resource collections in contrib/ant task. (Przemyslaw Sztoch via Erik Hatcher) + +29. LUCENE-1699: Allow setting a TokenStream on Field/Fieldable for indexing + in conjunction with any other ways to specify stored field values, + currently binary or string values. (yonik) + Optimizations diff --git a/src/java/org/apache/lucene/document/AbstractField.java b/src/java/org/apache/lucene/document/AbstractField.java index fc730457b7f..c92be7cb4fb 100755 --- a/src/java/org/apache/lucene/document/AbstractField.java +++ b/src/java/org/apache/lucene/document/AbstractField.java @@ -16,7 +16,8 @@ package org.apache.lucene.document; */ import org.apache.lucene.search.PhraseQuery; // for javadocs -import org.apache.lucene.search.spans.SpanQuery; // for javadocs +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.analysis.TokenStream; // for javadocs /** @@ -38,9 +39,11 @@ public abstract class AbstractField implements Fieldable { protected boolean lazy = false; protected boolean omitTermFreqAndPositions = false; protected float boost = 1.0f; - // the one and only data object for all different kind of field values + // the data object for all different kind of field values protected Object fieldsData = null; - //length/offset for all primitive types + // pre-analyzed tokenStream for indexed fields + protected TokenStream tokenStream; + // length/offset for all primitive types protected int binaryLength; protected int binaryOffset; diff --git a/src/java/org/apache/lucene/document/Field.java b/src/java/org/apache/lucene/document/Field.java index 031083ca48d..487b9a17503 100644 --- a/src/java/org/apache/lucene/document/Field.java +++ b/src/java/org/apache/lucene/document/Field.java @@ -94,7 +94,7 @@ public final class Field extends AbstractField implements Fieldable, Serializabl /** Expert: Index the field's value without an Analyzer, * and also disable the storing of norms. Note that you * can also separately enable/disable norms by calling - * {@link #setOmitNorms}. No norms means that + * {@link Field#setOmitNorms}. No norms means that * index-time field and document boosting and field * length normalization are disabled. The benefit is * less memory usage as norms take up one byte of RAM @@ -159,19 +159,19 @@ public final class Field extends AbstractField implements Fieldable, Serializabl } - /** The value of the field as a String, or null. If null, the Reader value, - * binary value, or TokenStream value is used. Exactly one of stringValue(), - * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */ + /** The value of the field as a String, or null. If null, the Reader value or + * binary value is used. Exactly one of stringValue(), + * readerValue(), and getBinaryValue() must be set. */ public String stringValue() { return fieldsData instanceof String ? (String)fieldsData : null; } - /** The value of the field as a Reader, or null. If null, the String value, - * binary value, or TokenStream value is used. Exactly one of stringValue(), - * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */ + /** The value of the field as a Reader, or null. If null, the String value or + * binary value is used. Exactly one of stringValue(), + * readerValue(), and getBinaryValue() must be set. */ public Reader readerValue() { return fieldsData instanceof Reader ? (Reader)fieldsData : null; } /** The value of the field in Binary, or null. If null, the Reader value, - * String value, or TokenStream value is used. Exactly one of stringValue(), - * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. + * or String value is used. Exactly one of stringValue(), + * readerValue(), and getBinaryValue() must be set. * @deprecated This method must allocate a new byte[] if * the {@link AbstractField#getBinaryOffset()} is non-zero * or {@link AbstractField#getBinaryLength()} is not the @@ -191,10 +191,9 @@ public final class Field extends AbstractField implements Fieldable, Serializabl return ret; } - /** The value of the field as a TokesStream, or null. If null, the Reader value, - * String value, or binary value is used. Exactly one of stringValue(), - * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */ - public TokenStream tokenStreamValue() { return fieldsData instanceof TokenStream ? (TokenStream)fieldsData : null; } + /** The TokesStream for this field to be used when indexing, or null. If null, the Reader value + * or String value is analyzed to produce the indexed tokens. */ + public TokenStream tokenStreamValue() { return tokenStream; } /**

Expert: change the value of this field. This can @@ -204,10 +203,7 @@ public final class Field extends AbstractField implements Fieldable, Serializabl * a single {@link Document} instance is re-used as * well. This helps most on small documents.

* - *

Note that you should only use this method after the - * Field has been consumed (ie, the {@link Document} - * containing this Field has been added to the index). - * Also, each Field instance should only be used once + *

Each Field instance should only be used once * within a single {@link Document} instance. See ImproveIndexingSpeed * for details.

*/ @@ -250,7 +246,8 @@ public final class Field extends AbstractField implements Fieldable, Serializabl } - /** Expert: change the value of this field. See setValue(String). */ + /** Expert: change the value of this field. See setValue(String). + * @deprecated use {@link #setTokenStream} */ public void setValue(TokenStream value) { if (isBinary) { throw new IllegalArgumentException("cannot set a TokenStream value on a binary field"); @@ -258,7 +255,16 @@ public final class Field extends AbstractField implements Fieldable, Serializabl if (isStored) { throw new IllegalArgumentException("cannot set a TokenStream value on a stored field"); } - fieldsData = value; + fieldsData = null; + tokenStream = value; + } + + /** Expert: sets the token stream to be used for indexing and causes isIndexed() and isTokenized() to return true. + * May be combined with stored values from stringValue() or binaryValue() */ + public void setTokenStream(TokenStream tokenStream) { + this.isIndexed = true; + this.isTokenized = true; + this.tokenStream = tokenStream; } /** @@ -459,8 +465,9 @@ public final class Field extends AbstractField implements Fieldable, Serializabl throw new NullPointerException("tokenStream cannot be null"); this.name = name.intern(); // field names are interned - this.fieldsData = tokenStream; - + this.fieldsData = null; + this.tokenStream = tokenStream; + this.isStored = false; this.isCompressed = false; diff --git a/src/java/org/apache/lucene/document/Fieldable.java b/src/java/org/apache/lucene/document/Fieldable.java index 09032dd7bff..11cf31b2758 100755 --- a/src/java/org/apache/lucene/document/Fieldable.java +++ b/src/java/org/apache/lucene/document/Fieldable.java @@ -74,36 +74,41 @@ public interface Fieldable extends Serializable { */ String name(); - /** The value of the field as a String, or null. If null, the Reader value, - * binary value, or TokenStream value is used. Exactly one of stringValue(), - * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ + /** The value of the field as a String, or null. + *

+ * For indexing, if isStored()==true, the stringValue() will be used as the stored field value + * unless isBinary()==true, in which case binaryValue() will be used. + * + * If isIndexed()==true and isTokenized()==false, this String value will be indexed as a single token. + * If isIndexed()==true and isTokenized()==true, then tokenStreamValue() will be used to generate indexed tokens if not null, + * else readerValue() will be used to generate indexed tokens if not null, else stringValue() will be used to generate tokens. + */ public String stringValue(); - /** The value of the field as a Reader, or null. If null, the String value, - * binary value, or TokenStream value is used. Exactly one of stringValue(), - * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ + /** The value of the field as a Reader, which can be used at index time to generate indexed tokens. + * @see #stringValue() + */ public Reader readerValue(); - /** The value of the field in Binary, or null. If null, the Reader value, - * String value, or TokenStream value is used. Exactly one of stringValue(), - * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ + /** The value of the field in Binary, or null. + * @see #stringValue() + */ public byte[] binaryValue(); - /** The value of the field as a TokenStream, or null. If null, the Reader value, - * String value, or binary value is used. Exactly one of stringValue(), - * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ + /** The TokenStream for this field to be used when indexing, or null. + * @see #stringValue() + */ public TokenStream tokenStreamValue(); - /** True iff the value of the field is to be stored in the index for return - with search hits. It is an error for this to be true if a field is - Reader-valued. */ + /** True if the value of the field is to be stored in the index for return + with search hits. */ boolean isStored(); - /** True iff the value of the field is to be indexed, so that it may be + /** True if the value of the field is to be indexed, so that it may be searched on. */ boolean isIndexed(); - /** True iff the value of the field should be tokenized as text prior to + /** True if the value of the field should be tokenized as text prior to indexing. Un-tokenized fields are indexed as a single word and may not be Reader-valued. */ boolean isTokenized(); @@ -111,7 +116,7 @@ public interface Fieldable extends Serializable { /** True if the value of the field is stored and compressed within the index */ boolean isCompressed(); - /** True iff the term or terms used to index this field are stored as a term + /** True if the term or terms used to index this field are stored as a term * vector, available from {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}. * These methods do not provide access to the original content of the field, * only to terms used to index it. If the original content must be @@ -122,17 +127,17 @@ public interface Fieldable extends Serializable { boolean isTermVectorStored(); /** - * True iff terms are stored as term vector together with their offsets + * True if terms are stored as term vector together with their offsets * (start and end positon in source text). */ boolean isStoreOffsetWithTermVector(); /** - * True iff terms are stored as term vector together with their token positions. + * True if terms are stored as term vector together with their token positions. */ boolean isStorePositionWithTermVector(); - /** True iff the value of the filed is stored as binary */ + /** True if the value of the field is stored as binary */ boolean isBinary(); /** True if norms are omitted for this indexed field */ diff --git a/src/test/org/apache/lucene/index/TestIndexWriter.java b/src/test/org/apache/lucene/index/TestIndexWriter.java index 04a9225b31f..3538fcb6379 100644 --- a/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -17,11 +17,7 @@ package org.apache.lucene.index; * limitations under the License. */ -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.IOException; -import java.io.PrintStream; -import java.io.Reader; +import java.io.*; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -4350,4 +4346,66 @@ public class TestIndexWriter extends LuceneTestCase t.join(); assertFalse(t.failed); } + + + public void testIndexStoreCombos() throws Exception { + MockRAMDirectory dir = new MockRAMDirectory(); + IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED); + byte[] b = new byte[50]; + for(int i=0;i<50;i++) + b[i] = (byte) (i+77); + + Document doc = new Document(); + Field f = new Field("binary", b, 10, 17, Field.Store.YES); + f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc1field1"))); + Field f2 = new Field("string", "value", Field.Store.YES,Field.Index.ANALYZED); + f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc1field2"))); + doc.add(f); + doc.add(f2); + w.addDocument(doc); + + // add 2 docs to test in-memory merging + f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc2field1"))); + f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc2field2"))); + w.addDocument(doc); + + // force segment flush so we can force a segment merge with doc3 later. + w.commit(); + + f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc3field1"))); + f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc3field2"))); + + w.addDocument(doc); + w.commit(); + w.optimize(); // force segment merge. + + IndexReader ir = IndexReader.open(dir); + doc = ir.document(0); + f = doc.getField("binary"); + b = f.getBinaryValue(); + assertTrue(b != null); + assertEquals(17, b.length, 17); + assertEquals(87, b[0]); + + assertTrue(ir.document(0).getFieldable("binary").isBinary()); + assertTrue(ir.document(1).getFieldable("binary").isBinary()); + assertTrue(ir.document(2).getFieldable("binary").isBinary()); + + assertEquals("value", ir.document(0).get("string")); + assertEquals("value", ir.document(1).get("string")); + assertEquals("value", ir.document(2).get("string")); + + + // test that the terms were indexed. + assertTrue(ir.termDocs(new Term("binary","doc1field1")).next()); + assertTrue(ir.termDocs(new Term("binary","doc2field1")).next()); + assertTrue(ir.termDocs(new Term("binary","doc3field1")).next()); + assertTrue(ir.termDocs(new Term("string","doc1field2")).next()); + assertTrue(ir.termDocs(new Term("string","doc2field2")).next()); + assertTrue(ir.termDocs(new Term("string","doc3field2")).next()); + + ir.close(); + dir.close(); + + } }