LUCENE-1699: make Field.tokenStream usable with other stored field mechanisms

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@787437 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2009-06-22 23:06:46 +00:00
parent ec8088654d
commit 8171a06632
5 changed files with 128 additions and 50 deletions

View File

@ -424,6 +424,11 @@ Bug fixes
28. LUCENE-1405: Added support for Ant resource collections in contrib/ant
<index> task. (Przemyslaw Sztoch via Erik Hatcher)
29. LUCENE-1699: Allow setting a TokenStream on Field/Fieldable for indexing
in conjunction with any other ways to specify stored field values,
currently binary or string values. (yonik)
Optimizations

View File

@ -16,7 +16,8 @@ package org.apache.lucene.document;
*/
import org.apache.lucene.search.PhraseQuery; // for javadocs
import org.apache.lucene.search.spans.SpanQuery; // for javadocs
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.analysis.TokenStream; // for javadocs
/**
@ -38,9 +39,11 @@ public abstract class AbstractField implements Fieldable {
protected boolean lazy = false;
protected boolean omitTermFreqAndPositions = false;
protected float boost = 1.0f;
// the one and only data object for all different kind of field values
// the data object for all the different kinds of field values
protected Object fieldsData = null;
//length/offset for all primitive types
// pre-analyzed tokenStream for indexed fields
protected TokenStream tokenStream;
// length/offset for all primitive types
protected int binaryLength;
protected int binaryOffset;

View File

@ -94,7 +94,7 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
/** Expert: Index the field's value without an Analyzer,
* and also disable the storing of norms. Note that you
* can also separately enable/disable norms by calling
* {@link #setOmitNorms}. No norms means that
* {@link Field#setOmitNorms}. No norms means that
* index-time field and document boosting and field
* length normalization are disabled. The benefit is
* less memory usage as norms take up one byte of RAM
@ -159,19 +159,19 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
}
/** The value of the field as a String, or null. If null, the Reader value,
* binary value, or TokenStream value is used. Exactly one of stringValue(),
* readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
/** The value of the field as a String, or null. If null, the Reader value or
* binary value is used. Exactly one of stringValue(),
* readerValue(), and getBinaryValue() must be set. */
public String stringValue() { return fieldsData instanceof String ? (String)fieldsData : null; }
/** The value of the field as a Reader, or null. If null, the String value,
* binary value, or TokenStream value is used. Exactly one of stringValue(),
* readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
/** The value of the field as a Reader, or null. If null, the String value or
* binary value is used. Exactly one of stringValue(),
* readerValue(), and getBinaryValue() must be set. */
public Reader readerValue() { return fieldsData instanceof Reader ? (Reader)fieldsData : null; }
/** The value of the field in Binary, or null. If null, the Reader value,
* String value, or TokenStream value is used. Exactly one of stringValue(),
* readerValue(), getBinaryValue(), and tokenStreamValue() must be set.
* or String value is used. Exactly one of stringValue(),
* readerValue(), and getBinaryValue() must be set.
* @deprecated This method must allocate a new byte[] if
* the {@link AbstractField#getBinaryOffset()} is non-zero
* or {@link AbstractField#getBinaryLength()} is not the
@ -191,10 +191,9 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
return ret;
}
/** The value of the field as a TokesStream, or null. If null, the Reader value,
* String value, or binary value is used. Exactly one of stringValue(),
* readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
public TokenStream tokenStreamValue() { return fieldsData instanceof TokenStream ? (TokenStream)fieldsData : null; }
/** The TokenStream for this field to be used when indexing, or null. If null, the Reader value
* or String value is analyzed to produce the indexed tokens. */
public TokenStream tokenStreamValue() { return tokenStream; }
/** <p>Expert: change the value of this field. This can
@ -204,10 +203,7 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
* a single {@link Document} instance is re-used as
* well. This helps most on small documents.</p>
*
* <p>Note that you should only use this method after the
* Field has been consumed (ie, the {@link Document}
* containing this Field has been added to the index).
* Also, each Field instance should only be used once
* <p>Each Field instance should only be used once
* within a single {@link Document} instance. See <a
* href="http://wiki.apache.org/lucene-java/ImproveIndexingSpeed">ImproveIndexingSpeed</a>
* for details.</p> */
@ -250,7 +246,8 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
}
/** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
/** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>.
* @deprecated use {@link #setTokenStream} */
public void setValue(TokenStream value) {
if (isBinary) {
throw new IllegalArgumentException("cannot set a TokenStream value on a binary field");
@ -258,7 +255,16 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
if (isStored) {
throw new IllegalArgumentException("cannot set a TokenStream value on a stored field");
}
fieldsData = value;
fieldsData = null;
tokenStream = value;
}
/**
 * Expert: supplies the TokenStream to be used when this field is indexed.
 * As a side effect the field is flagged as both indexed and tokenized, so
 * isIndexed() and isTokenized() report true afterwards. The stored value is
 * left alone, so this may be combined with stored values from stringValue()
 * or binaryValue().
 *
 * @param tokenStream the token stream to use for indexing this field
 */
public void setTokenStream(TokenStream tokenStream) {
  this.tokenStream = tokenStream;
  // A field that carries a token stream is by definition indexed and tokenized.
  isTokenized = true;
  isIndexed = true;
}
/**
@ -459,8 +465,9 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
throw new NullPointerException("tokenStream cannot be null");
this.name = name.intern(); // field names are interned
this.fieldsData = tokenStream;
this.fieldsData = null;
this.tokenStream = tokenStream;
this.isStored = false;
this.isCompressed = false;

View File

@ -74,36 +74,41 @@ public interface Fieldable extends Serializable {
*/
String name();
/** The value of the field as a String, or null. If null, the Reader value,
* binary value, or TokenStream value is used. Exactly one of stringValue(),
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
/** The value of the field as a String, or null.
* <p>
* For indexing, if isStored()==true, the stringValue() will be used as the stored field value
* unless isBinary()==true, in which case binaryValue() will be used.
*
* If isIndexed()==true and isTokenized()==false, this String value will be indexed as a single token.
* If isIndexed()==true and isTokenized()==true, then tokenStreamValue() will be used to generate indexed tokens if not null,
* else readerValue() will be used to generate indexed tokens if not null, else stringValue() will be used to generate tokens.
*/
public String stringValue();
/** The value of the field as a Reader, or null. If null, the String value,
* binary value, or TokenStream value is used. Exactly one of stringValue(),
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
/** The value of the field as a Reader, which can be used at index time to generate indexed tokens.
* @see #stringValue()
*/
public Reader readerValue();
/** The value of the field in Binary, or null. If null, the Reader value,
* String value, or TokenStream value is used. Exactly one of stringValue(),
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
/** The value of the field in Binary, or null.
* @see #stringValue()
*/
public byte[] binaryValue();
/** The value of the field as a TokenStream, or null. If null, the Reader value,
* String value, or binary value is used. Exactly one of stringValue(),
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
/** The TokenStream for this field to be used when indexing, or null.
* @see #stringValue()
*/
public TokenStream tokenStreamValue();
/** True iff the value of the field is to be stored in the index for return
with search hits. It is an error for this to be true if a field is
Reader-valued. */
/** True if the value of the field is to be stored in the index for return
with search hits. */
boolean isStored();
/** True iff the value of the field is to be indexed, so that it may be
/** True if the value of the field is to be indexed, so that it may be
searched on. */
boolean isIndexed();
/** True iff the value of the field should be tokenized as text prior to
/** True if the value of the field should be tokenized as text prior to
indexing. Un-tokenized fields are indexed as a single word and may not be
Reader-valued. */
boolean isTokenized();
@ -111,7 +116,7 @@ public interface Fieldable extends Serializable {
/** True if the value of the field is stored and compressed within the index */
boolean isCompressed();
/** True iff the term or terms used to index this field are stored as a term
/** True if the term or terms used to index this field are stored as a term
* vector, available from {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}.
* These methods do not provide access to the original content of the field,
* only to terms used to index it. If the original content must be
@ -122,17 +127,17 @@ public interface Fieldable extends Serializable {
boolean isTermVectorStored();
/**
* True iff terms are stored as term vector together with their offsets
* True if terms are stored as term vector together with their offsets
* (start and end position in source text).
*/
boolean isStoreOffsetWithTermVector();
/**
* True iff terms are stored as term vector together with their token positions.
* True if terms are stored as term vector together with their token positions.
*/
boolean isStorePositionWithTermVector();
/** True iff the value of the filed is stored as binary */
/** True if the value of the field is stored as binary */
boolean isBinary();
/** True if norms are omitted for this indexed field */

View File

@ -17,11 +17,7 @@ package org.apache.lucene.index;
* limitations under the License.
*/
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@ -4350,4 +4346,66 @@ public class TestIndexWriter extends LuceneTestCase
t.join();
assertFalse(t.failed);
}
/**
 * LUCENE-1699: verifies that a TokenStream set via Field.setTokenStream can be
 * combined with a stored binary value and with a stored string value.
 * Three documents are added by re-using the same Field instances with fresh
 * token streams; a commit after the second document and an optimize at the
 * end force the stored fields through a segment merge. The test then checks
 * that the stored values are intact and that the terms produced by the token
 * streams were indexed.
 */
public void testIndexStoreCombos() throws Exception {
  MockRAMDirectory dir = new MockRAMDirectory();
  IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);

  byte[] b = new byte[50];
  for (int i = 0; i < 50; i++) {
    b[i] = (byte) (i + 77);
  }

  Document doc = new Document();
  // Stored binary slice: 17 bytes starting at offset 10 of b.
  Field f = new Field("binary", b, 10, 17, Field.Store.YES);
  f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc1field1")));
  Field f2 = new Field("string", "value", Field.Store.YES, Field.Index.ANALYZED);
  f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc1field2")));
  doc.add(f);
  doc.add(f2);
  w.addDocument(doc);

  // add 2 docs to test in-memory merging
  f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc2field1")));
  f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc2field2")));
  w.addDocument(doc);

  // force segment flush so we can force a segment merge with doc3 later.
  w.commit();

  f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc3field1")));
  f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc3field2")));
  w.addDocument(doc);
  w.commit();
  w.optimize(); // force segment merge.

  IndexReader ir = IndexReader.open(dir);
  doc = ir.document(0);
  f = doc.getField("binary");
  b = f.getBinaryValue();
  assertNotNull(b);
  // Exact length check. The former three-argument call resolved to the
  // floating-point overload with a tolerance of 17, which accepted any
  // length between 0 and 34.
  assertEquals(17, b.length);
  // First stored byte is the original b[10] == (byte) (10 + 77).
  assertEquals(87, b[0]);

  assertTrue(ir.document(0).getFieldable("binary").isBinary());
  assertTrue(ir.document(1).getFieldable("binary").isBinary());
  assertTrue(ir.document(2).getFieldable("binary").isBinary());

  assertEquals("value", ir.document(0).get("string"));
  assertEquals("value", ir.document(1).get("string"));
  assertEquals("value", ir.document(2).get("string"));

  // test that the terms were indexed.
  assertTrue(ir.termDocs(new Term("binary", "doc1field1")).next());
  assertTrue(ir.termDocs(new Term("binary", "doc2field1")).next());
  assertTrue(ir.termDocs(new Term("binary", "doc3field1")).next());
  assertTrue(ir.termDocs(new Term("string", "doc1field2")).next());
  assertTrue(ir.termDocs(new Term("string", "doc2field2")).next());
  assertTrue(ir.termDocs(new Term("string", "doc3field2")).next());

  ir.close();
  dir.close();
}
}