mirror of https://github.com/apache/lucene.git
LUCENE-1699: make Field.tokenStream usable with other stored field mechanisms
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@787437 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ec8088654d
commit
8171a06632
|
@ -424,6 +424,11 @@ Bug fixes
|
|||
|
||||
28. LUCENE-1405: Added support for Ant resource collections in contrib/ant
|
||||
<index> task. (Przemyslaw Sztoch via Erik Hatcher)
|
||||
|
||||
29. LUCENE-1699: Allow setting a TokenStream on Field/Fieldable for indexing
|
||||
in conjunction with any other ways to specify stored field values,
|
||||
currently binary or string values. (yonik)
|
||||
|
||||
|
||||
Optimizations
|
||||
|
||||
|
|
|
@ -16,7 +16,8 @@ package org.apache.lucene.document;
|
|||
*/
|
||||
|
||||
import org.apache.lucene.search.PhraseQuery; // for javadocs
|
||||
import org.apache.lucene.search.spans.SpanQuery; // for javadocs
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.analysis.TokenStream; // for javadocs
|
||||
|
||||
|
||||
/**
|
||||
|
@ -38,9 +39,11 @@ public abstract class AbstractField implements Fieldable {
|
|||
protected boolean lazy = false;
|
||||
protected boolean omitTermFreqAndPositions = false;
|
||||
protected float boost = 1.0f;
|
||||
// the one and only data object for all different kind of field values
|
||||
// the data object for all different kind of field values
|
||||
protected Object fieldsData = null;
|
||||
//length/offset for all primitive types
|
||||
// pre-analyzed tokenStream for indexed fields
|
||||
protected TokenStream tokenStream;
|
||||
// length/offset for all primitive types
|
||||
protected int binaryLength;
|
||||
protected int binaryOffset;
|
||||
|
||||
|
|
|
@ -94,7 +94,7 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
|
|||
/** Expert: Index the field's value without an Analyzer,
|
||||
* and also disable the storing of norms. Note that you
|
||||
* can also separately enable/disable norms by calling
|
||||
* {@link #setOmitNorms}. No norms means that
|
||||
* {@link Field#setOmitNorms}. No norms means that
|
||||
* index-time field and document boosting and field
|
||||
* length normalization are disabled. The benefit is
|
||||
* less memory usage as norms take up one byte of RAM
|
||||
|
@ -159,19 +159,19 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
|
|||
}
|
||||
|
||||
|
||||
/** The value of the field as a String, or null. If null, the Reader value,
|
||||
* binary value, or TokenStream value is used. Exactly one of stringValue(),
|
||||
* readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
|
||||
/** The value of the field as a String, or null. If null, the Reader value or
|
||||
* binary value is used. Exactly one of stringValue(),
|
||||
* readerValue(), and getBinaryValue() must be set. */
|
||||
public String stringValue() { return fieldsData instanceof String ? (String)fieldsData : null; }
|
||||
|
||||
/** The value of the field as a Reader, or null. If null, the String value,
|
||||
* binary value, or TokenStream value is used. Exactly one of stringValue(),
|
||||
* readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
|
||||
/** The value of the field as a Reader, or null. If null, the String value or
|
||||
* binary value is used. Exactly one of stringValue(),
|
||||
* readerValue(), and getBinaryValue() must be set. */
|
||||
public Reader readerValue() { return fieldsData instanceof Reader ? (Reader)fieldsData : null; }
|
||||
|
||||
/** The value of the field in Binary, or null. If null, the Reader value,
|
||||
* String value, or TokenStream value is used. Exactly one of stringValue(),
|
||||
* readerValue(), getBinaryValue(), and tokenStreamValue() must be set.
|
||||
* or String value is used. Exactly one of stringValue(),
|
||||
* readerValue(), and getBinaryValue() must be set.
|
||||
* @deprecated This method must allocate a new byte[] if
|
||||
* the {@link AbstractField#getBinaryOffset()} is non-zero
|
||||
* or {@link AbstractField#getBinaryLength()} is not the
|
||||
|
@ -191,10 +191,9 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
|
|||
return ret;
|
||||
}
|
||||
|
||||
/** The value of the field as a TokesStream, or null. If null, the Reader value,
|
||||
* String value, or binary value is used. Exactly one of stringValue(),
|
||||
* readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
|
||||
public TokenStream tokenStreamValue() { return fieldsData instanceof TokenStream ? (TokenStream)fieldsData : null; }
|
||||
/** The TokesStream for this field to be used when indexing, or null. If null, the Reader value
|
||||
* or String value is analyzed to produce the indexed tokens. */
|
||||
public TokenStream tokenStreamValue() { return tokenStream; }
|
||||
|
||||
|
||||
/** <p>Expert: change the value of this field. This can
|
||||
|
@ -204,10 +203,7 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
|
|||
* a single {@link Document} instance is re-used as
|
||||
* well. This helps most on small documents.</p>
|
||||
*
|
||||
* <p>Note that you should only use this method after the
|
||||
* Field has been consumed (ie, the {@link Document}
|
||||
* containing this Field has been added to the index).
|
||||
* Also, each Field instance should only be used once
|
||||
* <p>Each Field instance should only be used once
|
||||
* within a single {@link Document} instance. See <a
|
||||
* href="http://wiki.apache.org/lucene-java/ImproveIndexingSpeed">ImproveIndexingSpeed</a>
|
||||
* for details.</p> */
|
||||
|
@ -250,7 +246,8 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
|
|||
}
|
||||
|
||||
|
||||
/** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
|
||||
/** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>.
|
||||
* @deprecated use {@link #setTokenStream} */
|
||||
public void setValue(TokenStream value) {
|
||||
if (isBinary) {
|
||||
throw new IllegalArgumentException("cannot set a TokenStream value on a binary field");
|
||||
|
@ -258,7 +255,16 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
|
|||
if (isStored) {
|
||||
throw new IllegalArgumentException("cannot set a TokenStream value on a stored field");
|
||||
}
|
||||
fieldsData = value;
|
||||
fieldsData = null;
|
||||
tokenStream = value;
|
||||
}
|
||||
|
||||
/** Expert: sets the token stream to be used for indexing and causes isIndexed() and isTokenized() to return true.
|
||||
* May be combined with stored values from stringValue() or binaryValue() */
|
||||
public void setTokenStream(TokenStream tokenStream) {
|
||||
this.isIndexed = true;
|
||||
this.isTokenized = true;
|
||||
this.tokenStream = tokenStream;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -459,8 +465,9 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
|
|||
throw new NullPointerException("tokenStream cannot be null");
|
||||
|
||||
this.name = name.intern(); // field names are interned
|
||||
this.fieldsData = tokenStream;
|
||||
|
||||
this.fieldsData = null;
|
||||
this.tokenStream = tokenStream;
|
||||
|
||||
this.isStored = false;
|
||||
this.isCompressed = false;
|
||||
|
||||
|
|
|
@ -74,36 +74,41 @@ public interface Fieldable extends Serializable {
|
|||
*/
|
||||
String name();
|
||||
|
||||
/** The value of the field as a String, or null. If null, the Reader value,
|
||||
* binary value, or TokenStream value is used. Exactly one of stringValue(),
|
||||
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
|
||||
/** The value of the field as a String, or null.
|
||||
* <p>
|
||||
* For indexing, if isStored()==true, the stringValue() will be used as the stored field value
|
||||
* unless isBinary()==true, in which case binaryValue() will be used.
|
||||
*
|
||||
* If isIndexed()==true and isTokenized()==false, this String value will be indexed as a single token.
|
||||
* If isIndexed()==true and isTokenized()==true, then tokenStreamValue() will be used to generate indexed tokens if not null,
|
||||
* else readerValue() will be used to generate indexed tokens if not null, else stringValue() will be used to generate tokens.
|
||||
*/
|
||||
public String stringValue();
|
||||
|
||||
/** The value of the field as a Reader, or null. If null, the String value,
|
||||
* binary value, or TokenStream value is used. Exactly one of stringValue(),
|
||||
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
|
||||
/** The value of the field as a Reader, which can be used at index time to generate indexed tokens.
|
||||
* @see #stringValue()
|
||||
*/
|
||||
public Reader readerValue();
|
||||
|
||||
/** The value of the field in Binary, or null. If null, the Reader value,
|
||||
* String value, or TokenStream value is used. Exactly one of stringValue(),
|
||||
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
|
||||
/** The value of the field in Binary, or null.
|
||||
* @see #stringValue()
|
||||
*/
|
||||
public byte[] binaryValue();
|
||||
|
||||
/** The value of the field as a TokenStream, or null. If null, the Reader value,
|
||||
* String value, or binary value is used. Exactly one of stringValue(),
|
||||
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
|
||||
/** The TokenStream for this field to be used when indexing, or null.
|
||||
* @see #stringValue()
|
||||
*/
|
||||
public TokenStream tokenStreamValue();
|
||||
|
||||
/** True iff the value of the field is to be stored in the index for return
|
||||
with search hits. It is an error for this to be true if a field is
|
||||
Reader-valued. */
|
||||
/** True if the value of the field is to be stored in the index for return
|
||||
with search hits. */
|
||||
boolean isStored();
|
||||
|
||||
/** True iff the value of the field is to be indexed, so that it may be
|
||||
/** True if the value of the field is to be indexed, so that it may be
|
||||
searched on. */
|
||||
boolean isIndexed();
|
||||
|
||||
/** True iff the value of the field should be tokenized as text prior to
|
||||
/** True if the value of the field should be tokenized as text prior to
|
||||
indexing. Un-tokenized fields are indexed as a single word and may not be
|
||||
Reader-valued. */
|
||||
boolean isTokenized();
|
||||
|
@ -111,7 +116,7 @@ public interface Fieldable extends Serializable {
|
|||
/** True if the value of the field is stored and compressed within the index */
|
||||
boolean isCompressed();
|
||||
|
||||
/** True iff the term or terms used to index this field are stored as a term
|
||||
/** True if the term or terms used to index this field are stored as a term
|
||||
* vector, available from {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}.
|
||||
* These methods do not provide access to the original content of the field,
|
||||
* only to terms used to index it. If the original content must be
|
||||
|
@ -122,17 +127,17 @@ public interface Fieldable extends Serializable {
|
|||
boolean isTermVectorStored();
|
||||
|
||||
/**
|
||||
* True iff terms are stored as term vector together with their offsets
|
||||
* True if terms are stored as term vector together with their offsets
|
||||
* (start and end positon in source text).
|
||||
*/
|
||||
boolean isStoreOffsetWithTermVector();
|
||||
|
||||
/**
|
||||
* True iff terms are stored as term vector together with their token positions.
|
||||
* True if terms are stored as term vector together with their token positions.
|
||||
*/
|
||||
boolean isStorePositionWithTermVector();
|
||||
|
||||
/** True iff the value of the filed is stored as binary */
|
||||
/** True if the value of the field is stored as binary */
|
||||
boolean isBinary();
|
||||
|
||||
/** True if norms are omitted for this indexed field */
|
||||
|
|
|
@ -17,11 +17,7 @@ package org.apache.lucene.index;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.io.Reader;
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
@ -4350,4 +4346,66 @@ public class TestIndexWriter extends LuceneTestCase
|
|||
t.join();
|
||||
assertFalse(t.failed);
|
||||
}
|
||||
|
||||
|
||||
public void testIndexStoreCombos() throws Exception {
|
||||
MockRAMDirectory dir = new MockRAMDirectory();
|
||||
IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
|
||||
byte[] b = new byte[50];
|
||||
for(int i=0;i<50;i++)
|
||||
b[i] = (byte) (i+77);
|
||||
|
||||
Document doc = new Document();
|
||||
Field f = new Field("binary", b, 10, 17, Field.Store.YES);
|
||||
f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc1field1")));
|
||||
Field f2 = new Field("string", "value", Field.Store.YES,Field.Index.ANALYZED);
|
||||
f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc1field2")));
|
||||
doc.add(f);
|
||||
doc.add(f2);
|
||||
w.addDocument(doc);
|
||||
|
||||
// add 2 docs to test in-memory merging
|
||||
f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc2field1")));
|
||||
f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc2field2")));
|
||||
w.addDocument(doc);
|
||||
|
||||
// force segment flush so we can force a segment merge with doc3 later.
|
||||
w.commit();
|
||||
|
||||
f.setTokenStream(new WhitespaceTokenizer(new StringReader("doc3field1")));
|
||||
f2.setTokenStream(new WhitespaceTokenizer(new StringReader("doc3field2")));
|
||||
|
||||
w.addDocument(doc);
|
||||
w.commit();
|
||||
w.optimize(); // force segment merge.
|
||||
|
||||
IndexReader ir = IndexReader.open(dir);
|
||||
doc = ir.document(0);
|
||||
f = doc.getField("binary");
|
||||
b = f.getBinaryValue();
|
||||
assertTrue(b != null);
|
||||
assertEquals(17, b.length, 17);
|
||||
assertEquals(87, b[0]);
|
||||
|
||||
assertTrue(ir.document(0).getFieldable("binary").isBinary());
|
||||
assertTrue(ir.document(1).getFieldable("binary").isBinary());
|
||||
assertTrue(ir.document(2).getFieldable("binary").isBinary());
|
||||
|
||||
assertEquals("value", ir.document(0).get("string"));
|
||||
assertEquals("value", ir.document(1).get("string"));
|
||||
assertEquals("value", ir.document(2).get("string"));
|
||||
|
||||
|
||||
// test that the terms were indexed.
|
||||
assertTrue(ir.termDocs(new Term("binary","doc1field1")).next());
|
||||
assertTrue(ir.termDocs(new Term("binary","doc2field1")).next());
|
||||
assertTrue(ir.termDocs(new Term("binary","doc3field1")).next());
|
||||
assertTrue(ir.termDocs(new Term("string","doc1field2")).next());
|
||||
assertTrue(ir.termDocs(new Term("string","doc2field2")).next());
|
||||
assertTrue(ir.termDocs(new Term("string","doc3field2")).next());
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue