LUCENE-2309: Moved to Field.tokenStream(Analyzer)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1174506 13f79535-47bb-0310-9956-ffa450edef68
Christopher John Male 2011-09-23 03:16:37 +00:00
parent c8b7bb7aac
commit 5d4502ad0a
11 changed files with 149 additions and 172 deletions

View File

@@ -172,7 +172,7 @@ Changes in backwards compatibility policy
(Nikola Tankovic, Mike McCandless, Chris Male)
* LUCENE-3396: ReusableAnalyzerBase.TokenStreamComponents.reset(Reader) now returns void instead
of boolean. If a Component cannot be reset, it should throw an Exception.
of boolean. If a Component cannot be reset, it should throw an Exception. (Chris Male)
Changes in Runtime Behavior
@@ -536,6 +536,9 @@ New features
ScoreDoc (e.g. last document on the previous page) to support deep paging use cases.
(Aaron McCurry, Grant Ingersoll, Robert Muir)
* LUCENE-2309: Added IndexableField.tokenStream(Analyzer) which is now responsible for
creating the TokenStreams for Fields when they are to be indexed. (Chris Male)
Optimizations
* LUCENE-2588: Don't store unnecessary suffixes when writing the terms
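Illustration of the API described by the LUCENE-2309 entry above: callers now ask the field itself for its TokenStream instead of branching on tokenStreamValue(), readerValue() and stringValue(), which is exactly the simplification the InstantiatedIndexWriter, DocInverterPerField and ReadTokensTask hunks below apply. A minimal sketch (the helper method is hypothetical, not part of this commit):

// Hypothetical helper showing the post-LUCENE-2309 consumption pattern.
// Requires: org.apache.lucene.analysis.Analyzer, org.apache.lucene.analysis.TokenStream,
// org.apache.lucene.analysis.tokenattributes.CharTermAttribute, org.apache.lucene.index.IndexableField.
static void printTokens(IndexableField field, Analyzer analyzer) throws IOException {
  TokenStream stream = field.tokenStream(analyzer);   // the field decides how its stream is built
  if (stream == null) {
    return;                                           // field is not indexed
  }
  CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
  stream.reset();                                     // position the stream at the first token
  try {
    while (stream.incrementToken()) {
      System.out.println(term.toString());
    }
    stream.end();                                     // record end-of-stream state (final offset)
  } finally {
    stream.close();
  }
}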

View File

@@ -525,14 +525,7 @@ public class InstantiatedIndexWriter implements Closeable {
tokensByField.put(field, tokens);
if (field.fieldType().tokenized()) {
final TokenStream tokenStream;
// todo readerValue(), binaryValue()
if (field.tokenStreamValue() != null) {
tokenStream = field.tokenStreamValue();
} else {
tokenStream = analyzer.reusableTokenStream(field.name(), new StringReader(field.stringValue()));
}
final TokenStream tokenStream = field.tokenStream(analyzer);
// reset the TokenStream to the first token
tokenStream.reset();

View File

@@ -19,7 +19,6 @@ package org.apache.lucene.document;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.NumericField.DataType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldReaderException;
@@ -246,14 +245,6 @@ public class FieldSelectorVisitor extends StoredFieldVisitor {
return null;
}
/** The value of the field as a TokenStream, or null. If null, the Reader value,
* String value, or binary value is used. Exactly one of stringValue(),
* readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
@Override
public TokenStream tokenStreamValue() {
return null;
}
/** The value of the field as a String, or null. If null, the Reader value,
* binary value, or TokenStream value is used. Exactly one of stringValue(),
* readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */

View File

@@ -17,9 +17,14 @@ package org.apache.lucene.document;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.values.PerDocFieldValues;
@@ -62,6 +67,9 @@ public class Field implements IndexableField {
if (reader == null) {
throw new NullPointerException("reader cannot be null");
}
if (type.indexed() && !type.tokenized()) {
throw new IllegalArgumentException("Non-tokenized fields must use String values");
}
this.name = name;
this.fieldsData = reader;
@@ -75,6 +83,9 @@
if (tokenStream == null) {
throw new NullPointerException("tokenStream cannot be null");
}
if (type.indexed() && !type.tokenized()) {
throw new IllegalArgumentException("Non-tokenized fields must use String values");
}
this.name = name;
this.fieldsData = null;
@@ -87,12 +98,14 @@
}
public Field(String name, IndexableFieldType type, byte[] value, int offset, int length) {
this.fieldsData = new BytesRef(value, offset, length);
this.type = type;
this.name = name;
this(name, type, new BytesRef(value, offset, length));
}
public Field(String name, IndexableFieldType type, BytesRef bytes) {
if (type.indexed() && !type.tokenized()) {
throw new IllegalArgumentException("Non-tokenized fields must use String values");
}
this.fieldsData = bytes;
this.type = type;
this.name = name;
@@ -297,4 +310,51 @@
public IndexableFieldType fieldType() {
return type;
}
/**
* {@inheritDoc}
*/
public TokenStream tokenStream(Analyzer analyzer) throws IOException {
if (!fieldType().indexed()) {
return null;
}
if (!fieldType().tokenized()) {
if (stringValue() == null) {
throw new IllegalArgumentException("Non-Tokenized Fields must have a String value");
}
return new TokenStream() {
CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
boolean used;
@Override
public boolean incrementToken() throws IOException {
if (used) {
return false;
}
termAttribute.setEmpty().append(stringValue());
offsetAttribute.setOffset(0, stringValue().length());
used = true;
return true;
}
@Override
public void reset() throws IOException {
used = false;
}
};
}
if (tokenStream != null) {
return tokenStream;
} else if (readerValue() != null) {
return analyzer.reusableTokenStream(name(), readerValue());
} else if (stringValue() != null) {
return analyzer.reusableTokenStream(name(), new StringReader(stringValue()));
}
throw new IllegalArgumentException("Field must have either TokenStream, String or Reader value");
}
}
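
The tokenStream(Analyzer) implementation added to Field above resolves the field's value in a fixed order: an indexed but non-tokenized field gets an internal single-token stream over stringValue(); otherwise a pre-supplied TokenStream wins, then readerValue(), then stringValue() via the Analyzer. A hedged check of the non-tokenized case (the field setup, analyzer and JUnit-style asserts are assumed scaffolding, not part of this diff):

// Assumed setup: 'keywordField' is an indexed, non-tokenized Field whose stringValue() is "DOC-42";
// 'analyzer' is any Analyzer; the single-token path above never consults it.
TokenStream ts = keywordField.tokenStream(analyzer);
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
ts.reset();
assertTrue(ts.incrementToken());                     // exactly one token is produced
assertEquals("DOC-42", term.toString());             // carrying the whole string value
assertEquals(0, offset.startOffset());
assertEquals("DOC-42".length(), offset.endOffset()); // offsets span the full value
assertFalse(ts.incrementToken());
ts.close();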

View File

@@ -307,13 +307,6 @@ public class IndexDocValuesField extends Field implements PerDocFieldValues {
return null;
}
/**
* Returns always <code>null</code>
*/
public TokenStream tokenStreamValue() {
return null;
}
@Override
public ValueType docValuesType() {
return type;

View File

@@ -19,6 +19,7 @@ package org.apache.lucene.document;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.NumericTokenStream;
import org.apache.lucene.index.FieldInfo.IndexOptions;
@@ -237,7 +238,7 @@ public final class NumericField extends Field {
}
/** Returns a {@link NumericTokenStream} for indexing the numeric value. */
public TokenStream tokenStreamValue() {
public TokenStream tokenStream(Analyzer analyzer) {
if (!type.indexed()) return null;
if (numericTS == null) {
// lazy init the TokenStream as it is heavy to instantiate

View File

@@ -75,121 +75,70 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
// consumer if it wants to see this particular field
// tokenized.
if (field.fieldType().indexed() && doInvert) {
if (i > 0)
fieldState.position += docState.analyzer == null ? 0 : docState.analyzer.getPositionIncrementGap(fieldInfo.name);
// TODO (LUCENE-2309): this analysis logic should be
// outside of indexer -- field should simply give us
// a TokenStream, even for multi-valued fields
final TokenStream stream = field.tokenStream(docState.analyzer);
// reset the TokenStream to the first token
stream.reset();
try {
boolean hasMoreTokens = stream.incrementToken();
fieldState.attributeSource = stream;
OffsetAttribute offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
PositionIncrementAttribute posIncrAttribute = fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);
if (!field.fieldType().tokenized()) { // un-tokenized field
final String stringValue = field.stringValue();
assert stringValue != null;
final int valueLength = stringValue.length();
parent.singleToken.reinit(stringValue, 0, valueLength);
fieldState.attributeSource = parent.singleToken;
consumer.start(field);
boolean success = false;
try {
consumer.add();
success = true;
} finally {
if (!success) {
docState.docWriter.setAborting();
for (;;) {
// If we hit an exception in stream.next below
// (which is fairly common, eg if analyzer
// chokes on a given document), then it's
// non-aborting and (above) this one document
// will be marked as deleted, but still
// consume a docID
if (!hasMoreTokens) break;
final int posIncr = posIncrAttribute.getPositionIncrement();
fieldState.position += posIncr;
if (fieldState.position > 0) {
fieldState.position--;
}
}
fieldState.offset += valueLength;
fieldState.length++;
fieldState.position++;
} else { // tokenized field
final TokenStream stream;
final TokenStream streamValue = field.tokenStreamValue();
if (streamValue != null) {
stream = streamValue;
} else {
// the field does not have a TokenStream,
// so we have to obtain one from the analyzer
final Reader reader; // find or make Reader
final Reader readerValue = field.readerValue();
if (posIncr == 0)
fieldState.numOverlap++;
if (readerValue != null) {
reader = readerValue;
} else {
String stringValue = field.stringValue();
if (stringValue == null) {
throw new IllegalArgumentException("field must have either TokenStream, String or Reader value");
boolean success = false;
try {
// If we hit an exception in here, we abort
// all buffered documents since the last
// flush, on the likelihood that the
// internal state of the consumer is now
// corrupt and should not be flushed to a
// new segment:
consumer.add();
success = true;
} finally {
if (!success) {
docState.docWriter.setAborting();
}
parent.stringReader.init(stringValue);
reader = parent.stringReader;
}
// Tokenize field and add to postingTable
stream = docState.analyzer.reusableTokenStream(fieldInfo.name, reader);
fieldState.length++;
fieldState.position++;
hasMoreTokens = stream.incrementToken();
}
// trigger streams to perform end-of-stream operations
stream.end();
// reset the TokenStream to the first token
stream.reset();
try {
boolean hasMoreTokens = stream.incrementToken();
fieldState.attributeSource = stream;
OffsetAttribute offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
PositionIncrementAttribute posIncrAttribute = fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);
consumer.start(field);
for(;;) {
// If we hit an exception in stream.next below
// (which is fairly common, eg if analyzer
// chokes on a given document), then it's
// non-aborting and (above) this one document
// will be marked as deleted, but still
// consume a docID
if (!hasMoreTokens) break;
final int posIncr = posIncrAttribute.getPositionIncrement();
fieldState.position += posIncr;
if (fieldState.position > 0) {
fieldState.position--;
}
if (posIncr == 0)
fieldState.numOverlap++;
boolean success = false;
try {
// If we hit an exception in here, we abort
// all buffered documents since the last
// flush, on the likelihood that the
// internal state of the consumer is now
// corrupt and should not be flushed to a
// new segment:
consumer.add();
success = true;
} finally {
if (!success) {
docState.docWriter.setAborting();
}
}
fieldState.length++;
fieldState.position++;
hasMoreTokens = stream.incrementToken();
}
// trigger streams to perform end-of-stream operations
stream.end();
fieldState.offset += offsetAttribute.endOffset();
} finally {
stream.close();
}
fieldState.offset += offsetAttribute.endOffset();
} finally {
stream.close();
}
fieldState.offset += docState.analyzer == null ? 0 : docState.analyzer.getOffsetGap(field);

View File

@@ -17,8 +17,10 @@ package org.apache.lucene.index;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.values.PerDocFieldValues;
@@ -56,9 +58,6 @@ public interface IndexableField {
/* Non-null if this field has a Reader value */
public Reader readerValue();
/* Non-null if this field has a pre-tokenized ({@link TokenStream}) value */
public TokenStream tokenStreamValue();
// Numeric field:
/* True if this field is numeric */
public boolean numeric();
@@ -82,4 +81,15 @@
/* DocValues type; only used if docValues is non-null */
public ValueType docValuesType();
/**
* Creates the TokenStream used for indexing this field. If appropriate,
* implementations should use the given Analyzer to create the TokenStreams.
*
* @param analyzer Analyzer that should be used to create the TokenStreams from
* @return TokenStream value for indexing the document. Should always return
* a non-null value if the field is to be indexed
* @throws IOException Can be thrown while creating the TokenStream
*/
public TokenStream tokenStream(Analyzer analyzer) throws IOException;
}
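
The interface method and javadoc added above define the contract that custom IndexableField implementations must now meet; the TestIndexableField hunk below is one example. As a hedged illustration, a field that already carries a pre-built TokenStream can satisfy the contract without consulting the Analyzer ('preBuiltStream' is an assumed instance field, not something in this commit):

// Hypothetical implementation in a pre-analyzed field class.
@Override
public TokenStream tokenStream(Analyzer analyzer) throws IOException {
  if (!fieldType().indexed()) {
    return null;               // not indexed: no stream is required
  }
  return preBuiltStream;       // content is already tokenized, so the Analyzer is ignored
}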

View File

@@ -17,10 +17,12 @@ package org.apache.lucene.index;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Iterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumericField.DataType;
@@ -132,15 +134,6 @@ public class TestIndexableField extends LuceneTestCase {
}
}
@Override
public TokenStream tokenStreamValue() {
if (numeric()) {
return new NumericField(name()).setIntValue(counter).tokenStreamValue();
} else {
return null;
}
}
// Numeric field:
@Override
public boolean numeric() {
@@ -172,6 +165,15 @@
public ValueType docValuesType() {
return null;
}
@Override
public TokenStream tokenStream(Analyzer analyzer) throws IOException {
if (numeric()) {
return new NumericField(name()).setIntValue(counter).tokenStream(analyzer);
}
return readerValue() != null ? analyzer.reusableTokenStream(name(), readerValue()) :
analyzer.reusableTokenStream(name(), new StringReader(stringValue()));
}
}
// Silly test showing how to index documents w/o using Lucene's core

View File

@@ -71,31 +71,7 @@ public class ReadTokensTask extends PerfTask {
for(final IndexableField field : fields) {
if (!field.fieldType().tokenized() || field instanceof NumericField) continue;
final TokenStream stream;
final TokenStream streamValue = field.tokenStreamValue();
if (streamValue != null)
stream = streamValue;
else {
// the field does not have a TokenStream,
// so we have to obtain one from the analyzer
final Reader reader; // find or make Reader
final Reader readerValue = field.readerValue();
if (readerValue != null)
reader = readerValue;
else {
String stringValue = field.stringValue();
if (stringValue == null)
throw new IllegalArgumentException("field must have either TokenStream, String or Reader value");
stringReader.init(stringValue);
reader = stringReader;
}
// Tokenize field
stream = analyzer.reusableTokenStream(field.name(), reader);
}
final TokenStream stream = field.tokenStream(analyzer);
// reset the TokenStream to the first token
stream.reset();

View File

@@ -87,10 +87,9 @@ public class PolyFieldTest extends SolrTestCaseJ4 {
assertEquals(fields.length, 3);//should be 3, we have a stored field
//first two fields contain the values, third is just stored and contains the original
for (int i = 0; i < 3; i++) {
boolean hasValue = fields[1].tokenStreamValue() != null
|| fields[1].binaryValue() != null
|| fields[1].stringValue() != null;
assertTrue("Doesn't have a value: " + fields[1], hasValue);
boolean hasValue = fields[i].binaryValue() != null
|| fields[i].stringValue() != null;
assertTrue("Doesn't have a value: " + fields[i], hasValue);
}
/*assertTrue("first field " + fields[0].tokenStreamValue() + " is not 35.0", pt.getSubType().toExternal(fields[0]).equals(String.valueOf(xy[0])));
assertTrue("second field is not -79.34", pt.getSubType().toExternal(fields[1]).equals(String.valueOf(xy[1])));