mirror of https://github.com/apache/lucene.git
LUCENE-2309: Moved to Field.tokenStream(Analyzer)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1174506 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent c8b7bb7aac
commit 5d4502ad0a
@@ -172,7 +172,7 @@ Changes in backwards compatibility policy
   (Nikola Tankovic, Mike McCandless, Chris Male)

 * LUCENE-3396: ReusableAnalyzerBase.TokenStreamComponents.reset(Reader) now returns void instead
-  of boolean. If a Component cannot be reset, it should throw an Exception.
+  of boolean. If a Component cannot be reset, it should throw an Exception. (Chris Male)

 Changes in Runtime Behavior
@@ -536,6 +536,9 @@ New features
   ScoreDoc (e.g. last document on the previous page) to support deep paging use cases.
   (Aaron McCurry, Grant Ingersoll, Robert Muir)

+* LUCENE-2309: Added IndexableField.tokenStream(Analyzer) which is now responsible for
+  creating the TokenStreams for Fields when they are to be indexed. (Chris Male)
+
 Optimizations

 * LUCENE-2588: Don't store unnecessary suffixes when writing the terms
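The practical effect of the LUCENE-2309 entry above is that consumers stop branching on a field's value type themselves. A minimal before/after sketch, assuming a Field and an Analyzer in scope (class and method names hypothetical, not tied to an exact trunk revision):

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.document.Field;

    class TokenStreamPlumbing {
      // Before: each consumer re-implemented the value-type branching.
      static TokenStream oldWay(Field field, Analyzer analyzer) throws IOException {
        if (field.tokenStreamValue() != null) {
          return field.tokenStreamValue();
        } else if (field.readerValue() != null) {
          return analyzer.reusableTokenStream(field.name(), field.readerValue());
        } else {
          return analyzer.reusableTokenStream(field.name(), new StringReader(field.stringValue()));
        }
      }

      // After: the field itself is responsible for producing its TokenStream.
      static TokenStream newWay(Field field, Analyzer analyzer) throws IOException {
        return field.tokenStream(analyzer);
      }
    }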
@@ -525,14 +525,7 @@ public class InstantiatedIndexWriter implements Closeable {
       tokensByField.put(field, tokens);

       if (field.fieldType().tokenized()) {
-        final TokenStream tokenStream;
-        // todo readerValue(), binaryValue()
-        if (field.tokenStreamValue() != null) {
-          tokenStream = field.tokenStreamValue();
-        } else {
-          tokenStream = analyzer.reusableTokenStream(field.name(), new StringReader(field.stringValue()));
-        }
-
+        final TokenStream tokenStream = field.tokenStream(analyzer);
         // reset the TokenStream to the first token
         tokenStream.reset();
@@ -19,7 +19,6 @@ package org.apache.lucene.document;
 import java.io.IOException;
 import java.io.Reader;

-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.NumericField.DataType;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldReaderException;
@@ -246,14 +245,6 @@ public class FieldSelectorVisitor extends StoredFieldVisitor {
       return null;
     }

-    /** The value of the field as a TokenStream, or null. If null, the Reader value,
-     * String value, or binary value is used. Exactly one of stringValue(),
-     * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
-    @Override
-    public TokenStream tokenStreamValue() {
-      return null;
-    }
-
     /** The value of the field as a String, or null. If null, the Reader value,
      * binary value, or TokenStream value is used. Exactly one of stringValue(),
      * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
@@ -17,9 +17,14 @@ package org.apache.lucene.document;
  * limitations under the License.
  */

+import java.io.IOException;
 import java.io.Reader;
+import java.io.StringReader;

+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.index.IndexableFieldType;
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.values.PerDocFieldValues;
@@ -62,6 +67,9 @@ public class Field implements IndexableField {
     if (reader == null) {
       throw new NullPointerException("reader cannot be null");
     }
+    if (type.indexed() && !type.tokenized()) {
+      throw new IllegalArgumentException("Non-tokenized fields must use String values");
+    }

     this.name = name;
     this.fieldsData = reader;
@@ -75,6 +83,9 @@ public class Field implements IndexableField {
     if (tokenStream == null) {
       throw new NullPointerException("tokenStream cannot be null");
     }
+    if (type.indexed() && !type.tokenized()) {
+      throw new IllegalArgumentException("Non-tokenized fields must use String values");
+    }

     this.name = name;
     this.fieldsData = null;
@@ -87,12 +98,14 @@ public class Field implements IndexableField {
   }

   public Field(String name, IndexableFieldType type, byte[] value, int offset, int length) {
-    this.fieldsData = new BytesRef(value, offset, length);
-    this.type = type;
-    this.name = name;
+    this(name, type, new BytesRef(value, offset, length));
   }

   public Field(String name, IndexableFieldType type, BytesRef bytes) {
+    if (type.indexed() && !type.tokenized()) {
+      throw new IllegalArgumentException("Non-tokenized fields must use String values");
+    }
+
     this.fieldsData = bytes;
     this.type = type;
     this.name = name;
@@ -297,4 +310,51 @@ public class Field implements IndexableField {
   public IndexableFieldType fieldType() {
     return type;
   }
+
+  /**
+   * {@inheritDoc}
+   */
+  public TokenStream tokenStream(Analyzer analyzer) throws IOException {
+    if (!fieldType().indexed()) {
+      return null;
+    }
+
+    if (!fieldType().tokenized()) {
+      if (stringValue() == null) {
+        throw new IllegalArgumentException("Non-Tokenized Fields must have a String value");
+      }
+
+      return new TokenStream() {
+        CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+        OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+        boolean used;
+
+        @Override
+        public boolean incrementToken() throws IOException {
+          if (used) {
+            return false;
+          }
+          termAttribute.setEmpty().append(stringValue());
+          offsetAttribute.setOffset(0, stringValue().length());
+          used = true;
+          return true;
+        }
+
+        @Override
+        public void reset() throws IOException {
+          used = false;
+        }
+      };
+    }
+
+    if (tokenStream != null) {
+      return tokenStream;
+    } else if (readerValue() != null) {
+      return analyzer.reusableTokenStream(name(), readerValue());
+    } else if (stringValue() != null) {
+      return analyzer.reusableTokenStream(name(), new StringReader(stringValue()));
+    }
+
+    throw new IllegalArgumentException("Field must have either TokenStream, String or Reader value");
+  }
 }
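The anonymous TokenStream in the new Field.tokenStream(Analyzer) above emits the field's entire string value as a single token with offsets [0, length). A hedged sketch of what a consumer would observe, assuming `field` is indexed but not tokenized (the analyzer argument is irrelevant on this path):

    // Sketch only: addAttribute returns the instances the stream already registered.
    TokenStream ts = field.tokenStream(analyzer);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);

    ts.reset();                    // clears the 'used' flag
    while (ts.incrementToken()) {  // returns true exactly once
      System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
    }
    ts.end();
    ts.close();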
@@ -307,13 +307,6 @@ public class IndexDocValuesField extends Field implements PerDocFieldValues {
     return null;
   }

-  /**
-   * Returns always <code>null</code>
-   */
-  public TokenStream tokenStreamValue() {
-    return null;
-  }
-
   @Override
   public ValueType docValuesType() {
     return type;
@@ -19,6 +19,7 @@ package org.apache.lucene.document;

 import java.io.Reader;

+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.NumericTokenStream;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
@@ -237,7 +238,7 @@ public final class NumericField extends Field {
   }

   /** Returns a {@link NumericTokenStream} for indexing the numeric value. */
-  public TokenStream tokenStreamValue() {
+  public TokenStream tokenStream(Analyzer analyzer) {
     if (!type.indexed()) return null;
     if (numericTS == null) {
       // lazy init the TokenStream as it is heavy to instantiate
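Note that NumericField's override ignores the Analyzer argument: numeric values are always indexed through the lazily instantiated NumericTokenStream. A hedged usage sketch, mirroring the pattern the test code later in this commit uses (field name and value are illustrative):

    // The analyzer plays no role for numeric fields; passing null here is a
    // sketch, not a documented contract.
    TokenStream ts = new NumericField("price").setIntValue(42).tokenStream(null);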
@@ -75,121 +75,70 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
     // consumer if it wants to see this particular field
     // tokenized.
     if (field.fieldType().indexed() && doInvert) {

       if (i > 0)
         fieldState.position += docState.analyzer == null ? 0 : docState.analyzer.getPositionIncrementGap(fieldInfo.name);

-      // TODO (LUCENE-2309): this analysis logic should be
-      // outside of indexer -- field should simply give us
-      // a TokenStream, even for multi-valued fields
-
-      if (!field.fieldType().tokenized()) {     // un-tokenized field
-        final String stringValue = field.stringValue();
-        assert stringValue != null;
-        final int valueLength = stringValue.length();
-        parent.singleToken.reinit(stringValue, 0, valueLength);
-        fieldState.attributeSource = parent.singleToken;
-        consumer.start(field);
-
-        boolean success = false;
-        try {
-          consumer.add();
-          success = true;
-        } finally {
-          if (!success) {
-            docState.docWriter.setAborting();
-          }
-        }
-        fieldState.offset += valueLength;
-        fieldState.length++;
-        fieldState.position++;
-      } else {                                  // tokenized field
-        final TokenStream stream;
-        final TokenStream streamValue = field.tokenStreamValue();
-
-        if (streamValue != null) {
-          stream = streamValue;
-        } else {
-          // the field does not have a TokenStream,
-          // so we have to obtain one from the analyzer
-          final Reader reader;                  // find or make Reader
-          final Reader readerValue = field.readerValue();
-
-          if (readerValue != null) {
-            reader = readerValue;
-          } else {
-            String stringValue = field.stringValue();
-            if (stringValue == null) {
-              throw new IllegalArgumentException("field must have either TokenStream, String or Reader value");
-            }
-            parent.stringReader.init(stringValue);
-            reader = parent.stringReader;
-          }
-
-          // Tokenize field and add to postingTable
-          stream = docState.analyzer.reusableTokenStream(fieldInfo.name, reader);
-        }
-
-        // reset the TokenStream to the first token
-        stream.reset();
-
-        try {
-          boolean hasMoreTokens = stream.incrementToken();
-
-          fieldState.attributeSource = stream;
-
-          OffsetAttribute offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
-          PositionIncrementAttribute posIncrAttribute = fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);
-
-          consumer.start(field);
-
-          for (;;) {
-
-            // If we hit an exception in stream.next below
-            // (which is fairly common, eg if analyzer
-            // chokes on a given document), then it's
-            // non-aborting and (above) this one document
-            // will be marked as deleted, but still
-            // consume a docID
-
-            if (!hasMoreTokens) break;
-
-            final int posIncr = posIncrAttribute.getPositionIncrement();
-            fieldState.position += posIncr;
-            if (fieldState.position > 0) {
-              fieldState.position--;
-            }
-
-            if (posIncr == 0)
-              fieldState.numOverlap++;
-
-            boolean success = false;
-            try {
-              // If we hit an exception in here, we abort
-              // all buffered documents since the last
-              // flush, on the likelihood that the
-              // internal state of the consumer is now
-              // corrupt and should not be flushed to a
-              // new segment:
-              consumer.add();
-              success = true;
-            } finally {
-              if (!success) {
-                docState.docWriter.setAborting();
-              }
-            }
-            fieldState.length++;
-            fieldState.position++;
-
-            hasMoreTokens = stream.incrementToken();
-          }
-          // trigger streams to perform end-of-stream operations
-          stream.end();
-
-          fieldState.offset += offsetAttribute.endOffset();
-        } finally {
-          stream.close();
-        }
-      }
+      final TokenStream stream = field.tokenStream(docState.analyzer);
+      // reset the TokenStream to the first token
+      stream.reset();
+
+      try {
+        boolean hasMoreTokens = stream.incrementToken();
+
+        fieldState.attributeSource = stream;
+
+        OffsetAttribute offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
+        PositionIncrementAttribute posIncrAttribute = fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);
+
+        consumer.start(field);
+
+        for(;;) {
+
+          // If we hit an exception in stream.next below
+          // (which is fairly common, eg if analyzer
+          // chokes on a given document), then it's
+          // non-aborting and (above) this one document
+          // will be marked as deleted, but still
+          // consume a docID
+
+          if (!hasMoreTokens) break;
+
+          final int posIncr = posIncrAttribute.getPositionIncrement();
+          fieldState.position += posIncr;
+          if (fieldState.position > 0) {
+            fieldState.position--;
+          }
+
+          if (posIncr == 0)
+            fieldState.numOverlap++;
+
+          boolean success = false;
+          try {
+            // If we hit an exception in here, we abort
+            // all buffered documents since the last
+            // flush, on the likelihood that the
+            // internal state of the consumer is now
+            // corrupt and should not be flushed to a
+            // new segment:
+            consumer.add();
+            success = true;
+          } finally {
+            if (!success) {
+              docState.docWriter.setAborting();
+            }
+          }
+          fieldState.length++;
+          fieldState.position++;
+
+          hasMoreTokens = stream.incrementToken();
+        }
+        // trigger streams to perform end-of-stream operations
+        stream.end();
+
+        fieldState.offset += offsetAttribute.endOffset();
+      } finally {
+        stream.close();
+      }

       fieldState.offset += docState.analyzer == null ? 0 : docState.analyzer.getOffsetGap(field);
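Only the way the indexer obtains the stream changed here; the consumption around it is the standard TokenStream contract, which the rewritten loop above still follows. A stripped-down illustration of that contract (attribute reads elided):

    TokenStream stream = field.tokenStream(docState.analyzer);
    stream.reset();                       // 1. rewind to the first token
    try {
      while (stream.incrementToken()) {   // 2. advance token by token
        // read term/offset/position-increment attributes here
      }
      stream.end();                       // 3. record end-of-stream state (final offset)
    } finally {
      stream.close();                     // 4. always release the stream
    }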
@@ -17,8 +17,10 @@ package org.apache.lucene.index;
  * limitations under the License.
  */

+import java.io.IOException;
 import java.io.Reader;

+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.NumericField;
 import org.apache.lucene.index.values.PerDocFieldValues;
@@ -56,9 +58,6 @@ public interface IndexableField {
   /* Non-null if this field has a Reader value */
   public Reader readerValue();

-  /* Non-null if this field has a pre-tokenized ({@link TokenStream}) value */
-  public TokenStream tokenStreamValue();
-
   // Numeric field:
   /* True if this field is numeric */
   public boolean numeric();
@@ -82,4 +81,15 @@ public interface IndexableField {

   /* DocValues type; only used if docValues is non-null */
   public ValueType docValuesType();
+
+  /**
+   * Creates the TokenStream used for indexing this field. If appropriate,
+   * implementations should use the given Analyzer to create the TokenStreams.
+   *
+   * @param analyzer Analyzer that should be used to create the TokenStreams from
+   * @return TokenStream value for indexing the document. Should always return
+   *         a non-null value if the field is to be indexed
+   * @throws IOException Can be thrown while creating the TokenStream
+   */
+  public TokenStream tokenStream(Analyzer analyzer) throws IOException;
 }
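One consequence of putting tokenStream(Analyzer) on the interface is that pre-analyzed fields can satisfy the contract without a separate tokenStreamValue() accessor. A hedged sketch of such an implementation (the class shape and the `cached` field are hypothetical):

    // Hypothetical pre-analyzed field: it carries its own TokenStream and may
    // ignore the Analyzer, which the new contract permits.
    @Override
    public TokenStream tokenStream(Analyzer analyzer) throws IOException {
      if (!fieldType().indexed()) {
        return null;  // per the javadoc: non-null only when the field is indexed
      }
      return cached;  // assumed to be set when the field was constructed
    }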
@@ -17,10 +17,12 @@ package org.apache.lucene.index;
  * limitations under the License.
  */

 import java.io.IOException;
 import java.io.Reader;
+import java.io.StringReader;
 import java.util.Iterator;

+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.NumericField.DataType;
@@ -132,15 +134,6 @@ public class TestIndexableField extends LuceneTestCase {
       }
     }

-    @Override
-    public TokenStream tokenStreamValue() {
-      if (numeric()) {
-        return new NumericField(name()).setIntValue(counter).tokenStreamValue();
-      } else {
-        return null;
-      }
-    }
-
     // Numeric field:
     @Override
     public boolean numeric() {
@@ -172,6 +165,15 @@ public class TestIndexableField extends LuceneTestCase {
     public ValueType docValuesType() {
       return null;
     }
+
+    @Override
+    public TokenStream tokenStream(Analyzer analyzer) throws IOException {
+      if (numeric()) {
+        return new NumericField(name()).setIntValue(counter).tokenStream(analyzer);
+      }
+      return readerValue() != null ? analyzer.reusableTokenStream(name(), readerValue())
+                                   : analyzer.reusableTokenStream(name(), new StringReader(stringValue()));
+    }
   }

   // Silly test showing how to index documents w/o using Lucene's core
@@ -71,31 +71,7 @@ public class ReadTokensTask extends PerfTask {
     for(final IndexableField field : fields) {
       if (!field.fieldType().tokenized() || field instanceof NumericField) continue;

-      final TokenStream stream;
-      final TokenStream streamValue = field.tokenStreamValue();
-
-      if (streamValue != null)
-        stream = streamValue;
-      else {
-        // the field does not have a TokenStream,
-        // so we have to obtain one from the analyzer
-        final Reader reader;                    // find or make Reader
-        final Reader readerValue = field.readerValue();
-
-        if (readerValue != null)
-          reader = readerValue;
-        else {
-          String stringValue = field.stringValue();
-          if (stringValue == null)
-            throw new IllegalArgumentException("field must have either TokenStream, String or Reader value");
-          stringReader.init(stringValue);
-          reader = stringReader;
-        }
-
-        // Tokenize field
-        stream = analyzer.reusableTokenStream(field.name(), reader);
-      }
-
+      final TokenStream stream = field.tokenStream(analyzer);
       // reset the TokenStream to the first token
       stream.reset();
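With the branching gone, ReadTokensTask's token counting presumably reduces to the generic consumption loop. A hedged sketch of the counting path (the tally variable is illustrative; the task's real bookkeeping may differ):

    final TokenStream stream = field.tokenStream(analyzer);
    stream.reset();
    int tokenCount = 0;
    while (stream.incrementToken()) {
      tokenCount++;   // one increment per token produced by the analyzer
    }
    stream.end();
    stream.close();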
@@ -87,10 +87,9 @@ public class PolyFieldTest extends SolrTestCaseJ4 {
     assertEquals(fields.length, 3);//should be 3, we have a stored field
     //first two fields contain the values, third is just stored and contains the original
     for (int i = 0; i < 3; i++) {
-      boolean hasValue = fields[1].tokenStreamValue() != null
-          || fields[1].binaryValue() != null
-          || fields[1].stringValue() != null;
-      assertTrue("Doesn't have a value: " + fields[1], hasValue);
+      boolean hasValue = fields[i].binaryValue() != null
+          || fields[i].stringValue() != null;
+      assertTrue("Doesn't have a value: " + fields[i], hasValue);
     }
     /*assertTrue("first field " + fields[0].tokenStreamValue() + " is not 35.0", pt.getSubType().toExternal(fields[0]).equals(String.valueOf(xy[0])));
     assertTrue("second field is not -79.34", pt.getSubType().toExternal(fields[1]).equals(String.valueOf(xy[1])));