From 5d4502ad0a3249fec5fcc1e28ce7074f67e8a027 Mon Sep 17 00:00:00 2001
From: Christopher John Male
Date: Fri, 23 Sep 2011 03:16:37 +0000
Subject: [PATCH] LUCENE-2309: Moved to Field.tokenStream(Analyzer)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1174506 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                                 |   5 +-
 .../instantiated/InstantiatedIndexWriter.java      |   9 +-
 .../lucene/document/FieldSelectorVisitor.java      |   9 --
 .../org/apache/lucene/document/Field.java          |  66 +++++++-
 .../lucene/document/IndexDocValuesField.java       |   7 -
 .../apache/lucene/document/NumericField.java       |   3 +-
 .../lucene/index/DocInverterPerField.java          | 153 ++++++------------
 .../apache/lucene/index/IndexableField.java        |  16 +-
 .../lucene/index/TestIndexableField.java           |  20 +--
 .../byTask/tasks/ReadTokensTask.java               |  26 +--
 .../org/apache/solr/schema/PolyFieldTest.java      |   7 +-
 11 files changed, 149 insertions(+), 172 deletions(-)
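The net effect of this patch is that every call site which used to probe
tokenStreamValue()/readerValue()/stringValue() itself now collapses to a single
call, field.tokenStream(analyzer). A minimal consumer-side sketch of that
pattern follows (the class and method names here are illustrative only, not
part of this patch); the same loop appears in InstantiatedIndexWriter,
DocInverterPerField and ReadTokensTask below.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.IndexableField;

class TokenStreamConsumerSketch {

  // The field decides how to produce its tokens: a pre-built TokenStream
  // is returned as-is, Reader/String values are run through the analyzer,
  // and indexed but non-tokenized fields yield a single-token stream
  // (see Field.tokenStream in this patch).
  static void consume(IndexableField field, Analyzer analyzer) throws IOException {
    final TokenStream stream = field.tokenStream(analyzer);
    if (stream == null) {
      return;                                  // field is not indexed
    }
    final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();                            // position at the first token
    try {
      while (stream.incrementToken()) {
        System.out.println(term);              // consume each term in order
      }
      stream.end();                            // end-of-stream operations (final offset)
    } finally {
      stream.close();
    }
  }
}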
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 59e54a5cdad..bfc37f7bf41 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -172,7 +172,7 @@ Changes in backwards compatibility policy
   (Nikola Tankovic, Mike McCandless, Chris Male)
 
 * LUCENE-3396: ReusableAnalyzerBase.TokenStreamComponents.reset(Reader) now returns void instead
-  of boolean. If a Component cannot be reset, it should throw an Exception.
+  of boolean. If a Component cannot be reset, it should throw an Exception. (Chris Male)
 
 Changes in Runtime Behavior
 
@@ -536,6 +536,9 @@ New features
   ScoreDoc (e.g. last document on the previous page) to support deep paging
   use cases. (Aaron McCurry, Grant Ingersoll, Robert Muir)
 
+* LUCENE-2309: Added IndexableField.tokenStream(Analyzer) which is now responsible for
+  creating the TokenStreams for Fields when they are to be indexed. (Chris Male)
+
 Optimizations
 
 * LUCENE-2588: Don't store unnecessary suffixes when writing the terms
diff --git a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java
index d989eaf264f..f49d8d75f73 100644
--- a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java
+++ b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java
@@ -525,14 +525,7 @@ public class InstantiatedIndexWriter implements Closeable {
         tokensByField.put(field, tokens);
 
         if (field.fieldType().tokenized()) {
-          final TokenStream tokenStream;
-          // todo readerValue(), binaryValue()
-          if (field.tokenStreamValue() != null) {
-            tokenStream = field.tokenStreamValue();
-          } else {
-            tokenStream = analyzer.reusableTokenStream(field.name(), new StringReader(field.stringValue()));
-          }
-
+          final TokenStream tokenStream = field.tokenStream(analyzer);
           // reset the TokenStream to the first token
           tokenStream.reset();
 
diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/document/FieldSelectorVisitor.java b/lucene/contrib/misc/src/java/org/apache/lucene/document/FieldSelectorVisitor.java
index ee53728d956..97982f9ec8d 100644
--- a/lucene/contrib/misc/src/java/org/apache/lucene/document/FieldSelectorVisitor.java
+++ b/lucene/contrib/misc/src/java/org/apache/lucene/document/FieldSelectorVisitor.java
@@ -19,7 +19,6 @@ package org.apache.lucene.document;
 import java.io.IOException;
 import java.io.Reader;
 
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.NumericField.DataType;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldReaderException;
@@ -246,14 +245,6 @@ public class FieldSelectorVisitor extends StoredFieldVisitor {
       return null;
     }
 
-    /** The value of the field as a TokenStream, or null. If null, the Reader value,
-     * String value, or binary value is used. Exactly one of stringValue(),
-     * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
-    @Override
-    public TokenStream tokenStreamValue() {
-      return null;
-    }
-
     /** The value of the field as a String, or null. If null, the Reader value,
      * binary value, or TokenStream value is used. Exactly one of stringValue(),
      * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
diff --git a/lucene/src/java/org/apache/lucene/document/Field.java b/lucene/src/java/org/apache/lucene/document/Field.java
index c24cba94c67..7b2f14bfd34 100644
--- a/lucene/src/java/org/apache/lucene/document/Field.java
+++ b/lucene/src/java/org/apache/lucene/document/Field.java
@@ -17,9 +17,14 @@ package org.apache.lucene.document;
  * limitations under the License.
  */
 
+import java.io.IOException;
 import java.io.Reader;
+import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.index.IndexableFieldType;
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.values.PerDocFieldValues;
@@ -62,6 +67,9 @@ public class Field implements IndexableField {
     if (reader == null) {
       throw new NullPointerException("reader cannot be null");
     }
+    if (type.indexed() && !type.tokenized()) {
+      throw new IllegalArgumentException("Non-tokenized fields must use String values");
+    }
 
     this.name = name;
     this.fieldsData = reader;
@@ -75,6 +83,9 @@ public class Field implements IndexableField {
     if (tokenStream == null) {
       throw new NullPointerException("tokenStream cannot be null");
     }
+    if (type.indexed() && !type.tokenized()) {
+      throw new IllegalArgumentException("Non-tokenized fields must use String values");
+    }
 
     this.name = name;
     this.fieldsData = null;
@@ -87,12 +98,14 @@ public class Field implements IndexableField {
   }
 
   public Field(String name, IndexableFieldType type, byte[] value, int offset, int length) {
-    this.fieldsData = new BytesRef(value, offset, length);
-    this.type = type;
-    this.name = name;
+    this(name, type, new BytesRef(value, offset, length));
   }
 
   public Field(String name, IndexableFieldType type, BytesRef bytes) {
+    if (type.indexed() && !type.tokenized()) {
+      throw new IllegalArgumentException("Non-tokenized fields must use String values");
+    }
+
     this.fieldsData = bytes;
     this.type = type;
     this.name = name;
@@ -297,4 +310,51 @@ public class Field implements IndexableField {
   public IndexableFieldType fieldType() {
     return type;
   }
+
+  /**
+   * {@inheritDoc}
+   */
+  public TokenStream tokenStream(Analyzer analyzer) throws IOException {
+    if (!fieldType().indexed()) {
+      return null;
+    }
+
+    if (!fieldType().tokenized()) {
+      if (stringValue() == null) {
+        throw new IllegalArgumentException("Non-Tokenized Fields must have a String value");
+      }
+
+      return new TokenStream() {
+        CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+        OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+        boolean used;
+
+        @Override
+        public boolean incrementToken() throws IOException {
+          if (used) {
+            return false;
+          }
+          termAttribute.setEmpty().append(stringValue());
+          offsetAttribute.setOffset(0, stringValue().length());
+          used = true;
+          return true;
+        }
+
+        @Override
+        public void reset() throws IOException {
+          used = false;
+        }
+      };
+    }
+
+    if (tokenStream != null) {
+      return tokenStream;
+    } else if (readerValue() != null) {
+      return analyzer.reusableTokenStream(name(), readerValue());
+    } else if (stringValue() != null) {
+      return analyzer.reusableTokenStream(name(), new StringReader(stringValue()));
+    }
+
+    throw new IllegalArgumentException("Field must have either TokenStream, String or Reader value");
+  }
 }
diff --git a/lucene/src/java/org/apache/lucene/document/IndexDocValuesField.java b/lucene/src/java/org/apache/lucene/document/IndexDocValuesField.java
index bb61a6207f0..a696505788a 100644
--- a/lucene/src/java/org/apache/lucene/document/IndexDocValuesField.java
+++ b/lucene/src/java/org/apache/lucene/document/IndexDocValuesField.java
@@ -307,13 +307,6 @@ public class IndexDocValuesField extends Field implements PerDocFieldValues {
     return null;
   }
 
-  /**
-   * Returns always null
-   */
-  public TokenStream tokenStreamValue() {
-    return null;
-  }
-
   @Override
   public ValueType docValuesType() {
     return type;
diff --git a/lucene/src/java/org/apache/lucene/document/NumericField.java b/lucene/src/java/org/apache/lucene/document/NumericField.java
index 79c844df9ed..e4cf9f5c241 100644
--- a/lucene/src/java/org/apache/lucene/document/NumericField.java
+++ b/lucene/src/java/org/apache/lucene/document/NumericField.java
@@ -19,6 +19,7 @@ package org.apache.lucene.document;
 
 import java.io.Reader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.NumericTokenStream;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
@@ -237,7 +238,7 @@ public final class NumericField extends Field {
   }
 
   /** Returns a {@link NumericTokenStream} for indexing the numeric value. */
-  public TokenStream tokenStreamValue() {
+  public TokenStream tokenStream(Analyzer analyzer) {
     if (!type.indexed()) return null;
     if (numericTS == null) {
       // lazy init the TokenStream as it is heavy to instantiate
diff --git a/lucene/src/java/org/apache/lucene/index/DocInverterPerField.java b/lucene/src/java/org/apache/lucene/index/DocInverterPerField.java
index 7ebfff750bd..fa3087cbb17 100644
--- a/lucene/src/java/org/apache/lucene/index/DocInverterPerField.java
+++ b/lucene/src/java/org/apache/lucene/index/DocInverterPerField.java
@@ -75,121 +75,70 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
       // consumer if it wants to see this particular field
       // tokenized.
       if (field.fieldType().indexed() && doInvert) {
-        
+
         if (i > 0)
           fieldState.position += docState.analyzer == null ? 0 : docState.analyzer.getPositionIncrementGap(fieldInfo.name);
 
-        // TODO (LUCENE-2309): this analysis logic should be
-        // outside of indexer -- field should simply give us
-        // a TokenStream, even for multi-valued fields
+        final TokenStream stream = field.tokenStream(docState.analyzer);
+        // reset the TokenStream to the first token
+        stream.reset();
+
+        try {
+          boolean hasMoreTokens = stream.incrementToken();
+
+          fieldState.attributeSource = stream;
+
+          OffsetAttribute offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
+          PositionIncrementAttribute posIncrAttribute = fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);
 
-        if (!field.fieldType().tokenized()) {     // un-tokenized field
-          final String stringValue = field.stringValue();
-          assert stringValue != null;
-          final int valueLength = stringValue.length();
-          parent.singleToken.reinit(stringValue, 0, valueLength);
-          fieldState.attributeSource = parent.singleToken;
           consumer.start(field);
 
-          boolean success = false;
-          try {
-            consumer.add();
-            success = true;
-          } finally {
-            if (!success) {
-              docState.docWriter.setAborting();
+          for (;;) {
+
+            // If we hit an exception in stream.next below
+            // (which is fairly common, eg if analyzer
+            // chokes on a given document), then it's
+            // non-aborting and (above) this one document
+            // will be marked as deleted, but still
+            // consume a docID
+
+            if (!hasMoreTokens) break;
+
+            final int posIncr = posIncrAttribute.getPositionIncrement();
+            fieldState.position += posIncr;
+            if (fieldState.position > 0) {
+              fieldState.position--;
             }
-          }
-          fieldState.offset += valueLength;
-          fieldState.length++;
-          fieldState.position++;
-        } else {                                  // tokenized field
-          final TokenStream stream;
-          final TokenStream streamValue = field.tokenStreamValue();
-
-          if (streamValue != null) {
-            stream = streamValue;
-          } else {
-            // the field does not have a TokenStream,
-            // so we have to obtain one from the analyzer
-            final Reader reader;                  // find or make Reader
-            final Reader readerValue = field.readerValue();
+            if (posIncr == 0)
+              fieldState.numOverlap++;
 
-            if (readerValue != null) {
-              reader = readerValue;
-            } else {
-              String stringValue = field.stringValue();
-              if (stringValue == null) {
-                throw new IllegalArgumentException("field must have either TokenStream, String or Reader value");
+            boolean success = false;
+            try {
+              // If we hit an exception in here, we abort
+              // all buffered documents since the last
+              // flush, on the likelihood that the
+              // internal state of the consumer is now
+              // corrupt and should not be flushed to a
+              // new segment:
+              consumer.add();
+              success = true;
+            } finally {
+              if (!success) {
+                docState.docWriter.setAborting();
              }
-              parent.stringReader.init(stringValue);
-              reader = parent.stringReader;
            }
-
-            // Tokenize field and add to postingTable
-            stream = docState.analyzer.reusableTokenStream(fieldInfo.name, reader);
+            fieldState.length++;
+            fieldState.position++;
+
+            hasMoreTokens = stream.incrementToken();
           }
+          // trigger streams to perform end-of-stream operations
+          stream.end();
 
-          // reset the TokenStream to the first token
-          stream.reset();
-
-          try {
-            boolean hasMoreTokens = stream.incrementToken();
-
-            fieldState.attributeSource = stream;
-
-            OffsetAttribute offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
-            PositionIncrementAttribute posIncrAttribute = fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);
-
-            consumer.start(field);
-
-            for(;;) {
-
-              // If we hit an exception in stream.next below
-              // (which is fairly common, eg if analyzer
-              // chokes on a given document), then it's
-              // non-aborting and (above) this one document
-              // will be marked as deleted, but still
-              // consume a docID
-
-              if (!hasMoreTokens) break;
-
-              final int posIncr = posIncrAttribute.getPositionIncrement();
-              fieldState.position += posIncr;
-              if (fieldState.position > 0) {
-                fieldState.position--;
-              }
-
-              if (posIncr == 0)
-                fieldState.numOverlap++;
-
-              boolean success = false;
-              try {
-                // If we hit an exception in here, we abort
-                // all buffered documents since the last
-                // flush, on the likelihood that the
-                // internal state of the consumer is now
-                // corrupt and should not be flushed to a
-                // new segment:
-                consumer.add();
-                success = true;
-              } finally {
-                if (!success) {
-                  docState.docWriter.setAborting();
-                }
-              }
-              fieldState.length++;
-              fieldState.position++;
-
-              hasMoreTokens = stream.incrementToken();
-            }
-            // trigger streams to perform end-of-stream operations
-            stream.end();
-
-            fieldState.offset += offsetAttribute.endOffset();
-          } finally {
-            stream.close();
-          }
+          fieldState.offset += offsetAttribute.endOffset();
+        } finally {
+          stream.close();
         }
 
         fieldState.offset += docState.analyzer == null ? 0 : docState.analyzer.getOffsetGap(field);
diff --git a/lucene/src/java/org/apache/lucene/index/IndexableField.java b/lucene/src/java/org/apache/lucene/index/IndexableField.java
index 3f770b08f16..1ee3f138276 100644
--- a/lucene/src/java/org/apache/lucene/index/IndexableField.java
+++ b/lucene/src/java/org/apache/lucene/index/IndexableField.java
@@ -17,8 +17,10 @@ package org.apache.lucene.index;
  * limitations under the License.
  */
 
+import java.io.IOException;
 import java.io.Reader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.NumericField;
 import org.apache.lucene.index.values.PerDocFieldValues;
@@ -56,9 +58,6 @@ public interface IndexableField {
   /* Non-null if this field has a Reader value */
   public Reader readerValue();
 
-  /* Non-null if this field has a pre-tokenized ({@link TokenStream}) value */
-  public TokenStream tokenStreamValue();
-
   // Numeric field:
 
   /* True if this field is numeric */
   public boolean numeric();
@@ -82,4 +81,15 @@ public interface IndexableField {
 
   /* DocValues type; only used if docValues is non-null */
   public ValueType docValuesType();
+
+  /**
+   * Creates the TokenStream used for indexing this field.  If appropriate,
+   * implementations should use the given Analyzer to create the TokenStream.
+   *
+   * @param analyzer Analyzer that should be used to create the TokenStream
+   * @return TokenStream value for indexing the document.  Should always return
+   *         a non-null value if the field is to be indexed
+   * @throws IOException Can be thrown while creating the TokenStream
+   */
+  public TokenStream tokenStream(Analyzer analyzer) throws IOException;
 }
diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexableField.java b/lucene/src/test/org/apache/lucene/index/TestIndexableField.java
index 1b6e0e66cc7..a61a30a2199 100644
--- a/lucene/src/test/org/apache/lucene/index/TestIndexableField.java
+++ b/lucene/src/test/org/apache/lucene/index/TestIndexableField.java
@@ -17,10 +17,12 @@ package org.apache.lucene.index;
  * limitations under the License.
  */
 
+import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.Iterator;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.NumericField.DataType;
@@ -132,15 +134,6 @@ public class TestIndexableField extends LuceneTestCase {
       }
     }
 
-    @Override
-    public TokenStream tokenStreamValue() {
-      if (numeric()) {
-        return new NumericField(name()).setIntValue(counter).tokenStreamValue();
-      } else {
-        return null;
-      }
-    }
-
     // Numeric field:
     @Override
     public boolean numeric() {
@@ -172,6 +165,15 @@ public class TestIndexableField extends LuceneTestCase {
     public ValueType docValuesType() {
       return null;
     }
+
+    @Override
+    public TokenStream tokenStream(Analyzer analyzer) throws IOException {
+      if (numeric()) {
+        return new NumericField(name()).setIntValue(counter).tokenStream(analyzer);
+      }
+      return readerValue() != null ? analyzer.reusableTokenStream(name(), readerValue()) :
+          analyzer.reusableTokenStream(name(), new StringReader(stringValue()));
+    }
   }
 
   // Silly test showing how to index documents w/o using Lucene's core
diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
index 727a34cb34d..3e2ce924bba 100644
--- a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
+++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
@@ -71,31 +71,7 @@ public class ReadTokensTask extends PerfTask {
     for(final IndexableField field : fields) {
       if (!field.fieldType().tokenized() || field instanceof NumericField) continue;
 
-      final TokenStream stream;
-      final TokenStream streamValue = field.tokenStreamValue();
-
-      if (streamValue != null)
-        stream = streamValue;
-      else {
-        // the field does not have a TokenStream,
-        // so we have to obtain one from the analyzer
-        final Reader reader;                      // find or make Reader
-        final Reader readerValue = field.readerValue();
-
-        if (readerValue != null)
-          reader = readerValue;
-        else {
-          String stringValue = field.stringValue();
-          if (stringValue == null)
-            throw new IllegalArgumentException("field must have either TokenStream, String or Reader value");
-          stringReader.init(stringValue);
-          reader = stringReader;
-        }
-
-        // Tokenize field
-        stream = analyzer.reusableTokenStream(field.name(), reader);
-      }
-
+      final TokenStream stream = field.tokenStream(analyzer);
       // reset the TokenStream to the first token
       stream.reset();
 
diff --git a/solr/core/src/test/org/apache/solr/schema/PolyFieldTest.java b/solr/core/src/test/org/apache/solr/schema/PolyFieldTest.java
index 5a06b7e8368..b9873084858 100644
--- a/solr/core/src/test/org/apache/solr/schema/PolyFieldTest.java
+++ b/solr/core/src/test/org/apache/solr/schema/PolyFieldTest.java
@@ -87,10 +87,9 @@ public class PolyFieldTest extends SolrTestCaseJ4 {
     assertEquals(fields.length, 3);//should be 3, we have a stored field
     //first two fields contain the values, third is just stored and contains the original
     for (int i = 0; i < 3; i++) {
-      boolean hasValue = fields[1].tokenStreamValue() != null
-          || fields[1].binaryValue() != null
-          || fields[1].stringValue() != null;
-      assertTrue("Doesn't have a value: " + fields[1], hasValue);
+      boolean hasValue = fields[i].binaryValue() != null
+          || fields[i].stringValue() != null;
+      assertTrue("Doesn't have a value: " + fields[i], hasValue);
     }
     /*assertTrue("first field " + fields[0].tokenStreamValue() + " is not 35.0", pt.getSubType().toExternal(fields[0]).equals(String.valueOf(xy[0])));
     assertTrue("second field is not -79.34", pt.getSubType().toExternal(fields[1]).equals(String.valueOf(xy[1])));
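
A closing illustration of the un-tokenized path above: for an indexed but
non-tokenized field, Field.tokenStream(Analyzer) returns its own single-token
stream and never consults the analyzer, which is what lets DocInverterPerField
drop parent.singleToken. A sketch, assuming the trunk FieldType class with
setIndexed/setTokenized setters and a Field(String, IndexableFieldType, String)
constructor (both outside this patch):

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;

class SingleTokenStreamSketch {
  public static void main(String[] args) throws IOException {
    final FieldType type = new FieldType();  // assumed trunk API, not part of this patch
    type.setIndexed(true);                   // indexed ...
    type.setTokenized(false);                // ... but not analyzed
    final Field field = new Field("id", type, "doc-42");

    // analyzer may be null here: the non-tokenized path never touches it
    final TokenStream stream = field.tokenStream(null);
    final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      // exactly one token: "doc-42" with offsets [0, 6)
      System.out.println(term + " " + offset.startOffset() + "-" + offset.endOffset());
    }
    stream.end();
    stream.close();
  }
}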