mirror of https://github.com/apache/lucene.git
LUCENE-2309: Moved to Field.tokenStream(Analyzer)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1174506 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent c8b7bb7aac
commit 5d4502ad0a
@@ -172,7 +172,7 @@ Changes in backwards compatibility policy
   (Nikola Tankovic, Mike McCandless, Chris Male)

 * LUCENE-3396: ReusableAnalyzerBase.TokenStreamComponents.reset(Reader) now returns void instead
-  of boolean. If a Component cannot be reset, it should throw an Exception.
+  of boolean. If a Component cannot be reset, it should throw an Exception. (Chris Male)

 Changes in Runtime Behavior
@@ -536,6 +536,9 @@ New features
   ScoreDoc (e.g. last document on the previous page) to support deep paging use cases.
   (Aaron McCurry, Grant Ingersoll, Robert Muir)

+* LUCENE-2309: Added IndexableField.tokenStream(Analyzer) which is now responsible for
+  creating the TokenStreams for Fields when they are to be indexed. (Chris Male)
+
 Optimizations

 * LUCENE-2588: Don't store unnecessary suffixes when writing the terms
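The practical effect of the LUCENE-2309 entry above is that consumers stop branching on a field's value type themselves. A minimal before/after sketch, assuming a Field and an Analyzer in scope (class and method names hypothetical, not tied to an exact trunk revision):

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.document.Field;

    class TokenStreamPlumbing {
      // Before: each consumer re-implemented the value-type branching.
      static TokenStream oldWay(Field field, Analyzer analyzer) throws IOException {
        if (field.tokenStreamValue() != null) {
          return field.tokenStreamValue();
        } else if (field.readerValue() != null) {
          return analyzer.reusableTokenStream(field.name(), field.readerValue());
        } else {
          return analyzer.reusableTokenStream(field.name(), new StringReader(field.stringValue()));
        }
      }

      // After: the field itself is responsible for producing its TokenStream.
      static TokenStream newWay(Field field, Analyzer analyzer) throws IOException {
        return field.tokenStream(analyzer);
      }
    }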
@@ -525,14 +525,7 @@ public class InstantiatedIndexWriter implements Closeable {
       tokensByField.put(field, tokens);

       if (field.fieldType().tokenized()) {
-        final TokenStream tokenStream;
-        // todo readerValue(), binaryValue()
-        if (field.tokenStreamValue() != null) {
-          tokenStream = field.tokenStreamValue();
-        } else {
-          tokenStream = analyzer.reusableTokenStream(field.name(), new StringReader(field.stringValue()));
-        }
-
+        final TokenStream tokenStream = field.tokenStream(analyzer);
         // reset the TokenStream to the first token
         tokenStream.reset();
@@ -19,7 +19,6 @@ package org.apache.lucene.document;
 import java.io.IOException;
 import java.io.Reader;

-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.NumericField.DataType;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldReaderException;
@@ -246,14 +245,6 @@ public class FieldSelectorVisitor extends StoredFieldVisitor {
       return null;
     }

-    /** The value of the field as a TokenStream, or null. If null, the Reader value,
-     * String value, or binary value is used. Exactly one of stringValue(),
-     * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
-    @Override
-    public TokenStream tokenStreamValue() {
-      return null;
-    }
-
     /** The value of the field as a String, or null. If null, the Reader value,
      * binary value, or TokenStream value is used. Exactly one of stringValue(),
      * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
@@ -17,9 +17,14 @@ package org.apache.lucene.document;
  * limitations under the License.
  */

+import java.io.IOException;
 import java.io.Reader;
+import java.io.StringReader;

+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.index.IndexableFieldType;
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.values.PerDocFieldValues;
@@ -62,6 +67,9 @@ public class Field implements IndexableField {
     if (reader == null) {
       throw new NullPointerException("reader cannot be null");
     }
+    if (type.indexed() && !type.tokenized()) {
+      throw new IllegalArgumentException("Non-tokenized fields must use String values");
+    }

     this.name = name;
     this.fieldsData = reader;
@@ -75,6 +83,9 @@ public class Field implements IndexableField {
     if (tokenStream == null) {
       throw new NullPointerException("tokenStream cannot be null");
     }
+    if (type.indexed() && !type.tokenized()) {
+      throw new IllegalArgumentException("Non-tokenized fields must use String values");
+    }

     this.name = name;
     this.fieldsData = null;
@@ -87,12 +98,14 @@ public class Field implements IndexableField {
   }

   public Field(String name, IndexableFieldType type, byte[] value, int offset, int length) {
-    this.fieldsData = new BytesRef(value, offset, length);
-    this.type = type;
-    this.name = name;
+    this(name, type, new BytesRef(value, offset, length));
   }

   public Field(String name, IndexableFieldType type, BytesRef bytes) {
+    if (type.indexed() && !type.tokenized()) {
+      throw new IllegalArgumentException("Non-tokenized fields must use String values");
+    }
+
     this.fieldsData = bytes;
     this.type = type;
     this.name = name;
@@ -297,4 +310,51 @@ public class Field implements IndexableField {
   public IndexableFieldType fieldType() {
     return type;
   }
+
+  /**
+   * {@inheritDoc}
+   */
+  public TokenStream tokenStream(Analyzer analyzer) throws IOException {
+    if (!fieldType().indexed()) {
+      return null;
+    }
+
+    if (!fieldType().tokenized()) {
+      if (stringValue() == null) {
+        throw new IllegalArgumentException("Non-Tokenized Fields must have a String value");
+      }
+
+      return new TokenStream() {
+        CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+        OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+        boolean used;
+
+        @Override
+        public boolean incrementToken() throws IOException {
+          if (used) {
+            return false;
+          }
+          termAttribute.setEmpty().append(stringValue());
+          offsetAttribute.setOffset(0, stringValue().length());
+          used = true;
+          return true;
+        }
+
+        @Override
+        public void reset() throws IOException {
+          used = false;
+        }
+      };
+    }
+
+    if (tokenStream != null) {
+      return tokenStream;
+    } else if (readerValue() != null) {
+      return analyzer.reusableTokenStream(name(), readerValue());
+    } else if (stringValue() != null) {
+      return analyzer.reusableTokenStream(name(), new StringReader(stringValue()));
+    }
+
+    throw new IllegalArgumentException("Field must have either TokenStream, String or Reader value");
+  }
 }
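The anonymous TokenStream in the new Field.tokenStream(Analyzer) above emits the field's entire string value as a single token with offsets [0, length). A hedged sketch of what a consumer would observe, assuming `field` is indexed but not tokenized (the analyzer argument is irrelevant on this path):

    // Sketch only: addAttribute returns the instances the stream already registered.
    TokenStream ts = field.tokenStream(analyzer);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);

    ts.reset();                    // clears the 'used' flag
    while (ts.incrementToken()) {  // returns true exactly once
      System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
    }
    ts.end();
    ts.close();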
@@ -307,13 +307,6 @@ public class IndexDocValuesField extends Field implements PerDocFieldValues {
     return null;
   }

-  /**
-   * Returns always <code>null</code>
-   */
-  public TokenStream tokenStreamValue() {
-    return null;
-  }
-
   @Override
   public ValueType docValuesType() {
     return type;
@@ -19,6 +19,7 @@ package org.apache.lucene.document;

 import java.io.Reader;

+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.NumericTokenStream;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
@@ -237,7 +238,7 @@ public final class NumericField extends Field {
   }

   /** Returns a {@link NumericTokenStream} for indexing the numeric value. */
-  public TokenStream tokenStreamValue() {
+  public TokenStream tokenStream(Analyzer analyzer) {
     if (!type.indexed()) return null;
     if (numericTS == null) {
       // lazy init the TokenStream as it is heavy to instantiate
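Note that NumericField's override ignores the Analyzer argument: numeric values are always indexed through the lazily instantiated NumericTokenStream. A hedged usage sketch, mirroring the pattern the test code later in this commit uses (field name and value are illustrative):

    // The analyzer plays no role for numeric fields; passing null here is a
    // sketch, not a documented contract.
    TokenStream ts = new NumericField("price").setIntValue(42).tokenStream(null);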
@@ -75,121 +75,70 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
     // consumer if it wants to see this particular field
     // tokenized.
     if (field.fieldType().indexed() && doInvert) {

       if (i > 0)
         fieldState.position += docState.analyzer == null ? 0 : docState.analyzer.getPositionIncrementGap(fieldInfo.name);

-      // TODO (LUCENE-2309): this analysis logic should be
-      // outside of indexer -- field should simply give us
-      // a TokenStream, even for multi-valued fields
-
-      if (!field.fieldType().tokenized()) {     // un-tokenized field
-        final String stringValue = field.stringValue();
-        assert stringValue != null;
-        final int valueLength = stringValue.length();
-        parent.singleToken.reinit(stringValue, 0, valueLength);
-        fieldState.attributeSource = parent.singleToken;
-        consumer.start(field);
-
-        boolean success = false;
-        try {
-          consumer.add();
-          success = true;
-        } finally {
-          if (!success) {
-            docState.docWriter.setAborting();
-          }
-        }
-        fieldState.offset += valueLength;
-        fieldState.length++;
-        fieldState.position++;
-      } else {                                  // tokenized field
-        final TokenStream stream;
-        final TokenStream streamValue = field.tokenStreamValue();
-
-        if (streamValue != null) {
-          stream = streamValue;
-        } else {
-          // the field does not have a TokenStream,
-          // so we have to obtain one from the analyzer
-          final Reader reader;                  // find or make Reader
-          final Reader readerValue = field.readerValue();
-
-          if (readerValue != null) {
-            reader = readerValue;
-          } else {
-            String stringValue = field.stringValue();
-            if (stringValue == null) {
-              throw new IllegalArgumentException("field must have either TokenStream, String or Reader value");
-            }
-            parent.stringReader.init(stringValue);
-            reader = parent.stringReader;
-          }
-
-          // Tokenize field and add to postingTable
-          stream = docState.analyzer.reusableTokenStream(fieldInfo.name, reader);
-        }
-
-        // reset the TokenStream to the first token
-        stream.reset();
-
-        try {
-          boolean hasMoreTokens = stream.incrementToken();
-
-          fieldState.attributeSource = stream;
-
-          OffsetAttribute offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
-          PositionIncrementAttribute posIncrAttribute = fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);
-
-          consumer.start(field);
-
-          for (;;) {
-
-            // If we hit an exception in stream.next below
-            // (which is fairly common, eg if analyzer
-            // chokes on a given document), then it's
-            // non-aborting and (above) this one document
-            // will be marked as deleted, but still
-            // consume a docID
-
-            if (!hasMoreTokens) break;
-
-            final int posIncr = posIncrAttribute.getPositionIncrement();
-            fieldState.position += posIncr;
-            if (fieldState.position > 0) {
-              fieldState.position--;
-            }
-
-            if (posIncr == 0)
-              fieldState.numOverlap++;
-
-            boolean success = false;
-            try {
-              // If we hit an exception in here, we abort
-              // all buffered documents since the last
-              // flush, on the likelihood that the
-              // internal state of the consumer is now
-              // corrupt and should not be flushed to a
-              // new segment:
-              consumer.add();
-              success = true;
-            } finally {
-              if (!success) {
-                docState.docWriter.setAborting();
-              }
-            }
-            fieldState.length++;
-            fieldState.position++;
-
-            hasMoreTokens = stream.incrementToken();
-          }
-          // trigger streams to perform end-of-stream operations
-          stream.end();
-
-          fieldState.offset += offsetAttribute.endOffset();
-        } finally {
-          stream.close();
-        }
-      }
+      final TokenStream stream = field.tokenStream(docState.analyzer);
+      // reset the TokenStream to the first token
+      stream.reset();
+
+      try {
+        boolean hasMoreTokens = stream.incrementToken();
+
+        fieldState.attributeSource = stream;
+
+        OffsetAttribute offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
+        PositionIncrementAttribute posIncrAttribute = fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);
+
+        consumer.start(field);
+
+        for(;;) {
+
+          // If we hit an exception in stream.next below
+          // (which is fairly common, eg if analyzer
+          // chokes on a given document), then it's
+          // non-aborting and (above) this one document
+          // will be marked as deleted, but still
+          // consume a docID
+
+          if (!hasMoreTokens) break;
+
+          final int posIncr = posIncrAttribute.getPositionIncrement();
+          fieldState.position += posIncr;
+          if (fieldState.position > 0) {
+            fieldState.position--;
+          }
+
+          if (posIncr == 0)
+            fieldState.numOverlap++;
+
+          boolean success = false;
+          try {
+            // If we hit an exception in here, we abort
+            // all buffered documents since the last
+            // flush, on the likelihood that the
+            // internal state of the consumer is now
+            // corrupt and should not be flushed to a
+            // new segment:
+            consumer.add();
+            success = true;
+          } finally {
+            if (!success) {
+              docState.docWriter.setAborting();
+            }
+          }
+          fieldState.length++;
+          fieldState.position++;
+
+          hasMoreTokens = stream.incrementToken();
+        }
+        // trigger streams to perform end-of-stream operations
+        stream.end();
+
+        fieldState.offset += offsetAttribute.endOffset();
+      } finally {
+        stream.close();
+      }

       fieldState.offset += docState.analyzer == null ? 0 : docState.analyzer.getOffsetGap(field);
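Only the way the indexer obtains the stream changed here; the consumption around it is the standard TokenStream contract, which the rewritten loop above still follows. A stripped-down illustration of that contract (attribute reads elided):

    TokenStream stream = field.tokenStream(docState.analyzer);
    stream.reset();                       // 1. rewind to the first token
    try {
      while (stream.incrementToken()) {   // 2. advance token by token
        // read term/offset/position-increment attributes here
      }
      stream.end();                       // 3. record end-of-stream state (final offset)
    } finally {
      stream.close();                     // 4. always release the stream
    }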
@@ -17,8 +17,10 @@ package org.apache.lucene.index;
  * limitations under the License.
  */

+import java.io.IOException;
 import java.io.Reader;

+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.NumericField;
 import org.apache.lucene.index.values.PerDocFieldValues;
@@ -56,9 +58,6 @@ public interface IndexableField {
   /* Non-null if this field has a Reader value */
   public Reader readerValue();

-  /* Non-null if this field has a pre-tokenized ({@link TokenStream}) value */
-  public TokenStream tokenStreamValue();
-
   // Numeric field:
   /* True if this field is numeric */
   public boolean numeric();
@@ -82,4 +81,15 @@ public interface IndexableField {

   /* DocValues type; only used if docValues is non-null */
   public ValueType docValuesType();
+
+  /**
+   * Creates the TokenStream used for indexing this field. If appropriate,
+   * implementations should use the given Analyzer to create the TokenStreams.
+   *
+   * @param analyzer Analyzer that should be used to create the TokenStreams from
+   * @return TokenStream value for indexing the document. Should always return
+   *         a non-null value if the field is to be indexed
+   * @throws IOException Can be thrown while creating the TokenStream
+   */
+  public TokenStream tokenStream(Analyzer analyzer) throws IOException;
 }
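One consequence of putting tokenStream(Analyzer) on the interface is that pre-analyzed fields can satisfy the contract without a separate tokenStreamValue() accessor. A hedged sketch of such an implementation (the class shape and the `cached` field are hypothetical):

    // Hypothetical pre-analyzed field: it carries its own TokenStream and may
    // ignore the Analyzer, which the new contract permits.
    @Override
    public TokenStream tokenStream(Analyzer analyzer) throws IOException {
      if (!fieldType().indexed()) {
        return null;  // per the javadoc: non-null only when the field is indexed
      }
      return cached;  // assumed to be set when the field was constructed
    }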
@@ -17,10 +17,12 @@ package org.apache.lucene.index;
  * limitations under the License.
  */

 import java.io.IOException;
 import java.io.Reader;
+import java.io.StringReader;
 import java.util.Iterator;

+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.NumericField.DataType;
@@ -132,15 +134,6 @@ public class TestIndexableField extends LuceneTestCase {
       }
     }

-    @Override
-    public TokenStream tokenStreamValue() {
-      if (numeric()) {
-        return new NumericField(name()).setIntValue(counter).tokenStreamValue();
-      } else {
-        return null;
-      }
-    }
-
     // Numeric field:
     @Override
     public boolean numeric() {
@@ -172,6 +165,15 @@ public class TestIndexableField extends LuceneTestCase {
     public ValueType docValuesType() {
       return null;
     }
+
+    @Override
+    public TokenStream tokenStream(Analyzer analyzer) throws IOException {
+      if (numeric()) {
+        return new NumericField(name()).setIntValue(counter).tokenStream(analyzer);
+      }
+      return readerValue() != null ? analyzer.reusableTokenStream(name(), readerValue())
+                                   : analyzer.reusableTokenStream(name(), new StringReader(stringValue()));
+    }
   }

   // Silly test showing how to index documents w/o using Lucene's core
@@ -71,31 +71,7 @@ public class ReadTokensTask extends PerfTask {
     for(final IndexableField field : fields) {
       if (!field.fieldType().tokenized() || field instanceof NumericField) continue;

-      final TokenStream stream;
-      final TokenStream streamValue = field.tokenStreamValue();
-
-      if (streamValue != null)
-        stream = streamValue;
-      else {
-        // the field does not have a TokenStream,
-        // so we have to obtain one from the analyzer
-        final Reader reader;                    // find or make Reader
-        final Reader readerValue = field.readerValue();
-
-        if (readerValue != null)
-          reader = readerValue;
-        else {
-          String stringValue = field.stringValue();
-          if (stringValue == null)
-            throw new IllegalArgumentException("field must have either TokenStream, String or Reader value");
-          stringReader.init(stringValue);
-          reader = stringReader;
-        }
-
-        // Tokenize field
-        stream = analyzer.reusableTokenStream(field.name(), reader);
-      }
-
+      final TokenStream stream = field.tokenStream(analyzer);
       // reset the TokenStream to the first token
       stream.reset();
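With the branching gone, ReadTokensTask's token counting presumably reduces to the generic consumption loop. A hedged sketch of the counting path (the tally variable is illustrative; the task's real bookkeeping may differ):

    final TokenStream stream = field.tokenStream(analyzer);
    stream.reset();
    int tokenCount = 0;
    while (stream.incrementToken()) {
      tokenCount++;   // one increment per token produced by the analyzer
    }
    stream.end();
    stream.close();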
@@ -87,10 +87,9 @@ public class PolyFieldTest extends SolrTestCaseJ4 {
     assertEquals(fields.length, 3);//should be 3, we have a stored field
     //first two fields contain the values, third is just stored and contains the original
     for (int i = 0; i < 3; i++) {
-      boolean hasValue = fields[1].tokenStreamValue() != null
-          || fields[1].binaryValue() != null
-          || fields[1].stringValue() != null;
-      assertTrue("Doesn't have a value: " + fields[1], hasValue);
+      boolean hasValue = fields[i].binaryValue() != null
+          || fields[i].stringValue() != null;
+      assertTrue("Doesn't have a value: " + fields[i], hasValue);
     }
     /*assertTrue("first field " + fields[0].tokenStreamValue() + " is not 35.0", pt.getSubType().toExternal(fields[0]).equals(String.valueOf(xy[0])));
     assertTrue("second field is not -79.34", pt.getSubType().toExternal(fields[1]).equals(String.valueOf(xy[1])));