LUCENE-4240: don't invoke the Analyzer for not-analyzed fields, fix offsetGap to just take fieldName

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1363821 13f79535-47bb-0310-9956-ffa450edef68
2012-07-20 15:05:58 +00:00 · 2012-07-20 15:05:58 +00:00 · 6a4cdbeb05
parent cc90a37ed7
commit 6a4cdbeb05
5 changed files with 52 additions and 18 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -50,6 +50,11 @@ API Changes
  filter another reader and you override correct() for offset correction.
  (Robert Muir)

+* LUCENE-4240: Analyzer API now just takes fieldName for getOffsetGap. If the
+  field is not analyzed (e.g. StringField), then the analyzer is not invoked
+  at all. If you want to tweak things like positionIncrementGap and offsetGap,
+  analyze the field with KeywordTokenizer instead.  (Grant Ingersoll, Robert Muir)
+
 Optimizations

 * LUCENE-4171: Performance improvements to Packed64.
--- a/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java
@ -17,7 +17,6 @@ package org.apache.lucene.analysis;
 * limitations under the License.
 */

-import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.store.AlreadyClosedException;
 import org.apache.lucene.util.CloseableThreadLocal;

@ -114,21 +113,15 @@ public abstract class Analyzer {

  /**
   * Just like {@link #getPositionIncrementGap}, except for
-   * Token offsets instead.  By default this returns 1 for
-   * tokenized fields and, as if the fields were joined
-   * with an extra space character, and 0 for un-tokenized
-   * fields.  This method is only called if the field
+   * Token offsets instead.  By default this returns 1.
+   * This method is only called if the field
   * produced at least one token for indexing.
   *
-   * @param field the field just indexed
+   * @param fieldName the name of the field just indexed
   * @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
   */
-  public int getOffsetGap(IndexableField field) {
-    if (field.fieldType().tokenized()) {
-      return 1;
-    } else {
-      return 0;
-    }
+  public int getOffsetGap(String fieldName) {
+    return 1;
  }

  /** Frees persistent resources used by this Analyzer */
--- a/lucene/core/src/java/org/apache/lucene/analysis/AnalyzerWrapper.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/AnalyzerWrapper.java
@ -17,8 +17,6 @@ package org.apache.lucene.analysis;
 * limitations under the License.
 */

-import org.apache.lucene.index.IndexableField;
-
 import java.io.Reader;

 /**
@ -83,8 +81,8 @@ public abstract class AnalyzerWrapper extends Analyzer {
   * {@inheritDoc}
   */
  @Override
-  public final int getOffsetGap(IndexableField field) {
-    return getWrappedAnalyzer(field.name()).getOffsetGap(field);
+  public final int getOffsetGap(String fieldName) {
+    return getWrappedAnalyzer(fieldName).getOffsetGap(fieldName);
  }

  @Override
--- a/lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java
@ -76,6 +76,7 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
      // consumer if it wants to see this particular field
      // tokenized.
      if (fieldType.indexed() && doInvert) {
+        final boolean analyzed = fieldType.tokenized() && docState.analyzer != null;
        
        // if the field omits norms, the boost cannot be indexed.
        if (fieldType.omitNorms() && field.boost() != 1.0f) {
@ -88,7 +89,7 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
        int lastStartOffset = 0;

        if (i > 0) {
-          fieldState.position += docState.analyzer == null ? 0 : docState.analyzer.getPositionIncrementGap(fieldInfo.name);
+          fieldState.position += analyzed ? docState.analyzer.getPositionIncrementGap(fieldInfo.name) : 0;
        }

        final TokenStream stream = field.tokenStream(docState.analyzer);
@ -188,7 +189,7 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
          }
        }

-        fieldState.offset += docState.analyzer == null ? 0 : docState.analyzer.getOffsetGap(field);
+        fieldState.offset += analyzed ? docState.analyzer.getOffsetGap(fieldInfo.name) : 0;
        fieldState.boost *= field.boost();
      }

--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java
@ -35,6 +35,7 @@ import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.StoredField;
+import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
@ -1799,4 +1800,40 @@ public class TestIndexWriter extends LuceneTestCase {
    r.close();
    dir.close();
  }
+  
+  public void testDontInvokeAnalyzerForUnAnalyzedFields() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        throw new IllegalStateException("don't invoke me!");
+      }
+
+      @Override
+      public int getPositionIncrementGap(String fieldName) {
+        throw new IllegalStateException("don't invoke me!");
+      }
+
+      @Override
+      public int getOffsetGap(String fieldName) {
+        throw new IllegalStateException("don't invoke me!");
+      }
+    };
+    Directory dir = newDirectory();
+    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( 
+        TEST_VERSION_CURRENT, analyzer));
+    Document doc = new Document();
+    FieldType customType = new FieldType(StringField.TYPE_NOT_STORED);
+    customType.setStoreTermVectors(true);
+    customType.setStoreTermVectorPositions(true);
+    customType.setStoreTermVectorOffsets(true);
+    Field f = newField("field", "abcd", customType);
+    doc.add(f);
+    doc.add(f);
+    Field f2 = newField("field", "", customType);
+    doc.add(f2);
+    doc.add(f);
+    w.addDocument(doc);
+    w.close();
+    dir.close();
+  }
 }