From 6a4cdbeb05af382d93e8440a88411b32ef60d5ae Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 20 Jul 2012 15:05:58 +0000 Subject: [PATCH] LUCENE-4240: don't invoke the Analyzer for not-analyzed fields, fix offsetGap to just take fieldName git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1363821 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 5 +++ .../org/apache/lucene/analysis/Analyzer.java | 17 +++------ .../lucene/analysis/AnalyzerWrapper.java | 6 +-- .../lucene/index/DocInverterPerField.java | 5 ++- .../apache/lucene/index/TestIndexWriter.java | 37 +++++++++++++++++++ 5 files changed, 52 insertions(+), 18 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 1aa1aee2671..f8e4ecae3c7 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -50,6 +50,11 @@ API Changes filter another reader and you override correct() for offset correction. (Robert Muir) +* LUCENE-4240: Analyzer api now just takes fieldName for getOffsetGap. If the + field is not analyzed (e.g. StringField), then the analyzer is not invoked + at all. If you want to tweak things like positionIncrementGap and offsetGap, + analyze the field with KeywordTokenizer instead. (Grant Ingersoll, Robert Muir) + Optimizations * LUCENE-4171: Performance improvements to Packed64. diff --git a/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java b/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java index fe2008c7cc8..dbeeabe050b 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java @@ -17,7 +17,6 @@ package org.apache.lucene.analysis; * limitations under the License. */ -import org.apache.lucene.index.IndexableField; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.util.CloseableThreadLocal; @@ -114,21 +113,15 @@ public abstract class Analyzer { /** * Just like {@link #getPositionIncrementGap}, except for - * Token offsets instead. By default this returns 1 for - * tokenized fields and, as if the fields were joined - * with an extra space character, and 0 for un-tokenized - * fields. This method is only called if the field + * Token offsets instead. By default this returns 1. + * This method is only called if the field * produced at least one token for indexing. * - * @param field the field just indexed + * @param fieldName the field just indexed * @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)} */ - public int getOffsetGap(IndexableField field) { - if (field.fieldType().tokenized()) { - return 1; - } else { - return 0; - } + public int getOffsetGap(String fieldName) { + return 1; } /** Frees persistent resources used by this Analyzer */ diff --git a/lucene/core/src/java/org/apache/lucene/analysis/AnalyzerWrapper.java b/lucene/core/src/java/org/apache/lucene/analysis/AnalyzerWrapper.java index a7256e95dcc..261075c0676 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/AnalyzerWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/AnalyzerWrapper.java @@ -17,8 +17,6 @@ package org.apache.lucene.analysis; * limitations under the License. */ -import org.apache.lucene.index.IndexableField; - import java.io.Reader; /** @@ -83,8 +81,8 @@ public abstract class AnalyzerWrapper extends Analyzer { * {@inheritDoc} */ @Override - public final int getOffsetGap(IndexableField field) { - return getWrappedAnalyzer(field.name()).getOffsetGap(field); + public final int getOffsetGap(String fieldName) { + return getWrappedAnalyzer(fieldName).getOffsetGap(fieldName); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java b/lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java index 0ef2a6f6a2c..044d4ede08e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocInverterPerField.java @@ -76,6 +76,7 @@ final class DocInverterPerField extends DocFieldConsumerPerField { // consumer if it wants to see this particular field // tokenized. if (fieldType.indexed() && doInvert) { + final boolean analyzed = fieldType.tokenized() && docState.analyzer != null; // if the field omits norms, the boost cannot be indexed. if (fieldType.omitNorms() && field.boost() != 1.0f) { @@ -88,7 +89,7 @@ final class DocInverterPerField extends DocFieldConsumerPerField { int lastStartOffset = 0; if (i > 0) { - fieldState.position += docState.analyzer == null ? 0 : docState.analyzer.getPositionIncrementGap(fieldInfo.name); + fieldState.position += analyzed ? docState.analyzer.getPositionIncrementGap(fieldInfo.name) : 0; } final TokenStream stream = field.tokenStream(docState.analyzer); @@ -188,7 +189,7 @@ final class DocInverterPerField extends DocFieldConsumerPerField { } } - fieldState.offset += docState.analyzer == null ? 0 : docState.analyzer.getOffsetGap(field); + fieldState.offset += analyzed ? docState.analyzer.getOffsetGap(fieldInfo.name) : 0; fieldState.boost *= field.boost(); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index 2bbc7f96d00..661d601b05b 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -35,6 +35,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StoredField; +import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.IndexWriterConfig.OpenMode; @@ -1799,4 +1800,40 @@ public class TestIndexWriter extends LuceneTestCase { r.close(); dir.close(); } + + public void testDontInvokeAnalyzerForUnAnalyzedFields() throws Exception { + Analyzer analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + throw new IllegalStateException("don't invoke me!"); + } + + @Override + public int getPositionIncrementGap(String fieldName) { + throw new IllegalStateException("don't invoke me!"); + } + + @Override + public int getOffsetGap(String fieldName) { + throw new IllegalStateException("don't invoke me!"); + } + }; + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( + TEST_VERSION_CURRENT, analyzer)); + Document doc = new Document(); + FieldType customType = new FieldType(StringField.TYPE_NOT_STORED); + customType.setStoreTermVectors(true); + customType.setStoreTermVectorPositions(true); + customType.setStoreTermVectorOffsets(true); + Field f = newField("field", "abcd", customType); + doc.add(f); + doc.add(f); + Field f2 = newField("field", "", customType); + doc.add(f2); + doc.add(f); + w.addDocument(doc); + w.close(); + dir.close(); + } }