LUCENE-4240: don't invoke the Analyzer for not-analyzed fields, fix offsetGap to just take fieldName

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1363821 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-07-20 15:05:58 +00:00
parent cc90a37ed7
commit 6a4cdbeb05
5 changed files with 52 additions and 18 deletions

View File

@ -50,6 +50,11 @@ API Changes
filter another reader and you override correct() for offset correction.
(Robert Muir)
* LUCENE-4240: Analyzer API now just takes fieldName for getOffsetGap. If the
field is not analyzed (e.g. StringField), then the analyzer is not invoked
at all. If you want to tweak things like positionIncrementGap and offsetGap,
analyze the field with KeywordTokenizer instead. (Grant Ingersoll, Robert Muir)
Optimizations
* LUCENE-4171: Performance improvements to Packed64.

View File

@ -17,7 +17,6 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.util.CloseableThreadLocal;
@ -114,21 +113,15 @@ public abstract class Analyzer {
/**
* Just like {@link #getPositionIncrementGap}, except for
* Token offsets instead. By default this returns 1 for
* tokenized fields and, as if the fields were joined
* with an extra space character, and 0 for un-tokenized
* fields. This method is only called if the field
* Token offsets instead. By default this returns 1.
* This method is only called if the field
* produced at least one token for indexing.
*
* @param field the field just indexed
* @param fieldName the name of the field just indexed
* @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
*/
public int getOffsetGap(IndexableField field) {
if (field.fieldType().tokenized()) {
return 1;
} else {
return 0;
}
public int getOffsetGap(String fieldName) {
return 1;
}
/** Frees persistent resources used by this Analyzer */

View File

@ -17,8 +17,6 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
import org.apache.lucene.index.IndexableField;
import java.io.Reader;
/**
@ -83,8 +81,8 @@ public abstract class AnalyzerWrapper extends Analyzer {
* {@inheritDoc}
*/
@Override
public final int getOffsetGap(IndexableField field) {
return getWrappedAnalyzer(field.name()).getOffsetGap(field);
public final int getOffsetGap(String fieldName) {
return getWrappedAnalyzer(fieldName).getOffsetGap(fieldName);
}
@Override

View File

@ -76,6 +76,7 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
// consumer if it wants to see this particular field
// tokenized.
if (fieldType.indexed() && doInvert) {
final boolean analyzed = fieldType.tokenized() && docState.analyzer != null;
// if the field omits norms, the boost cannot be indexed.
if (fieldType.omitNorms() && field.boost() != 1.0f) {
@ -88,7 +89,7 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
int lastStartOffset = 0;
if (i > 0) {
fieldState.position += docState.analyzer == null ? 0 : docState.analyzer.getPositionIncrementGap(fieldInfo.name);
fieldState.position += analyzed ? docState.analyzer.getPositionIncrementGap(fieldInfo.name) : 0;
}
final TokenStream stream = field.tokenStream(docState.analyzer);
@ -188,7 +189,7 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
}
}
fieldState.offset += docState.analyzer == null ? 0 : docState.analyzer.getOffsetGap(field);
fieldState.offset += analyzed ? docState.analyzer.getOffsetGap(fieldInfo.name) : 0;
fieldState.boost *= field.boost();
}

View File

@ -35,6 +35,7 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
@ -1799,4 +1800,40 @@ public class TestIndexWriter extends LuceneTestCase {
r.close();
dir.close();
}
/**
 * Adds one document holding several instances of a StringField-derived
 * field (term vectors with positions and offsets switched on) through an
 * IndexWriter configured with an Analyzer whose overridden methods all
 * throw. Any call into that Analyzer raises IllegalStateException.
 */
public void testDontInvokeAnalyzerForUnAnalyzedFields() throws Exception {
  // Each override throws, so reaching any of them surfaces as an exception.
  Analyzer tripwire = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      throw new IllegalStateException("don't invoke me!");
    }

    @Override
    public int getPositionIncrementGap(String fieldName) {
      throw new IllegalStateException("don't invoke me!");
    }

    @Override
    public int getOffsetGap(String fieldName) {
      throw new IllegalStateException("don't invoke me!");
    }
  };

  Directory directory = newDirectory();
  IndexWriter writer = new IndexWriter(
      directory, newIndexWriterConfig(TEST_VERSION_CURRENT, tripwire));
  Document document = new Document();

  // Start from the StringField type and additionally record term vectors
  // together with their positions and offsets.
  FieldType vectorType = new FieldType(StringField.TYPE_NOT_STORED);
  vectorType.setStoreTermVectors(true);
  vectorType.setStoreTermVectorPositions(true);
  vectorType.setStoreTermVectorOffsets(true);

  // Same field name repeated: two non-empty values, an empty one, then
  // the non-empty value once more.
  Field nonEmpty = newField("field", "abcd", vectorType);
  document.add(nonEmpty);
  document.add(nonEmpty);
  document.add(newField("field", "", vectorType));
  document.add(nonEmpty);

  writer.addDocument(document);
  writer.close();
  directory.close();
}
}