From c958e107705932c5496524ead824eadb9e0972f5 Mon Sep 17 00:00:00 2001 From: Erik Hatcher Date: Tue, 22 Nov 2005 01:46:24 +0000 Subject: [PATCH] added getPositionIncrementGap(String fieldName) to Analyzer git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@348060 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/lucene/analysis/Analyzer.java | 18 +++ .../apache/lucene/index/DocumentWriter.java | 1 + .../org/apache/lucene/index/DocHelper.java | 36 ++++-- .../lucene/index/TestDocumentWriter.java | 108 ++++++++++-------- .../apache/lucene/search/TestPhraseQuery.java | 29 ++++- 5 files changed, 130 insertions(+), 62 deletions(-) diff --git a/src/java/org/apache/lucene/analysis/Analyzer.java b/src/java/org/apache/lucene/analysis/Analyzer.java index 04efca83f58..2495e5d44a5 100644 --- a/src/java/org/apache/lucene/analysis/Analyzer.java +++ b/src/java/org/apache/lucene/analysis/Analyzer.java @@ -49,5 +49,23 @@ public abstract class Analyzer { { return tokenStream(null, reader); } + + /** + * Invoked by DocumentWriter before indexing a Field instance if + * terms have already been added to that field. This allows custom + * analyzers to place an automatic position increment gap between + * Field instances using the same field name. The default + * position increment gap is 0. With a 0 position increment gap and + * the typical default token position increment of 1, all terms in a field, + * including across Field instances, are in successive positions, allowing + * exact PhraseQuery matches, for instance, across Field instance boundaries. + * + * @param fieldName Field name being indexed. 
+ * @return position increment gap, added to the next token emitted from {@link #tokenStream(String,Reader)} + */ + public int getPositionIncrementGap(String fieldName) + { + return 0; + } } diff --git a/src/java/org/apache/lucene/index/DocumentWriter.java b/src/java/org/apache/lucene/index/DocumentWriter.java index b669d327bde..250496ea382 100644 --- a/src/java/org/apache/lucene/index/DocumentWriter.java +++ b/src/java/org/apache/lucene/index/DocumentWriter.java @@ -135,6 +135,7 @@ final class DocumentWriter { int length = fieldLengths[fieldNumber]; // length of field int position = fieldPositions[fieldNumber]; // position in field + if (length>0) position+=analyzer.getPositionIncrementGap(fieldName); int offset = fieldOffsets[fieldNumber]; // offset field if (field.isIndexed()) { diff --git a/src/test/org/apache/lucene/index/DocHelper.java b/src/test/org/apache/lucene/index/DocHelper.java index 2d1de60b8dc..5c39611242c 100644 --- a/src/test/org/apache/lucene/index/DocHelper.java +++ b/src/test/org/apache/lucene/index/DocHelper.java @@ -1,12 +1,19 @@ package org.apache.lucene.index; /** - * Created by IntelliJ IDEA. - * User: Grant Ingersoll - * Date: Feb 2, 2004 - * Time: 6:16:12 PM - * $Id$ - * Copyright 2004. Center For Natural Language Processing + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ import org.apache.lucene.analysis.Analyzer; @@ -21,10 +28,6 @@ import java.util.HashMap; import java.util.Map; import java.util.Enumeration; -/** - * - * - **/ class DocHelper { public static final String FIELD_1_TEXT = "field one text"; public static final String TEXT_FIELD_1_KEY = "textField1"; @@ -68,6 +71,14 @@ class DocHelper { public static Field unStoredField2 = new Field(UNSTORED_FIELD_2_KEY, UNSTORED_2_FIELD_TEXT, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES); + public static final String REPEATED_1_TEXT = "repeated one"; + public static final String REPEATED_KEY = "repeated"; + public static Field repeatedField1 = new Field(REPEATED_KEY, REPEATED_1_TEXT, + Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO); + public static final String REPEATED_2_TEXT = "repeated two"; + public static Field repeatedField2 = new Field(REPEATED_KEY, REPEATED_2_TEXT, + Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO); + public static Map nameValues = null; // ordered list of all the fields... 
@@ -81,6 +92,8 @@ class DocHelper { unIndField, unStoredField1, unStoredField2, + repeatedField1, + repeatedField2 }; // Map @@ -156,9 +169,8 @@ class DocHelper { */ public static void writeDoc(Directory dir, String segment, Document doc) throws IOException { - Analyzer analyzer = new WhitespaceAnalyzer(); Similarity similarity = Similarity.getDefault(); - writeDoc(dir, analyzer, similarity, segment, doc); + writeDoc(dir, new WhitespaceAnalyzer(), similarity, segment, doc); } /** diff --git a/src/test/org/apache/lucene/index/TestDocumentWriter.java b/src/test/org/apache/lucene/index/TestDocumentWriter.java index 94753f6b8ed..790a12d6c03 100644 --- a/src/test/org/apache/lucene/index/TestDocumentWriter.java +++ b/src/test/org/apache/lucene/index/TestDocumentWriter.java @@ -17,14 +17,16 @@ package org.apache.lucene.index; */ import junit.framework.TestCase; -import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.WhitespaceAnalyzer; -import org.apache.lucene.search.Similarity; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.WhitespaceTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.search.Similarity; +import org.apache.lucene.store.RAMDirectory; -import java.io.IOException; +import java.io.Reader; public class TestDocumentWriter extends TestCase { private RAMDirectory dir = new RAMDirectory(); @@ -48,54 +50,62 @@ public class TestDocumentWriter extends TestCase { } - public void testAddDocument() { - Analyzer analyzer = new WhitespaceAnalyzer(); - Similarity similarity = Similarity.getDefault(); - DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50); - assertTrue(writer != null); - try { - String segName="test"; - writer.addDocument(segName, testDoc); - //After adding the document, we should be able to read it back in - SegmentReader reader = SegmentReader.get(new 
SegmentInfo(segName, 1, dir)); - assertTrue(reader != null); - Document doc = reader.document(0); - assertTrue(doc != null); - - //System.out.println("Document: " + doc); - Field [] fields = doc.getFields("textField2"); - assertTrue(fields != null && fields.length == 1); - assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_2_TEXT)); - assertTrue(fields[0].isTermVectorStored() == true); - - fields = doc.getFields("textField1"); - assertTrue(fields != null && fields.length == 1); - assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_1_TEXT)); - assertTrue(fields[0].isTermVectorStored() == false); - - fields = doc.getFields("keyField"); - assertTrue(fields != null && fields.length == 1); - assertTrue(fields[0].stringValue().equals(DocHelper.KEYWORD_TEXT)); - - fields = doc.getFields(DocHelper.NO_NORMS_KEY); - assertTrue(fields != null && fields.length == 1); - assertTrue(fields[0].stringValue().equals(DocHelper.NO_NORMS_TEXT)); - - fields = doc.getFields(DocHelper.TEXT_FIELD_3_KEY); - assertTrue(fields != null && fields.length == 1); - assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_3_TEXT)); - - // test that the norm file is not present if omitNorms is true - for (int i=0; i