From c958e107705932c5496524ead824eadb9e0972f5 Mon Sep 17 00:00:00 2001
From: Erik Hatcher <ehatcher@apache.org>
Date: Tue, 22 Nov 2005 01:46:24 +0000
Subject: [PATCH] added getPositionIncrementGap(String fieldName) to Analyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@348060 13f79535-47bb-0310-9956-ffa450edef68
---
 .../org/apache/lucene/analysis/Analyzer.java  |  18 +++
 .../apache/lucene/index/DocumentWriter.java   |   1 +
 .../org/apache/lucene/index/DocHelper.java    |  36 ++++--
 .../lucene/index/TestDocumentWriter.java      | 108 ++++++++++--------
 .../apache/lucene/search/TestPhraseQuery.java |  29 ++++-
 5 files changed, 130 insertions(+), 62 deletions(-)

diff --git a/src/java/org/apache/lucene/analysis/Analyzer.java b/src/java/org/apache/lucene/analysis/Analyzer.java
index 04efca83f58..2495e5d44a5 100644
--- a/src/java/org/apache/lucene/analysis/Analyzer.java
+++ b/src/java/org/apache/lucene/analysis/Analyzer.java
@@ -49,5 +49,23 @@ public abstract class Analyzer {
   {
 	  return tokenStream(null, reader);
   }
+
+  /**
+   * Invoked by DocumentWriter before indexing a Field instance if
+   * terms have already been added to that field.  This allows custom
+   * analyzers to place an automatic position increment gap between
+   * Field instances using the same field name.  The default position
+   * increment gap is 0.  With a 0 position increment gap and
+   * the typical default token position increment of 1, all terms in a field,
+   * including across Field instances, are in successive positions, allowing
+   * exact PhraseQuery matches, for instance, across Field instance boundaries.
+   *
+   * @param fieldName Field name being indexed.
+   * @return position increment gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
+   */
+  public int getPositionIncrementGap(String fieldName)
+  {
+    return 0;
+  }
 }
 
diff --git a/src/java/org/apache/lucene/index/DocumentWriter.java b/src/java/org/apache/lucene/index/DocumentWriter.java
index b669d327bde..250496ea382 100644
--- a/src/java/org/apache/lucene/index/DocumentWriter.java
+++ b/src/java/org/apache/lucene/index/DocumentWriter.java
@@ -135,6 +135,7 @@ final class DocumentWriter {
 
       int length = fieldLengths[fieldNumber];     // length of field
       int position = fieldPositions[fieldNumber]; // position in field
+      if (length>0) position+=analyzer.getPositionIncrementGap(fieldName);
       int offset = fieldOffsets[fieldNumber];       // offset field
 
       if (field.isIndexed()) {
diff --git a/src/test/org/apache/lucene/index/DocHelper.java b/src/test/org/apache/lucene/index/DocHelper.java
index 2d1de60b8dc..5c39611242c 100644
--- a/src/test/org/apache/lucene/index/DocHelper.java
+++ b/src/test/org/apache/lucene/index/DocHelper.java
@@ -1,12 +1,19 @@
 package org.apache.lucene.index;
 
 /**
- * Created by IntelliJ IDEA.
- * User: Grant Ingersoll
- * Date: Feb 2, 2004
- * Time: 6:16:12 PM
- * $Id$
- * Copyright 2004.  Center For Natural Language Processing
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
 import org.apache.lucene.analysis.Analyzer;
@@ -21,10 +28,6 @@ import java.util.HashMap;
 import java.util.Map;
 import java.util.Enumeration;
 
-/**
- *
- *
- **/
 class DocHelper {
   public static final String FIELD_1_TEXT = "field one text";
   public static final String TEXT_FIELD_1_KEY = "textField1";
@@ -68,6 +71,14 @@ class DocHelper {
   public static Field unStoredField2 = new Field(UNSTORED_FIELD_2_KEY, UNSTORED_2_FIELD_TEXT,
       Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES);
 
+  public static final String REPEATED_1_TEXT = "repeated one";
+  public static final String REPEATED_KEY = "repeated";
+  public static Field repeatedField1 = new Field(REPEATED_KEY, REPEATED_1_TEXT,
+      Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO);
+  public static final String REPEATED_2_TEXT = "repeated two";
+  public static Field repeatedField2 = new Field(REPEATED_KEY, REPEATED_2_TEXT,
+      Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO);
+
   public static Map nameValues = null;
 
   // ordered list of all the fields...
@@ -81,6 +92,8 @@ class DocHelper {
     unIndField,
     unStoredField1,
     unStoredField2,
+    repeatedField1,
+    repeatedField2
   };
 
   // Map<String fieldName, Field field>
@@ -156,9 +169,8 @@ class DocHelper {
    */ 
   public static void writeDoc(Directory dir, String segment, Document doc) throws IOException
   {
-    Analyzer analyzer = new WhitespaceAnalyzer();
     Similarity similarity = Similarity.getDefault();
-    writeDoc(dir, analyzer, similarity, segment, doc);
+    writeDoc(dir, new WhitespaceAnalyzer(), similarity, segment, doc);
   }
 
   /**
diff --git a/src/test/org/apache/lucene/index/TestDocumentWriter.java b/src/test/org/apache/lucene/index/TestDocumentWriter.java
index 94753f6b8ed..790a12d6c03 100644
--- a/src/test/org/apache/lucene/index/TestDocumentWriter.java
+++ b/src/test/org/apache/lucene/index/TestDocumentWriter.java
@@ -17,14 +17,16 @@ package org.apache.lucene.index;
  */
 
 import junit.framework.TestCase;
-import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
-import org.apache.lucene.search.Similarity;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.store.RAMDirectory;
 
-import java.io.IOException;
+import java.io.Reader;
 
 public class TestDocumentWriter extends TestCase {
   private RAMDirectory dir = new RAMDirectory();
@@ -48,54 +50,62 @@ public class TestDocumentWriter extends TestCase {
 
   }
 
-  public void testAddDocument() {
-    Analyzer analyzer = new WhitespaceAnalyzer();
-    Similarity similarity = Similarity.getDefault();
-    DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
-    assertTrue(writer != null);
-    try {
-      String segName="test";
-      writer.addDocument(segName, testDoc);
-      //After adding the document, we should be able to read it back in
-      SegmentReader reader = SegmentReader.get(new SegmentInfo(segName, 1, dir));
-      assertTrue(reader != null);
-      Document doc = reader.document(0);
-      assertTrue(doc != null);
-      
-      //System.out.println("Document: " + doc);
-      Field [] fields = doc.getFields("textField2");
-      assertTrue(fields != null && fields.length == 1);
-      assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_2_TEXT));
-      assertTrue(fields[0].isTermVectorStored() == true);
-      
-      fields = doc.getFields("textField1");
-      assertTrue(fields != null && fields.length == 1);
-      assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_1_TEXT));
-      assertTrue(fields[0].isTermVectorStored() == false);
-      
-      fields = doc.getFields("keyField");
-      assertTrue(fields != null && fields.length == 1);
-      assertTrue(fields[0].stringValue().equals(DocHelper.KEYWORD_TEXT));
-
-      fields = doc.getFields(DocHelper.NO_NORMS_KEY);
-      assertTrue(fields != null && fields.length == 1);
-      assertTrue(fields[0].stringValue().equals(DocHelper.NO_NORMS_TEXT));
-
-      fields = doc.getFields(DocHelper.TEXT_FIELD_3_KEY);
-      assertTrue(fields != null && fields.length == 1);
-      assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_3_TEXT));
-
-      // test that the norm file is not present if omitNorms is true
-      for (int i=0; i<reader.fieldInfos.size(); i++) {
-        FieldInfo fi = reader.fieldInfos.fieldInfo(i);
-        if (fi.isIndexed) {
-          assertTrue(fi.omitNorms == !dir.fileExists(segName + ".f" + i));
-        }
+  public void testAddDocument() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+      public TokenStream tokenStream(String fieldName, Reader reader) {
+        return new WhitespaceTokenizer(reader);
       }
 
-    } catch (IOException e) {
-      e.printStackTrace();
-      assertTrue(false);
+      public int getPositionIncrementGap(String fieldName) {
+        return 500;
+      }
+    };
+    Similarity similarity = Similarity.getDefault();
+    DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
+    String segName = "test";
+    writer.addDocument(segName, testDoc);
+    //After adding the document, we should be able to read it back in
+    SegmentReader reader = SegmentReader.get(new SegmentInfo(segName, 1, dir));
+    assertTrue(reader != null);
+    Document doc = reader.document(0);
+    assertTrue(doc != null);
+
+    //System.out.println("Document: " + doc);
+    Field [] fields = doc.getFields("textField2");
+    assertTrue(fields != null && fields.length == 1);
+    assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_2_TEXT));
+    assertTrue(fields[0].isTermVectorStored());
+
+    fields = doc.getFields("textField1");
+    assertTrue(fields != null && fields.length == 1);
+    assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_1_TEXT));
+    assertFalse(fields[0].isTermVectorStored());
+
+    fields = doc.getFields("keyField");
+    assertTrue(fields != null && fields.length == 1);
+    assertTrue(fields[0].stringValue().equals(DocHelper.KEYWORD_TEXT));
+
+    fields = doc.getFields(DocHelper.NO_NORMS_KEY);
+    assertTrue(fields != null && fields.length == 1);
+    assertTrue(fields[0].stringValue().equals(DocHelper.NO_NORMS_TEXT));
+
+    fields = doc.getFields(DocHelper.TEXT_FIELD_3_KEY);
+    assertTrue(fields != null && fields.length == 1);
+    assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_3_TEXT));
+
+    // test that the norm file is not present if omitNorms is true
+    for (int i = 0; i < reader.fieldInfos.size(); i++) {
+      FieldInfo fi = reader.fieldInfos.fieldInfo(i);
+      if (fi.isIndexed) {
+        assertTrue(fi.omitNorms == !dir.fileExists(segName + ".f" + i));
+      }
     }
+
+    TermPositions termPositions = reader.termPositions(new Term(DocHelper.REPEATED_KEY, "repeated"));
+    assertTrue(termPositions.next());
+    int freq = termPositions.freq();
+    assertEquals(2, freq);
+    assertEquals(0, termPositions.nextPosition());
+    assertEquals(502, termPositions.nextPosition());
   }
 }
diff --git a/src/test/org/apache/lucene/search/TestPhraseQuery.java b/src/test/org/apache/lucene/search/TestPhraseQuery.java
index 940bf48d7f6..2fa0410b5b4 100644
--- a/src/test/org/apache/lucene/search/TestPhraseQuery.java
+++ b/src/test/org/apache/lucene/search/TestPhraseQuery.java
@@ -17,10 +17,14 @@ package org.apache.lucene.search;
  */
 
 import java.io.IOException;
+import java.io.Reader;
 
 import junit.framework.TestCase;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexWriter;
@@ -41,10 +45,22 @@ public class TestPhraseQuery extends TestCase {
 
   public void setUp() throws Exception {
     directory = new RAMDirectory();
-    IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true);
+    Analyzer analyzer = new Analyzer() {
+      public TokenStream tokenStream(String fieldName, Reader reader) {
+        return new WhitespaceTokenizer(reader);
+      }
+
+      public int getPositionIncrementGap(String fieldName) {
+        return 100;
+      }
+    };
+    IndexWriter writer = new IndexWriter(directory, analyzer, true);
     
     Document doc = new Document();
     doc.add(new Field("field", "one two three four five", Field.Store.YES, Field.Index.TOKENIZED));
+    doc.add(new Field("repeated", "this is a repeated field - first part", Field.Store.YES, Field.Index.TOKENIZED));
+    Field repeatedField = new Field("repeated", "second part of a repeated field", Field.Store.YES, Field.Index.TOKENIZED);
+    doc.add(repeatedField);
     writer.addDocument(doc);
     
     writer.optimize();
@@ -294,4 +310,15 @@ public class TestPhraseQuery extends TestCase {
     assertEquals(2, hits.id(2));
   }
 
+  public void testWrappedPhrase() throws IOException {
+    query.add(new Term("repeated", "first"));
+    query.add(new Term("repeated", "part"));
+    query.add(new Term("repeated", "second"));
+    query.add(new Term("repeated", "part"));
+    query.setSlop(99);
+
+    Hits hits = searcher.search(query);
+    assertEquals(0, hits.length());
+  }
+
 }