LUCENE-1542: properly index first token(s) with 0 position increment

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@780220 13f79535-47bb-0310-9956-ffa450edef68
2009-05-30 09:36:10 +00:00 · 2009-05-30 09:36:10 +00:00 · 80a79f5bee
parent 5f6d0c7bd6
commit 80a79f5bee
8 changed files with 230 additions and 7 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -71,6 +71,17 @@ Changes in runtime behavior
    with SortField.FIELD_DOC (it was unnecessary as Lucene breaks ties
    internally by docID). (Shai Erera via Michael McCandless)

+ 6. LUCENE-1542: When the first token(s) have 0 position increment,
+    IndexWriter used to incorrectly record the position as -1, if no
+    payload is present, or Integer.MAX_VALUE if a payload is present.
+    This causes positional queries to fail to match.  The bug is now
+    fixed, but if your app relies on the buggy behavior then you must
+    call IndexWriter.setAllowMinus1Position().  That API is deprecated
+    so you must fix your application, and rebuild your index, to not
+    rely on this behavior by the 3.0 release of Lucene. (Jonathan
+    Mamou, Mark Miller via Mike McCandless)
+
+
 API Changes

 1. LUCENE-1419: Add expert API to set custom indexing chain. This API is 
@ -186,6 +197,16 @@ Bug fixes
 10. LUCENE-1647: Fix case where IndexReader.undeleteAll would cause
    the segment's deletion count to be incorrect. (Mike McCandless)

+11. LUCENE-1542: When the first token(s) have 0 position increment,
+    IndexWriter used to incorrectly record the position as -1, if no
+    payload is present, or Integer.MAX_VALUE if a payload is present.
+    This causes positional queries to fail to match.  The bug is now
+    fixed, but if your app relies on the buggy behavior then you must
+    call IndexWriter.setAllowMinus1Position().  That API is deprecated
+    so you must fix your application, and rebuild your index, to not
+    rely on this behavior by the 3.0 release of Lucene. (Jonathan
+    Mamou, Mark Miller via Mike McCandless)
+
 New features

 1. LUCENE-1411: Added expert API to open an IndexWriter on a prior
--- a/common-build.xml
+++ b/common-build.xml
@ -42,7 +42,7 @@
  <property name="Name" value="Lucene"/>
  <property name="dev.version" value="2.9-dev"/>
  <property name="version" value="${dev.version}"/>
-  <property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090526"/>
+  <property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090530"/>
  <property name="spec.version" value="${version}"/>	
  <property name="year" value="2000-${current.year}"/>
  <property name="final.name" value="lucene-${name}-${version}"/>
--- a/src/java/org/apache/lucene/index/DocInverterPerField.java
+++ b/src/java/org/apache/lucene/index/DocInverterPerField.java
@ -126,6 +126,9 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
          // reset the TokenStream to the first token
          stream.reset();

+          // deprecated
+          final boolean allowMinus1Position = docState.allowMinus1Position;
+
          try {
            int offsetEnd = fieldState.offset-1;
            
@ -162,7 +165,11 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
              }
              
              final int posIncr = posIncrAttribute.getPositionIncrement();
-              fieldState.position += posIncr - 1;
+              fieldState.position += posIncr;
+              if (allowMinus1Position || fieldState.position > 0) {
+                fieldState.position--;
+              }
+
              if (posIncr == 0)
                fieldState.numOverlap++;

--- a/src/java/org/apache/lucene/index/DocumentsWriter.java
+++ b/src/java/org/apache/lucene/index/DocumentsWriter.java
@ -150,6 +150,9 @@ final class DocumentsWriter {
    Document doc;
    String maxTermPrefix;

+    // deprecated
+    boolean allowMinus1Position;
+
    // Only called by asserts
    public boolean testPoint(String name) {
      return docWriter.writer.testPoint(name);
@ -298,6 +301,11 @@ final class DocumentsWriter {
      threadStates[i].docState.similarity = similarity;
  }

+  synchronized void setAllowMinus1Position() {
+    for(int i=0;i<threadStates.length;i++)
+      threadStates[i].docState.allowMinus1Position = true;;
+  }
+
  /** Set how much RAM we can use before flushing. */
  synchronized void setRAMBufferSizeMB(double mb) {
    if (mb == IndexWriter.DISABLE_AUTO_FLUSH) {
--- a/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java
+++ b/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java
@ -40,6 +40,7 @@ final class DocumentsWriterThreadState {
    docState.infoStream = docWriter.infoStream;
    docState.similarity = docWriter.similarity;
    docState.docWriter = docWriter;
+    docState.allowMinus1Position = docWriter.writer.getAllowMinus1Position();
    consumer = docWriter.consumer.addThread(this);
  }

--- a/src/java/org/apache/lucene/index/IndexWriter.java
+++ b/src/java/org/apache/lucene/index/IndexWriter.java
@ -5485,6 +5485,22 @@ public class IndexWriter {
    throw oom;
  }

+  // deprecated
+  private boolean allowMinus1Position;
+
+  /** Deprecated: emulates IndexWriter's buggy behavior when
+   *  first token(s) have positionIncrement==0 (ie, prior to
+   *  fixing LUCENE-1542) */
+  public void setAllowMinus1Position() {
+    allowMinus1Position = true;
+    docWriter.setAllowMinus1Position();
+  }
+
+  // deprecated
+  boolean getAllowMinus1Position() {
+    return allowMinus1Position;
+  }
+
  // Used only by assert for testing.  Current points:
  //   startDoFlush
  //   startCommitMerge
--- a/src/test/org/apache/lucene/index/TestIndexWriter.java
+++ b/src/test/org/apache/lucene/index/TestIndexWriter.java
@ -3594,7 +3594,7 @@ public class TestIndexWriter extends LuceneTestCase
    TermPositions tps = s.getIndexReader().termPositions(new Term("field", "a"));
    assertTrue(tps.next());
    assertEquals(1, tps.freq());
-    assertEquals(-1, tps.nextPosition());
+    assertEquals(0, tps.nextPosition());
    w.close();

    assertTrue(_TestUtil.checkIndex(dir));
--- a/src/test/org/apache/lucene/search/TestPositionIncrement.java
+++ b/src/test/org/apache/lucene/search/TestPositionIncrement.java
@ -17,8 +17,11 @@ package org.apache.lucene.search;
 * limitations under the License.
 */

-import java.io.IOException;
 import java.io.Reader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Collection;
+import java.util.Iterator;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.StopFilter;
@ -26,14 +29,27 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermPositions;
 import org.apache.lucene.queryParser.QueryParser;
-import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.store.MockRAMDirectory;
+import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.index.Payload;
+import org.apache.lucene.search.payloads.PayloadSpanUtil;
+import org.apache.lucene.search.spans.PayloadSpans;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.search.spans.Spans;

 /**
 * Term position unit test.
@ -48,7 +64,7 @@ public class TestPositionIncrement extends LuceneTestCase {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new TokenStream() {
          private final String[] TOKENS = {"1", "2", "3", "4", "5"};
-          private final int[] INCREMENTS = {1, 2, 1, 0, 1};
+          private final int[] INCREMENTS = {0, 2, 1, 0, 1};
          private int i = 0;

          PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
@ -67,7 +83,7 @@ public class TestPositionIncrement extends LuceneTestCase {
        };
      }
    };
-    RAMDirectory store = new RAMDirectory();
+    Directory store = new MockRAMDirectory();
    IndexWriter writer = new IndexWriter(store, analyzer, true,
                                         IndexWriter.MaxFieldLength.LIMITED);
    Document d = new Document();
@ -75,8 +91,20 @@ public class TestPositionIncrement extends LuceneTestCase {
    writer.addDocument(d);
    writer.optimize();
    writer.close();
+    

    IndexSearcher searcher = new IndexSearcher(store);
+    
+    TermPositions pos = searcher.getIndexReader().termPositions(new Term("field", "1"));
+    pos.next();
+    // first token should be at position 0
+    assertEquals(0, pos.nextPosition());
+    
+    pos = searcher.getIndexReader().termPositions(new Term("field", "2"));
+    pos.next();
+    // second token should be at position 2
+    assertEquals(2, pos.nextPosition());
+    
    PhraseQuery q;
    ScoreDoc[] hits;

@ -202,4 +230,146 @@ public class TestPositionIncrement extends LuceneTestCase {
      StopFilter.setEnablePositionIncrementsDefault(dflt);
    }
  }
+  
+  public void testPayloadsPos0() throws Exception {
+    for(int x=0;x<2;x++) {
+      Directory dir = new MockRAMDirectory();
+      IndexWriter writer = new IndexWriter(dir,
+                                           new TestPayloadAnalyzer(), true,
+                                           IndexWriter.MaxFieldLength.LIMITED);
+      if (x == 1) {
+        writer.setAllowMinus1Position();
+      }
+      Document doc = new Document();
+      doc.add(new Field("content",
+                        new StringReader("a a b c d e a f g h i j a b k k")));
+      writer.addDocument(doc);
+
+      IndexReader r = writer.getReader();
+
+      TermPositions tp = r.termPositions(new Term("content", "a"));
+      int count = 0;
+      assertTrue(tp.next());
+      // "a" occurs 4 times
+      assertEquals(4, tp.freq());
+      int expected;
+      if (x == 1) {
+        expected = Integer.MAX_VALUE;
+      } else {
+        expected = 0;
+      }
+      assertEquals(expected, tp.nextPosition());
+      if (x == 1) {
+        continue;
+      }
+      assertEquals(1, tp.nextPosition());
+      assertEquals(3, tp.nextPosition());
+      assertEquals(6, tp.nextPosition());
+
+      // only one doc has "a"
+      assertFalse(tp.next());
+
+      IndexSearcher is = new IndexSearcher(r);
+    
+      SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
+      SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
+      SpanQuery[] sqs = { stq1, stq2 };
+      SpanNearQuery snq = new SpanNearQuery(sqs, 30, false);
+
+      count = 0;
+      boolean sawZero = false;
+      //System.out.println("\ngetPayloadSpans test");
+      PayloadSpans pspans = snq.getPayloadSpans(is.getIndexReader());
+      while (pspans.next()) {
+        //System.out.println(pspans.doc() + " - " + pspans.start() + " - "+ pspans.end());
+        Collection payloads = pspans.getPayload();
+        sawZero |= pspans.start() == 0;
+        for (Iterator it = payloads.iterator(); it.hasNext();) {
+          count++;
+          it.next();
+          //System.out.println(new String((byte[]) it.next()));
+        }
+      }
+      assertEquals(5, count);
+      assertTrue(sawZero);
+
+      //System.out.println("\ngetSpans test");
+      Spans spans = snq.getSpans(is.getIndexReader());
+      count = 0;
+      sawZero = false;
+      while (spans.next()) {
+        count++;
+        sawZero |= spans.start() == 0;
+        //System.out.println(spans.doc() + " - " + spans.start() + " - " + spans.end());
+      }
+      assertEquals(4, count);
+      assertTrue(sawZero);
+		
+      //System.out.println("\nPayloadSpanUtil test");
+
+      sawZero = false;
+      PayloadSpanUtil psu = new PayloadSpanUtil(is.getIndexReader());
+      Collection pls = psu.getPayloadsForQuery(snq);
+      count = pls.size();
+      for (Iterator it = pls.iterator(); it.hasNext();) {
+        String s = new String((byte[]) it.next());
+        //System.out.println(s);
+        sawZero |= s.equals("pos: 0");
+      }
+      assertEquals(5, count);
+      assertTrue(sawZero);
+      writer.close();
+      is.getIndexReader().close();
+      dir.close();
+    }
+  }
+}
+
+class TestPayloadAnalyzer extends Analyzer {
+
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    TokenStream result = new LowerCaseTokenizer(reader);
+    return new PayloadFilter(result, fieldName);
+  }
+}
+
+class PayloadFilter extends TokenFilter {
+  String fieldName;
+
+  int pos;
+
+  int i;
+
+  final PositionIncrementAttribute posIncrAttr;
+  final PayloadAttribute payloadAttr;
+  final TermAttribute termAttr;
+
+  public PayloadFilter(TokenStream input, String fieldName) {
+    super(input);
+    this.fieldName = fieldName;
+    pos = 0;
+    i = 0;
+    posIncrAttr = (PositionIncrementAttribute) input.addAttribute(PositionIncrementAttribute.class);
+    payloadAttr = (PayloadAttribute) input.addAttribute(PayloadAttribute.class);
+    termAttr = (TermAttribute) input.addAttribute(TermAttribute.class);
+  }
+
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      payloadAttr.setPayload(new Payload(("pos: " + pos).getBytes()));
+      int posIncr;
+      if (i % 2 == 1) {
+        posIncr = 1;
+      } else {
+        posIncr = 0;
+      }
+      posIncrAttr.setPositionIncrement(posIncr);
+      pos += posIncr;
+      // System.out.println("term=" + termAttr.term() + " pos=" + pos);
+      i++;
+      return true;
+    } else {
+      return false;
+    }
+  }
 }