mirror of https://github.com/apache/lucene.git

LUCENE-1542: properly index first token(s) with 0 position increment

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@780220 13f79535-47bb-0310-9956-ffa450edef68

commit 80a79f5bee
parent 5f6d0c7bd6

CHANGES.txt

@@ -71,6 +71,17 @@ Changes in runtime behavior
     with SortField.FIELD_DOC (it was unnecessary as Lucene breaks ties
     internally by docID). (Shai Erera via Michael McCandless)
 
+ 6. LUCENE-1542: When the first token(s) have 0 position increment,
+    IndexWriter used to incorrectly record the position as -1, if no
+    payload is present, or Integer.MAX_VALUE if a payload is present.
+    This causes positional queries to fail to match. The bug is now
+    fixed, but if your app relies on the buggy behavior then you must
+    call IndexWriter.setAllowMinus1Position(). That API is deprecated
+    so you must fix your application, and rebuild your index, to not
+    rely on this behavior by the 3.0 release of Lucene. (Jonathan
+    Mamou, Mark Miller via Mike McCandless)
+
 API Changes
 
  1. LUCENE-1419: Add expert API to set custom indexing chain. This API is
@@ -186,6 +197,16 @@ Bug fixes
 10. LUCENE-1647: Fix case where IndexReader.undeleteAll would cause
     the segment's deletion count to be incorrect. (Mike McCandless)
 
+11. LUCENE-1542: When the first token(s) have 0 position increment,
+    IndexWriter used to incorrectly record the position as -1, if no
+    payload is present, or Integer.MAX_VALUE if a payload is present.
+    This causes positional queries to fail to match. The bug is now
+    fixed, but if your app relies on the buggy behavior then you must
+    call IndexWriter.setAllowMinus1Position(). That API is deprecated
+    so you must fix your application, and rebuild your index, to not
+    rely on this behavior by the 3.0 release of Lucene. (Jonathan
+    Mamou, Mark Miller via Mike McCandless)
+
 New features
 
  1. LUCENE-1411: Added expert API to open an IndexWriter on a prior
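
The CHANGES entries above describe the user-visible change: leading tokens with a 0 position increment are now recorded at position 0, and the old positions are only reachable through a deprecated opt-in. A minimal sketch of that opt-in, assuming the Lucene 2.9-dev API this commit targets (the analyzer and directory here are arbitrary choices for illustration):

    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.RAMDirectory;

    public class AllowMinus1PositionExample {
      public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true,
                                             IndexWriter.MaxFieldLength.LIMITED);
        // Deprecated escape hatch added by this commit: restores the pre-fix
        // positions (-1, or Integer.MAX_VALUE when payloads are present).
        // Applications must stop relying on it, and rebuild their index,
        // before Lucene 3.0.
        writer.setAllowMinus1Position();
        // ... add documents as before ...
        writer.close();
      }
    }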

Ant build file

@@ -42,7 +42,7 @@
   <property name="Name" value="Lucene"/>
   <property name="dev.version" value="2.9-dev"/>
   <property name="version" value="${dev.version}"/>
-  <property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090526"/>
+  <property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090530"/>
   <property name="spec.version" value="${version}"/>
   <property name="year" value="2000-${current.year}"/>
   <property name="final.name" value="lucene-${name}-${version}"/>

DocInverterPerField.java

@@ -126,6 +126,9 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
       // reset the TokenStream to the first token
       stream.reset();
 
+      // deprecated
+      final boolean allowMinus1Position = docState.allowMinus1Position;
+
       try {
         int offsetEnd = fieldState.offset-1;
 
@@ -162,7 +165,11 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
           }
 
           final int posIncr = posIncrAttribute.getPositionIncrement();
-          fieldState.position += posIncr - 1;
+          fieldState.position += posIncr;
+          if (allowMinus1Position || fieldState.position > 0) {
+            fieldState.position--;
+          }
+
           if (posIncr == 0)
             fieldState.numOverlap++;
 
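
The core of the fix is the position arithmetic in the second hunk above. The following standalone sketch (not Lucene code) compares the two formulas; it assumes, as the surrounding indexing loop appears to (that part is not shown in this hunk), that the running position is advanced by one after each token is recorded. The deprecated allowMinus1Position path simply restores the old result.

    import java.util.Arrays;

    public class FirstTokenPositionSketch {

      // Old behavior: subtract one while applying the increment.
      static int[] oldPositions(int[] increments) {
        int[] out = new int[increments.length];
        int pos = 0;
        for (int i = 0; i < increments.length; i++) {
          pos += increments[i] - 1;
          out[i] = pos;
          pos++;                 // advance for the next token (assumed)
        }
        return out;
      }

      // Fixed behavior: apply the increment, then step back only when the
      // result stays non-negative.
      static int[] newPositions(int[] increments) {
        int[] out = new int[increments.length];
        int pos = 0;
        for (int i = 0; i < increments.length; i++) {
          pos += increments[i];
          if (pos > 0) {
            pos--;
          }
          out[i] = pos;
          pos++;                 // advance for the next token (assumed)
        }
        return out;
      }

      public static void main(String[] args) {
        int[] increments = {0, 2, 1, 0, 1};  // increments used by the updated test below
        System.out.println(Arrays.toString(oldPositions(increments)));  // [-1, 1, 2, 2, 3]
        System.out.println(Arrays.toString(newPositions(increments)));  // [0, 2, 3, 3, 4]
      }
    }

In this model a leading zero-increment token not only lands at -1 under the old formula, it also drags every later position down by one; the fixed values match the assertions added to TestPositionIncrement below (token "1" at position 0, token "2" at position 2).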

DocumentsWriter.java

@@ -150,6 +150,9 @@ final class DocumentsWriter {
     Document doc;
     String maxTermPrefix;
 
+    // deprecated
+    boolean allowMinus1Position;
+
     // Only called by asserts
     public boolean testPoint(String name) {
       return docWriter.writer.testPoint(name);
@@ -298,6 +301,11 @@ final class DocumentsWriter {
       threadStates[i].docState.similarity = similarity;
   }
 
+  synchronized void setAllowMinus1Position() {
+    for(int i=0;i<threadStates.length;i++)
+      threadStates[i].docState.allowMinus1Position = true;;
+  }
+
   /** Set how much RAM we can use before flushing. */
   synchronized void setRAMBufferSizeMB(double mb) {
     if (mb == IndexWriter.DISABLE_AUTO_FLUSH) {

DocumentsWriterThreadState.java

@@ -40,6 +40,7 @@ final class DocumentsWriterThreadState {
     docState.infoStream = docWriter.infoStream;
     docState.similarity = docWriter.similarity;
     docState.docWriter = docWriter;
+    docState.allowMinus1Position = docWriter.writer.getAllowMinus1Position();
     consumer = docWriter.consumer.addThread(this);
   }
 

IndexWriter.java

@@ -5485,6 +5485,22 @@ public class IndexWriter {
       throw oom;
     }
 
+  // deprecated
+  private boolean allowMinus1Position;
+
+  /** Deprecated: emulates IndexWriter's buggy behavior when
+   *  first token(s) have positionIncrement==0 (ie, prior to
+   *  fixing LUCENE-1542) */
+  public void setAllowMinus1Position() {
+    allowMinus1Position = true;
+    docWriter.setAllowMinus1Position();
+  }
+
+  // deprecated
+  boolean getAllowMinus1Position() {
+    return allowMinus1Position;
+  }
+
   // Used only by assert for testing. Current points:
   //   startDoFlush
   //   startCommitMerge

TestIndexWriter.java

@@ -3594,7 +3594,7 @@ public class TestIndexWriter extends LuceneTestCase
     TermPositions tps = s.getIndexReader().termPositions(new Term("field", "a"));
     assertTrue(tps.next());
     assertEquals(1, tps.freq());
-    assertEquals(-1, tps.nextPosition());
+    assertEquals(0, tps.nextPosition());
     w.close();
 
     assertTrue(_TestUtil.checkIndex(dir));

TestPositionIncrement.java

@@ -17,8 +17,11 @@ package org.apache.lucene.search;
  * limitations under the License.
  */
 
-import java.io.IOException;
 import java.io.Reader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Collection;
+import java.util.Iterator;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.StopFilter;
@@ -26,14 +29,27 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermPositions;
 import org.apache.lucene.queryParser.QueryParser;
-import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.store.MockRAMDirectory;
+import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.index.Payload;
+import org.apache.lucene.search.payloads.PayloadSpanUtil;
+import org.apache.lucene.search.spans.PayloadSpans;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.search.spans.Spans;
 
 /**
  * Term position unit test.
@@ -48,7 +64,7 @@ public class TestPositionIncrement extends LuceneTestCase {
     public TokenStream tokenStream(String fieldName, Reader reader) {
       return new TokenStream() {
         private final String[] TOKENS = {"1", "2", "3", "4", "5"};
-        private final int[] INCREMENTS = {1, 2, 1, 0, 1};
+        private final int[] INCREMENTS = {0, 2, 1, 0, 1};
         private int i = 0;
 
         PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
@@ -67,7 +83,7 @@ public class TestPositionIncrement extends LuceneTestCase {
       };
     }
   };
-    RAMDirectory store = new RAMDirectory();
+    Directory store = new MockRAMDirectory();
     IndexWriter writer = new IndexWriter(store, analyzer, true,
                                          IndexWriter.MaxFieldLength.LIMITED);
     Document d = new Document();
@@ -75,8 +91,20 @@ public class TestPositionIncrement extends LuceneTestCase {
     writer.addDocument(d);
     writer.optimize();
     writer.close();
 
     IndexSearcher searcher = new IndexSearcher(store);
+
+    TermPositions pos = searcher.getIndexReader().termPositions(new Term("field", "1"));
+    pos.next();
+    // first token should be at position 0
+    assertEquals(0, pos.nextPosition());
+
+    pos = searcher.getIndexReader().termPositions(new Term("field", "2"));
+    pos.next();
+    // second token should be at position 2
+    assertEquals(2, pos.nextPosition());
+
     PhraseQuery q;
     ScoreDoc[] hits;
 
@@ -202,4 +230,146 @@ public class TestPositionIncrement extends LuceneTestCase {
       StopFilter.setEnablePositionIncrementsDefault(dflt);
     }
   }
+
+  public void testPayloadsPos0() throws Exception {
+    for(int x=0;x<2;x++) {
+      Directory dir = new MockRAMDirectory();
+      IndexWriter writer = new IndexWriter(dir,
+                                           new TestPayloadAnalyzer(), true,
+                                           IndexWriter.MaxFieldLength.LIMITED);
+      if (x == 1) {
+        writer.setAllowMinus1Position();
+      }
+      Document doc = new Document();
+      doc.add(new Field("content",
+                        new StringReader("a a b c d e a f g h i j a b k k")));
+      writer.addDocument(doc);
+
+      IndexReader r = writer.getReader();
+
+      TermPositions tp = r.termPositions(new Term("content", "a"));
+      int count = 0;
+      assertTrue(tp.next());
+      // "a" occurs 4 times
+      assertEquals(4, tp.freq());
+      int expected;
+      if (x == 1) {
+        expected = Integer.MAX_VALUE;
+      } else {
+        expected = 0;
+      }
+      assertEquals(expected, tp.nextPosition());
+      if (x == 1) {
+        continue;
+      }
+      assertEquals(1, tp.nextPosition());
+      assertEquals(3, tp.nextPosition());
+      assertEquals(6, tp.nextPosition());
+
+      // only one doc has "a"
+      assertFalse(tp.next());
+
+      IndexSearcher is = new IndexSearcher(r);
+
+      SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
+      SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
+      SpanQuery[] sqs = { stq1, stq2 };
+      SpanNearQuery snq = new SpanNearQuery(sqs, 30, false);
+
+      count = 0;
+      boolean sawZero = false;
+      //System.out.println("\ngetPayloadSpans test");
+      PayloadSpans pspans = snq.getPayloadSpans(is.getIndexReader());
+      while (pspans.next()) {
+        //System.out.println(pspans.doc() + " - " + pspans.start() + " - "+ pspans.end());
+        Collection payloads = pspans.getPayload();
+        sawZero |= pspans.start() == 0;
+        for (Iterator it = payloads.iterator(); it.hasNext();) {
+          count++;
+          it.next();
+          //System.out.println(new String((byte[]) it.next()));
+        }
+      }
+      assertEquals(5, count);
+      assertTrue(sawZero);
+
+      //System.out.println("\ngetSpans test");
+      Spans spans = snq.getSpans(is.getIndexReader());
+      count = 0;
+      sawZero = false;
+      while (spans.next()) {
+        count++;
+        sawZero |= spans.start() == 0;
+        //System.out.println(spans.doc() + " - " + spans.start() + " - " + spans.end());
+      }
+      assertEquals(4, count);
+      assertTrue(sawZero);
+
+      //System.out.println("\nPayloadSpanUtil test");
+
+      sawZero = false;
+      PayloadSpanUtil psu = new PayloadSpanUtil(is.getIndexReader());
+      Collection pls = psu.getPayloadsForQuery(snq);
+      count = pls.size();
+      for (Iterator it = pls.iterator(); it.hasNext();) {
+        String s = new String((byte[]) it.next());
+        //System.out.println(s);
+        sawZero |= s.equals("pos: 0");
+      }
+      assertEquals(5, count);
+      assertTrue(sawZero);
+      writer.close();
+      is.getIndexReader().close();
+      dir.close();
+    }
+  }
+}
+
+class TestPayloadAnalyzer extends Analyzer {
+
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    TokenStream result = new LowerCaseTokenizer(reader);
+    return new PayloadFilter(result, fieldName);
+  }
+}
+
+class PayloadFilter extends TokenFilter {
+  String fieldName;
+
+  int pos;
+
+  int i;
+
+  final PositionIncrementAttribute posIncrAttr;
+  final PayloadAttribute payloadAttr;
+  final TermAttribute termAttr;
+
+  public PayloadFilter(TokenStream input, String fieldName) {
+    super(input);
+    this.fieldName = fieldName;
+    pos = 0;
+    i = 0;
+    posIncrAttr = (PositionIncrementAttribute) input.addAttribute(PositionIncrementAttribute.class);
+    payloadAttr = (PayloadAttribute) input.addAttribute(PayloadAttribute.class);
+    termAttr = (TermAttribute) input.addAttribute(TermAttribute.class);
+  }
+
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      payloadAttr.setPayload(new Payload(("pos: " + pos).getBytes()));
+      int posIncr;
+      if (i % 2 == 1) {
+        posIncr = 1;
+      } else {
+        posIncr = 0;
+      }
+      posIncrAttr.setPositionIncrement(posIncr);
+      pos += posIncr;
+      // System.out.println("term=" + termAttr.term() + " pos=" + pos);
+      i++;
+      return true;
+    } else {
+      return false;
+    }
+  }
 }
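
For reference, the positions asserted in testPayloadsPos0 follow from PayloadFilter's alternating increments (0, 1, 0, 1, ...) combined with the fixed first-token handling. A standalone sketch, not part of the commit, that models the fixed behavior as "start at -1, add the increment, never drop below 0" (equivalent to the DocInverterPerField change for non-negative increments):

    public class PayloadsPos0Positions {
      public static void main(String[] args) {
        String[] tokens = "a a b c d e a f g h i j a b k k".split(" ");
        int pos = -1;
        for (int i = 0; i < tokens.length; i++) {
          int incr = (i % 2 == 1) ? 1 : 0;  // same rule as PayloadFilter
          pos += incr;
          if (pos < 0) {
            pos = 0;  // fixed behavior: a leading 0-increment token stays at 0
          }
          if (tokens[i].equals("a")) {
            System.out.println("a at position " + pos);  // prints 0, 1, 3, 6
          }
        }
      }
    }

That is why the test expects "a" at positions 0, 1, 3 and 6, and why spans starting at position 0 (sawZero) must be seen. In the x == 1 pass, setAllowMinus1Position() restores the old behavior, where the first occurrence is recorded as Integer.MAX_VALUE because a payload is present, matching the CHANGES entry above.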