LUCENE-4828: add AssertingPostingsConsumer, fix minor inconsistencies in producers

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1364792 13f79535-47bb-0310-9956-ffa450edef68
2012-07-23 20:46:08 +00:00 · 2012-07-23 20:46:08 +00:00 · e0d137f8e2
parent 87182914a3
commit e0d137f8e2
6 changed files with 110 additions and 20 deletions
--- a/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java
@ -49,14 +49,17 @@ import org.apache.lucene.util.FixedBitSet;
 */
 public abstract class PostingsConsumer {

-  /** Adds a new doc in this term. */
+  /** Adds a new doc in this term. 
+   * <code>freq</code> will be -1 when term frequencies are omitted
+   * for the field. */
  public abstract void startDoc(int docID, int freq) throws IOException;

  /** Add a new position & payload, and start/end offset.  A
   *  null payload means no payload; a non-null payload with
   *  zero length also means no payload.  Caller may reuse
   *  the {@link BytesRef} for the payload between calls
-   *  (method must fully consume the payload). */
+   *  (method must fully consume the payload). <code>startOffset</code>
+   *  and <code>endOffset</code> will be -1 when offsets are not indexed. */
  public abstract void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException;

  /** Called when we are done adding positions & payloads
@ -78,7 +81,7 @@ public abstract class PostingsConsumer {
          break;
        }
        visitedDocs.set(doc);
-        this.startDoc(doc, 0);
+        this.startDoc(doc, -1);
        this.finishDoc();
        df++;
      }
--- a/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java
@ -57,10 +57,14 @@ public abstract class TermsConsumer {
   *  no docs. */
  public abstract PostingsConsumer startTerm(BytesRef text) throws IOException;

-  /** Finishes the current term; numDocs must be > 0. */
+  /** Finishes the current term; numDocs must be > 0.
+   *  <code>stats.totalTermFreq</code> will be -1 when term 
+   *  frequencies are omitted for the field. */
  public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException;

-  /** Called when we are done adding terms to this field */
+  /** Called when we are done adding terms to this field.
+   *  <code>sumTotalTermFreq</code> will be -1 when term 
+   *  frequencies are omitted for the field. */
  public abstract void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException;

  /** Return the BytesRef Comparator used to sort terms
--- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
@ -430,7 +430,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
            if (readTermFreq) {
              termDocFreq = postings.docFreqs[termID];
            } else {
-              termDocFreq = 0;
+              termDocFreq = -1;
            }
            postings.lastDocCodes[termID] = -1;
          } else {
@ -441,7 +441,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
          final int code = freq.readVInt();
          if (!readTermFreq) {
            docID += code;
-            termDocFreq = 0;
+            termDocFreq = -1;
          } else {
            docID += code >>> 1;
            if ((code & 1) != 0) {
@ -469,7 +469,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
        // 2nd sweep does the real flush, but I suspect
        // that'd add too much time to flush.
        visitedDocs.set(docID);
-        postingsConsumer.startDoc(docID, termDocFreq);
+        postingsConsumer.startDoc(docID, writeTermFreq ? termDocFreq : -1);
        if (docID < delDocLimit) {
          // Mark it deleted.  TODO: we could also skip
          // writing its postings; this would be
--- a/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java
@ -154,7 +154,7 @@ public class TestCodecs extends LuceneTestCase {
      for(int i=0;i<docs.length;i++) {
        final int termDocFreq;
        if (field.omitTF) {
-          termDocFreq = 0;
+          termDocFreq = -1;
        } else {
          termDocFreq = positions[i].length;
        }
@ -165,8 +165,8 @@ public class TestCodecs extends LuceneTestCase {
            final PositionData pos = positions[i][j];
            postingsConsumer.addPosition(pos.pos, pos.payload, -1, -1);
          }
-          postingsConsumer.finishDoc();
        }
+        postingsConsumer.finishDoc();
      }
      termsConsumer.finishTerm(text, new TermStats(docs.length, field.omitTF ? -1 : totTF));
      return totTF;
--- a/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java
@ -406,7 +406,7 @@ public class TestPostingsFormat extends LuceneTestCase {
          if (VERBOSE) {
            System.out.println("    " + docCount + ": docID=" + posting.docID + " freq=" + posting.positions.size());
          }
-          postingsConsumer.startDoc(posting.docID, posting.positions.size());
+          postingsConsumer.startDoc(posting.docID, doFreq ? posting.positions.size() : -1);
          seenDocs.set(posting.docID);
          if (doPos) {
            totalTF += posting.positions.size();
--- a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java
@ -35,6 +35,7 @@ import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.OpenBitSet;

 /**
 * Just like {@link Lucene40PostingsFormat} but with additional asserts.
@ -118,22 +119,23 @@ public class AssertingPostingsFormat extends PostingsFormat {
    private final FieldInfo fieldInfo;
    private BytesRef lastTerm = null;
    private TermsConsumerState state = TermsConsumerState.INITIAL;
+    private AssertingPostingsConsumer lastPostingsConsumer = null;
+    private long sumTotalTermFreq = 0;
+    private long sumDocFreq = 0;
+    private OpenBitSet visitedDocs = new OpenBitSet();
    
    AssertingTermsConsumer(TermsConsumer in, FieldInfo fieldInfo) {
      this.in = in;
      this.fieldInfo = fieldInfo;
    }
    
-    // TODO: AssertingPostingsConsumer
    @Override
    public PostingsConsumer startTerm(BytesRef text) throws IOException {
-      // TODO: assert that if state == START (no finishTerm called), that no actual docs were fed.
-      // TODO: this makes the api really confusing! we should try to clean this up!
-      assert state == TermsConsumerState.INITIAL || state == TermsConsumerState.START;
+      assert state == TermsConsumerState.INITIAL || state == TermsConsumerState.START && lastPostingsConsumer.docFreq == 0;
      state = TermsConsumerState.START;
      assert lastTerm == null || in.getComparator().compare(text, lastTerm) > 0;
      lastTerm = BytesRef.deepCopyOf(text);
-      return in.startTerm(text);
+      return lastPostingsConsumer = new AssertingPostingsConsumer(in.startTerm(text), fieldInfo, visitedDocs);
    }

    @Override
@ -142,24 +144,30 @@ public class AssertingPostingsFormat extends PostingsFormat {
      state = TermsConsumerState.INITIAL;
      assert text.equals(lastTerm);
      assert stats.docFreq > 0; // otherwise, this method should not be called.
+      assert stats.docFreq == lastPostingsConsumer.docFreq;
+      sumDocFreq += stats.docFreq;
      if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
        assert stats.totalTermFreq == -1;
+      } else {
+        assert stats.totalTermFreq == lastPostingsConsumer.totalTermFreq;
+        sumTotalTermFreq += stats.totalTermFreq;
      }
      in.finishTerm(text, stats);
    }

    @Override
    public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
-      // TODO: assert that if state == START (no finishTerm called), that no actual docs were fed.
-      // TODO: this makes the api really confusing! we should try to clean this up!
-      assert state == TermsConsumerState.INITIAL || state == TermsConsumerState.START;
+      assert state == TermsConsumerState.INITIAL || state == TermsConsumerState.START && lastPostingsConsumer.docFreq == 0;
      state = TermsConsumerState.FINISHED;
      assert docCount >= 0;
+      assert docCount == visitedDocs.cardinality();
      assert sumDocFreq >= docCount;
+      assert sumDocFreq == this.sumDocFreq;
      if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
        assert sumTotalTermFreq == -1;
      } else {
-        assert sumTotalTermFreq >= sumDocFreq;        
+        assert sumTotalTermFreq >= sumDocFreq;
+        assert sumTotalTermFreq == this.sumTotalTermFreq;
      }
      in.finish(sumTotalTermFreq, sumDocFreq, docCount);
    }
@ -169,4 +177,79 @@ public class AssertingPostingsFormat extends PostingsFormat {
      return in.getComparator();
    }
  }
+  
+  static enum PostingsConsumerState { INITIAL, START };
+  static class AssertingPostingsConsumer extends PostingsConsumer {
+    private final PostingsConsumer in;
+    private final FieldInfo fieldInfo;
+    private final OpenBitSet visitedDocs;
+    private PostingsConsumerState state = PostingsConsumerState.INITIAL;
+    private int freq;
+    private int positionCount;
+    private int lastPosition = 0;
+    private int lastStartOffset = 0;
+    int docFreq = 0;
+    long totalTermFreq = 0;
+    
+    AssertingPostingsConsumer(PostingsConsumer in, FieldInfo fieldInfo, OpenBitSet visitedDocs) {
+      this.in = in;
+      this.fieldInfo = fieldInfo;
+      this.visitedDocs = visitedDocs;
+    }
+
+    @Override
+    public void startDoc(int docID, int freq) throws IOException {
+      assert state == PostingsConsumerState.INITIAL;
+      state = PostingsConsumerState.START;
+      assert docID >= 0;
+      if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
+        assert freq == -1;
+        this.freq = 0; // we don't expect any positions here
+      } else {
+        assert freq > 0;
+        this.freq = freq;
+        totalTermFreq += freq;
+      }
+      this.positionCount = 0;
+      this.lastPosition = 0;
+      this.lastStartOffset = 0;
+      docFreq++;
+      visitedDocs.set(docID);
+      in.startDoc(docID, freq);
+    }
+
+    @Override
+    public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
+      assert state == PostingsConsumerState.START;
+      assert positionCount < freq;
+      positionCount++;
+      assert position >= lastPosition || position == -1; /* we still allow -1 from old 3.x indexes */
+      lastPosition = position;
+      if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) {
+        assert startOffset >= 0;
+        assert startOffset >= lastStartOffset;
+        lastStartOffset = startOffset;
+        assert endOffset >= startOffset;
+      } else {
+        assert startOffset == -1;
+        assert endOffset == -1;
+      }
+      if (payload != null) {
+        assert fieldInfo.hasPayloads();
+      }
+      in.addPosition(position, payload, startOffset, endOffset);
+    }
+
+    @Override
+    public void finishDoc() throws IOException {
+      assert state == PostingsConsumerState.START;
+      state = PostingsConsumerState.INITIAL;
+      if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
+        assert positionCount == 0; // we should not have fed any positions!
+      } else {
+        assert positionCount == freq;
+      }
+      in.finishDoc();
+    }
+  }
 }