LUCENE-4828: add AssertingPostingsConsumer, fix minor inconsistencies in producers

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1364792 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-07-23 20:46:08 +00:00
parent 87182914a3
commit e0d137f8e2
6 changed files with 110 additions and 20 deletions

View File

@ -49,14 +49,17 @@ import org.apache.lucene.util.FixedBitSet;
*/
public abstract class PostingsConsumer {
/** Adds a new doc in this term.
 * <code>freq</code> will be -1 when term frequencies are omitted
 * for the field. */
public abstract void startDoc(int docID, int freq) throws IOException;
/** Add a new position & payload, and start/end offset. A
* null payload means no payload; a non-null payload with
* zero length also means no payload. Caller may reuse
* the {@link BytesRef} for the payload between calls
* (method must fully consume the payload). */
* (method must fully consume the payload). <code>startOffset</code>
* and <code>endOffset</code> will be -1 when offsets are not indexed. */
public abstract void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException;
/** Called when we are done adding positions & payloads
@ -78,7 +81,7 @@ public abstract class PostingsConsumer {
break;
}
visitedDocs.set(doc);
this.startDoc(doc, 0);
this.startDoc(doc, -1);
this.finishDoc();
df++;
}

View File

@ -57,10 +57,14 @@ public abstract class TermsConsumer {
* no docs. */
public abstract PostingsConsumer startTerm(BytesRef text) throws IOException;
/** Finishes the current term; numDocs must be > 0.
 * <code>stats.totalTermFreq</code> will be -1 when term
 * frequencies are omitted for the field. */
public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException;
/** Called when we are done adding terms to this field.
 * <code>sumTotalTermFreq</code> will be -1 when term
 * frequencies are omitted for the field. */
public abstract void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException;
/** Return the BytesRef Comparator used to sort terms

View File

@ -430,7 +430,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
if (readTermFreq) {
termDocFreq = postings.docFreqs[termID];
} else {
termDocFreq = 0;
termDocFreq = -1;
}
postings.lastDocCodes[termID] = -1;
} else {
@ -441,7 +441,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
final int code = freq.readVInt();
if (!readTermFreq) {
docID += code;
termDocFreq = 0;
termDocFreq = -1;
} else {
docID += code >>> 1;
if ((code & 1) != 0) {
@ -469,7 +469,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
// 2nd sweep does the real flush, but I suspect
// that'd add too much time to flush.
visitedDocs.set(docID);
postingsConsumer.startDoc(docID, termDocFreq);
postingsConsumer.startDoc(docID, writeTermFreq ? termDocFreq : -1);
if (docID < delDocLimit) {
// Mark it deleted. TODO: we could also skip
// writing its postings; this would be

View File

@ -154,7 +154,7 @@ public class TestCodecs extends LuceneTestCase {
for(int i=0;i<docs.length;i++) {
final int termDocFreq;
if (field.omitTF) {
termDocFreq = 0;
termDocFreq = -1;
} else {
termDocFreq = positions[i].length;
}
@ -165,8 +165,8 @@ public class TestCodecs extends LuceneTestCase {
final PositionData pos = positions[i][j];
postingsConsumer.addPosition(pos.pos, pos.payload, -1, -1);
}
postingsConsumer.finishDoc();
}
postingsConsumer.finishDoc();
}
termsConsumer.finishTerm(text, new TermStats(docs.length, field.omitTF ? -1 : totTF));
return totTF;

View File

@ -406,7 +406,7 @@ public class TestPostingsFormat extends LuceneTestCase {
if (VERBOSE) {
System.out.println(" " + docCount + ": docID=" + posting.docID + " freq=" + posting.positions.size());
}
postingsConsumer.startDoc(posting.docID, posting.positions.size());
postingsConsumer.startDoc(posting.docID, doFreq ? posting.positions.size() : -1);
seenDocs.set(posting.docID);
if (doPos) {
totalTF += posting.positions.size();

View File

@ -35,6 +35,7 @@ import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.OpenBitSet;
/**
* Just like {@link Lucene40PostingsFormat} but with additional asserts.
@ -118,22 +119,23 @@ public class AssertingPostingsFormat extends PostingsFormat {
private final FieldInfo fieldInfo;
private BytesRef lastTerm = null;
private TermsConsumerState state = TermsConsumerState.INITIAL;
private AssertingPostingsConsumer lastPostingsConsumer = null;
private long sumTotalTermFreq = 0;
private long sumDocFreq = 0;
private OpenBitSet visitedDocs = new OpenBitSet();
/**
 * Creates an asserting wrapper around the given consumer.
 *
 * @param in the consumer every call is forwarded to
 * @param fieldInfo the field whose index options drive the assertions
 */
AssertingTermsConsumer(final TermsConsumer in, final FieldInfo fieldInfo) {
  this.fieldInfo = fieldInfo;
  this.in = in;
}
// TODO: AssertingPostingsConsumer
/**
 * Starts a new term, checking that terms arrive in strictly increasing
 * order, and returns an asserting wrapper around the delegate's
 * postings consumer.
 *
 * @param text the term being started
 * @return an {@link AssertingPostingsConsumer} wrapping the delegate's consumer
 */
@Override
public PostingsConsumer startTerm(BytesRef text) throws IOException {
  // TODO: this makes the api really confusing! we should try to clean this up!
  // A term may be started without finishTerm having been called for the
  // previous one, but only if no docs were fed to that previous term.
  assert state == TermsConsumerState.INITIAL
      || (state == TermsConsumerState.START && lastPostingsConsumer.docFreq == 0);
  state = TermsConsumerState.START;
  assert lastTerm == null || in.getComparator().compare(text, lastTerm) > 0;
  // Deep copy: the caller may reuse the BytesRef.
  lastTerm = BytesRef.deepCopyOf(text);
  // Always hand out the asserting wrapper and remember it, so finishTerm/finish
  // can cross-check docFreq and totalTermFreq against what was actually fed.
  lastPostingsConsumer = new AssertingPostingsConsumer(in.startTerm(text), fieldInfo, visitedDocs);
  return lastPostingsConsumer;
}
@Override
@ -142,24 +144,30 @@ public class AssertingPostingsFormat extends PostingsFormat {
state = TermsConsumerState.INITIAL;
assert text.equals(lastTerm);
assert stats.docFreq > 0; // otherwise, this method should not be called.
assert stats.docFreq == lastPostingsConsumer.docFreq;
sumDocFreq += stats.docFreq;
if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
assert stats.totalTermFreq == -1;
} else {
assert stats.totalTermFreq == lastPostingsConsumer.totalTermFreq;
sumTotalTermFreq += stats.totalTermFreq;
}
in.finishTerm(text, stats);
}
/**
 * Verifies the field-level statistics against the totals accumulated by
 * this wrapper, then forwards them to the delegate.
 *
 * @param sumTotalTermFreq must be -1 for DOCS_ONLY fields, otherwise the
 *        sum of the per-term totals recorded in finishTerm
 * @param sumDocFreq must equal the sum of the per-term doc freqs
 * @param docCount must equal the number of distinct docs visited
 */
@Override
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
  // TODO: this makes the api really confusing! we should try to clean this up!
  // A term left in START (no finishTerm called) must not have received any docs.
  assert state == TermsConsumerState.INITIAL
      || (state == TermsConsumerState.START && lastPostingsConsumer.docFreq == 0);
  state = TermsConsumerState.FINISHED;

  assert docCount >= 0;
  assert docCount == visitedDocs.cardinality();
  assert sumDocFreq >= docCount;
  assert sumDocFreq == this.sumDocFreq;

  final boolean freqsOmitted = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY;
  if (!freqsOmitted) {
    assert sumTotalTermFreq >= sumDocFreq;
    assert sumTotalTermFreq == this.sumTotalTermFreq;
  } else {
    assert sumTotalTermFreq == -1;
  }

  in.finish(sumTotalTermFreq, sumDocFreq, docCount);
}
@ -169,4 +177,79 @@ public class AssertingPostingsFormat extends PostingsFormat {
return in.getComparator();
}
}
/**
 * Call-sequence state of an {@link AssertingPostingsConsumer}:
 * INITIAL when no document is open, START between startDoc and finishDoc.
 */
enum PostingsConsumerState {
  INITIAL,
  START
}
/**
 * Wraps a {@link PostingsConsumer} and asserts that the documents and
 * positions fed to it are consistent with the field's index options
 * before forwarding each call to the wrapped consumer.
 */
static class AssertingPostingsConsumer extends PostingsConsumer {
  private final PostingsConsumer in;
  private final FieldInfo fieldInfo;
  /** Bit set in which every docID passed to startDoc is marked. */
  private final OpenBitSet visitedDocs;
  private PostingsConsumerState state = PostingsConsumerState.INITIAL;
  /** Upper bound on addPosition calls for the current doc (0 for DOCS_ONLY). */
  private int maxPositions;
  private int positionCount;
  private int lastPosition = 0;
  private int lastStartOffset = 0;
  /** Number of docs started on this consumer. */
  int docFreq = 0;
  /** Sum of the freq values passed to startDoc; not updated for DOCS_ONLY fields. */
  long totalTermFreq = 0;

  AssertingPostingsConsumer(PostingsConsumer in, FieldInfo fieldInfo, OpenBitSet visitedDocs) {
    this.visitedDocs = visitedDocs;
    this.fieldInfo = fieldInfo;
    this.in = in;
  }

  /**
   * Opens a document. For DOCS_ONLY fields <code>freq</code> must be -1;
   * otherwise it must be positive.
   */
  @Override
  public void startDoc(int docID, int freq) throws IOException {
    assert state == PostingsConsumerState.INITIAL;
    state = PostingsConsumerState.START;
    assert docID >= 0;

    final boolean freqsOmitted = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY;
    if (!freqsOmitted) {
      assert freq > 0;
      maxPositions = freq;
      totalTermFreq += freq;
    } else {
      assert freq == -1;
      maxPositions = 0; // we don't expect any positions here
    }

    // Reset the per-document tracking.
    lastStartOffset = 0;
    lastPosition = 0;
    positionCount = 0;

    docFreq++;
    visitedDocs.set(docID);
    in.startDoc(docID, freq);
  }

  /**
   * Adds one position to the open document. Positions must not decrease
   * (apart from -1); offsets must be -1 unless the field indexes offsets;
   * a non-null payload requires the field to have payloads.
   */
  @Override
  public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
    assert state == PostingsConsumerState.START;
    assert positionCount < maxPositions;
    positionCount++;

    /* we still allow -1 from old 3.x indexes */
    assert position == -1 || position >= lastPosition;
    lastPosition = position;

    final boolean offsetsIndexed =
        fieldInfo.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
    if (!offsetsIndexed) {
      assert startOffset == -1;
      assert endOffset == -1;
    } else {
      assert startOffset >= 0;
      assert startOffset >= lastStartOffset;
      lastStartOffset = startOffset;
      assert endOffset >= startOffset;
    }

    assert payload == null || fieldInfo.hasPayloads();

    in.addPosition(position, payload, startOffset, endOffset);
  }

  /**
   * Closes the open document, checking that the number of positions fed
   * matches what the index options allow.
   */
  @Override
  public void finishDoc() throws IOException {
    assert state == PostingsConsumerState.START;
    state = PostingsConsumerState.INITIAL;

    final boolean positionsIndexed =
        fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    if (positionsIndexed) {
      assert positionCount == maxPositions;
    } else {
      assert positionCount == 0; // we should not have fed any positions!
    }

    in.finishDoc();
  }
}
}