mirror of https://github.com/apache/lucene.git
LUCENE-4828: add AssertingPostingsConsumer, fix minor inconsistencies in producers
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1364792 13f79535-47bb-0310-9956-ffa450edef68
commit e0d137f8e2
parent 87182914a3
PostingsConsumer.java
@@ -49,14 +49,17 @@ import org.apache.lucene.util.FixedBitSet;
  */
 public abstract class PostingsConsumer {
 
-  /** Adds a new doc in this term. */
+  /** Adds a new doc in this term.
+   * <code>freq</code> will be -1 when term frequencies are omitted
+   * for the field. */
   public abstract void startDoc(int docID, int freq) throws IOException;
 
   /** Add a new position & payload, and start/end offset. A
    * null payload means no payload; a non-null payload with
    * zero length also means no payload. Caller may reuse
    * the {@link BytesRef} for the payload between calls
-   * (method must fully consume the payload). */
+   * (method must fully consume the payload). <code>startOffset</code>
+   * and <code>endOffset</code> will be -1 when offsets are not indexed. */
   public abstract void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException;
 
   /** Called when we are done adding positions & payloads
@@ -78,7 +81,7 @@ public abstract class PostingsConsumer {
           break;
         }
         visitedDocs.set(doc);
-        this.startDoc(doc, 0);
+        this.startDoc(doc, -1);
         this.finishDoc();
         df++;
       }
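For context: a minimal sketch, not part of this commit, of how a PostingsConsumer implementation might honor the -1 conventions documented above. The class name is hypothetical; the three abstract methods are the ones shown in the hunk.

  import java.io.IOException;
  import org.apache.lucene.codecs.PostingsConsumer;
  import org.apache.lucene.util.BytesRef;

  class SketchPostingsConsumer extends PostingsConsumer {
    @Override
    public void startDoc(int docID, int freq) throws IOException {
      if (freq == -1) {
        // term frequencies are omitted for this field (DOCS_ONLY):
        // record the docID only, and expect no addPosition() calls
      } else {
        // exactly freq positions will follow via addPosition()
      }
    }

    @Override
    public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
      if (startOffset == -1 && endOffset == -1) {
        // offsets are not indexed for this field
      }
      // a null payload, or a non-null payload of zero length, means no payload;
      // the caller may reuse the BytesRef, so consume it fully here
    }

    @Override
    public void finishDoc() throws IOException {
      // all positions for the current doc have been fed
    }
  }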
TermsConsumer.java
@@ -57,10 +57,14 @@ public abstract class TermsConsumer {
    * no docs. */
   public abstract PostingsConsumer startTerm(BytesRef text) throws IOException;
 
-  /** Finishes the current term; numDocs must be > 0. */
+  /** Finishes the current term; numDocs must be > 0.
+   * <code>stats.totalTermFreq</code> will be -1 when term
+   * frequencies are omitted for the field. */
   public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException;
 
-  /** Called when we are done adding terms to this field */
+  /** Called when we are done adding terms to this field.
+   * <code>sumTotalTermFreq</code> will be -1 when term
+   * frequencies are omitted for the field. */
   public abstract void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException;
 
   /** Return the BytesRef Comparator used to sort terms
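Again for context, an illustrative sketch (not from the commit) of the write-side call sequence a codec now sees for a DOCS_ONLY field, with -1 standing in for the omitted frequency statistics. The termsConsumer and docIDs variables are assumed to exist in scope.

  BytesRef text = new BytesRef("lucene");
  PostingsConsumer postings = termsConsumer.startTerm(text);
  for (int docID : docIDs) {
    postings.startDoc(docID, -1);  // -1: term freqs omitted for this field
    postings.finishDoc();          // still required for every doc
  }
  // totalTermFreq is likewise -1 when freqs are omitted
  termsConsumer.finishTerm(text, new TermStats(docIDs.length, -1));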
FreqProxTermsWriterPerField.java
@@ -430,7 +430,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
         if (readTermFreq) {
           termDocFreq = postings.docFreqs[termID];
         } else {
-          termDocFreq = 0;
+          termDocFreq = -1;
         }
         postings.lastDocCodes[termID] = -1;
       } else {
@@ -441,7 +441,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
           final int code = freq.readVInt();
           if (!readTermFreq) {
             docID += code;
-            termDocFreq = 0;
+            termDocFreq = -1;
           } else {
             docID += code >>> 1;
             if ((code & 1) != 0) {
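The hunk above sits in the flush loop that decodes the in-memory postings stream. For readers unfamiliar with the encoding, the surrounding logic works roughly like this (an annotated paraphrase of the existing code, not new code in the commit):

  final int code = freq.readVInt();
  if (!readTermFreq) {
    docID += code;                    // freqs omitted: code is the raw doc delta
    termDocFreq = -1;                 // the fix: report -1, not 0, downstream
  } else {
    docID += code >>> 1;              // upper bits carry the doc delta
    if ((code & 1) != 0) {
      termDocFreq = 1;                // low bit set: freq == 1, no extra vInt
    } else {
      termDocFreq = freq.readVInt();  // otherwise an explicit freq follows
    }
  }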
@@ -469,7 +469,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
         // 2nd sweep does the real flush, but I suspect
         // that'd add too much time to flush.
         visitedDocs.set(docID);
-        postingsConsumer.startDoc(docID, termDocFreq);
+        postingsConsumer.startDoc(docID, writeTermFreq ? termDocFreq : -1);
         if (docID < delDocLimit) {
           // Mark it deleted. TODO: we could also skip
           // writing its postings; this would be
TestCodecs.java
@@ -154,7 +154,7 @@ public class TestCodecs extends LuceneTestCase {
       for(int i=0;i<docs.length;i++) {
         final int termDocFreq;
         if (field.omitTF) {
-          termDocFreq = 0;
+          termDocFreq = -1;
         } else {
           termDocFreq = positions[i].length;
         }
@@ -165,8 +165,8 @@ public class TestCodecs extends LuceneTestCase {
             final PositionData pos = positions[i][j];
             postingsConsumer.addPosition(pos.pos, pos.payload, -1, -1);
           }
-          postingsConsumer.finishDoc();
         }
+        postingsConsumer.finishDoc();
       }
       termsConsumer.finishTerm(text, new TermStats(docs.length, field.omitTF ? -1 : totTF));
       return totTF;
TestPostingsFormat.java
@@ -406,7 +406,7 @@ public class TestPostingsFormat extends LuceneTestCase {
         if (VERBOSE) {
           System.out.println("  " + docCount + ": docID=" + posting.docID + " freq=" + posting.positions.size());
         }
-        postingsConsumer.startDoc(posting.docID, posting.positions.size());
+        postingsConsumer.startDoc(posting.docID, doFreq ? posting.positions.size() : -1);
         seenDocs.set(posting.docID);
         if (doPos) {
           totalTF += posting.positions.size();
AssertingPostingsFormat.java
@@ -35,6 +35,7 @@ import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.OpenBitSet;
 
 /**
  * Just like {@link Lucene40PostingsFormat} but with additional asserts.
@@ -118,22 +119,23 @@ public class AssertingPostingsFormat extends PostingsFormat {
     private final FieldInfo fieldInfo;
     private BytesRef lastTerm = null;
     private TermsConsumerState state = TermsConsumerState.INITIAL;
+    private AssertingPostingsConsumer lastPostingsConsumer = null;
     private long sumTotalTermFreq = 0;
     private long sumDocFreq = 0;
+    private OpenBitSet visitedDocs = new OpenBitSet();
 
     AssertingTermsConsumer(TermsConsumer in, FieldInfo fieldInfo) {
       this.in = in;
       this.fieldInfo = fieldInfo;
     }
 
-    // TODO: AssertingPostingsConsumer
     @Override
     public PostingsConsumer startTerm(BytesRef text) throws IOException {
       // TODO: assert that if state == START (no finishTerm called), that no actual docs were fed.
       // TODO: this makes the api really confusing! we should try to clean this up!
-      assert state == TermsConsumerState.INITIAL || state == TermsConsumerState.START;
+      assert state == TermsConsumerState.INITIAL || state == TermsConsumerState.START && lastPostingsConsumer.docFreq == 0;
       state = TermsConsumerState.START;
       assert lastTerm == null || in.getComparator().compare(text, lastTerm) > 0;
       lastTerm = BytesRef.deepCopyOf(text);
-      return in.startTerm(text);
+      return lastPostingsConsumer = new AssertingPostingsConsumer(in.startTerm(text), fieldInfo, visitedDocs);
     }
 
     @Override
@@ -142,24 +144,30 @@ public class AssertingPostingsFormat extends PostingsFormat {
       state = TermsConsumerState.INITIAL;
       assert text.equals(lastTerm);
       assert stats.docFreq > 0; // otherwise, this method should not be called.
+      assert stats.docFreq == lastPostingsConsumer.docFreq;
       sumDocFreq += stats.docFreq;
       if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
         assert stats.totalTermFreq == -1;
       } else {
+        assert stats.totalTermFreq == lastPostingsConsumer.totalTermFreq;
         sumTotalTermFreq += stats.totalTermFreq;
       }
       in.finishTerm(text, stats);
     }
 
     @Override
     public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
-      assert state == TermsConsumerState.INITIAL || state == TermsConsumerState.START;
+      // TODO: assert that if state == START (no finishTerm called), that no actual docs were fed.
+      // TODO: this makes the api really confusing! we should try to clean this up!
+      assert state == TermsConsumerState.INITIAL || state == TermsConsumerState.START && lastPostingsConsumer.docFreq == 0;
       state = TermsConsumerState.FINISHED;
       assert docCount >= 0;
+      assert docCount == visitedDocs.cardinality();
       assert sumDocFreq >= docCount;
       assert sumDocFreq == this.sumDocFreq;
       if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
         assert sumTotalTermFreq == -1;
       } else {
         assert sumTotalTermFreq >= sumDocFreq;
+        assert sumTotalTermFreq == this.sumTotalTermFreq;
       }
       in.finish(sumTotalTermFreq, sumDocFreq, docCount);
     }
@@ -169,4 +177,79 @@ public class AssertingPostingsFormat extends PostingsFormat {
       return in.getComparator();
     }
   }
 
+  static enum PostingsConsumerState { INITIAL, START };
+
+  static class AssertingPostingsConsumer extends PostingsConsumer {
+    private final PostingsConsumer in;
+    private final FieldInfo fieldInfo;
+    private final OpenBitSet visitedDocs;
+    private PostingsConsumerState state = PostingsConsumerState.INITIAL;
+    private int freq;
+    private int positionCount;
+    private int lastPosition = 0;
+    private int lastStartOffset = 0;
+    int docFreq = 0;
+    long totalTermFreq = 0;
+
+    AssertingPostingsConsumer(PostingsConsumer in, FieldInfo fieldInfo, OpenBitSet visitedDocs) {
+      this.in = in;
+      this.fieldInfo = fieldInfo;
+      this.visitedDocs = visitedDocs;
+    }
+
+    @Override
+    public void startDoc(int docID, int freq) throws IOException {
+      assert state == PostingsConsumerState.INITIAL;
+      state = PostingsConsumerState.START;
+      assert docID >= 0;
+      if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
+        assert freq == -1;
+        this.freq = 0; // we don't expect any positions here
+      } else {
+        assert freq > 0;
+        this.freq = freq;
+        totalTermFreq += freq;
+      }
+      this.positionCount = 0;
+      this.lastPosition = 0;
+      this.lastStartOffset = 0;
+      docFreq++;
+      visitedDocs.set(docID);
+      in.startDoc(docID, freq);
+    }
+
+    @Override
+    public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
+      assert state == PostingsConsumerState.START;
+      assert positionCount < freq;
+      positionCount++;
+      assert position >= lastPosition || position == -1; /* we still allow -1 from old 3.x indexes */
+      lastPosition = position;
+      if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) {
+        assert startOffset >= 0;
+        assert startOffset >= lastStartOffset;
+        lastStartOffset = startOffset;
+        assert endOffset >= startOffset;
+      } else {
+        assert startOffset == -1;
+        assert endOffset == -1;
+      }
+      if (payload != null) {
+        assert fieldInfo.hasPayloads();
+      }
+      in.addPosition(position, payload, startOffset, endOffset);
+    }
+
+    @Override
+    public void finishDoc() throws IOException {
+      assert state == PostingsConsumerState.START;
+      state = PostingsConsumerState.INITIAL;
+      if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
+        assert positionCount == 0; // we should not have fed any positions!
+      } else {
+        assert positionCount == freq;
+      }
+      in.finishDoc();
+    }
+  }
 }
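Taken together, the wrapper validates the full consumer protocol before delegating. An illustrative use, hypothetical because AssertingPostingsConsumer is package-private inside AssertingPostingsFormat; delegate and fieldInfo are assumed to exist in scope (with positions indexed but no offsets), and the checks only fire with JVM assertions enabled (java -ea):

  OpenBitSet visitedDocs = new OpenBitSet();
  PostingsConsumer checked = new AssertingPostingsConsumer(delegate, fieldInfo, visitedDocs);
  checked.startDoc(42, 3);               // freq 3: exactly three positions must follow
  checked.addPosition(0, null, -1, -1);  // no payload; offsets not indexed
  checked.addPosition(5, null, -1, -1);  // positions must not decrease
  checked.addPosition(9, null, -1, -1);
  checked.finishDoc();                   // asserts positionCount == freq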