LUCENE-2862: use the stats instead of an extra byte in pulsing

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1059371 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2011-01-15 16:46:35 +00:00
parent bcbf1d462c
commit d08d464528
2 changed files with 10 additions and 9 deletions
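The gist of the change, as a hedged sketch (illustrative Java, not the actual Lucene source; the class, method, and parameter names are stand-ins): the writer used to prefix each term's metadata with a 0/1 byte saying whether its postings were inlined into the terms dict, and the reader now re-derives that decision from the term stats it already has plus a maxPositions limit written once in the codec header.

    // Sketch only: how the reader-side decision can be recomputed from stats.
    class PulsingDecisionSketch {
      static boolean isInlined(boolean omitTFAP, int docFreq, long totalTermFreq, int maxPositions) {
        // When term freqs/positions are omitted, docFreq stands in for the total TF.
        long count = omitTFAP ? docFreq : totalTermFreq;
        return count <= maxPositions;
      }
    }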


@@ -54,6 +54,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
   public void init(IndexInput termsIn) throws IOException {
     CodecUtil.checkHeader(termsIn, PulsingPostingsWriterImpl.CODEC,
       PulsingPostingsWriterImpl.VERSION_START, PulsingPostingsWriterImpl.VERSION_START);
+    maxPositions = termsIn.readVInt();
     wrappedPostingsReader.init(termsIn);
   }
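A minimal sketch of the reader-side handshake the hunk above sets up, assuming the CodecUtil and IndexInput APIs already used in this diff (the class and method names are illustrative): validate the header, then read the pulsing threshold back as the vInt the writer appended to it.

    import java.io.IOException;
    import org.apache.lucene.store.IndexInput;
    import org.apache.lucene.util.CodecUtil;

    class PulsingHeaderReadSketch {
      // Returns the maxPositions threshold stored once per terms dict.
      static int readMaxPositions(IndexInput termsIn, String codec, int version) throws IOException {
        CodecUtil.checkHeader(termsIn, codec, version, version); // rejects a foreign or out-of-range header
        return termsIn.readVInt();                               // the threshold written right after the header
      }
    }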
@@ -115,8 +116,10 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
     termState.pendingIndexTerm |= isIndexTerm;
-    // TODO: wasteful to use whole byte for this (need just a 1 bit);
-    if (termsIn.readByte() == 1) {
+    // total TF, but in the omitTFAP case its computed based on docFreq.
+    long count = fieldInfo.omitTermFreqAndPositions ? termState.docFreq : termState.totalTermFreq;
+    if (count <= maxPositions) {
       // Inlined into terms dict -- just read the byte[] blob in,
       // but don't decode it now (we only decode when a DocsEnum


@@ -27,10 +27,10 @@ import org.apache.lucene.store.RAMOutputStream;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CodecUtil;
-// TODO: we now pulse entirely according to docFreq of the
-// term; it might be better to eg pulse by "net bytes used"
-// so that a term that has only 1 doc but zillions of
-// positions would not be inlined. Though this is
+// TODO: we pulse based on total TF of the term,
+// it might be better to eg pulse by "net bytes used"
+// so that a term that has only 1 posting but a huge
+// payload would not be inlined. Though this is
 // presumably rare in practice...
 /** @lucene.experimental */
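The rewritten TODO above is about the cutoff criterion, not the encoding: this patch pulses on total TF, while the comment floats "net bytes used" as a possibly better signal. A hedged sketch of the two predicates side by side (the names and the byte-based limit are invented for illustration):

    class PulsingCutoffSketch {
      // What this patch implements: inline when the total number of positions is small.
      static boolean pulseByTotalTf(long totalTermFreq, int maxPositions) {
        return totalTermFreq <= maxPositions;
      }
      // The alternative the TODO suggests: inline only when the encoded postings are small,
      // so a single posting with a huge payload would still stay out of the terms dict.
      static boolean pulseByNetBytes(long encodedPostingsBytes, long maxInlinedBytes) {
        return encodedPostingsBytes <= maxInlinedBytes;
      }
    }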
@@ -86,6 +86,7 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
   public void start(IndexOutput termsOut) throws IOException {
     this.termsOut = termsOut;
     CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
+    termsOut.writeVInt(pending.length); // encode maxPositions in header
     wrappedPostingsWriter.start(termsOut);
   }
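The writer-side mirror of the reader sketch earlier, again assuming the CodecUtil and IndexOutput APIs used in this diff (class and parameter names are illustrative): write the codec header, then the pulsing threshold as a vInt, which is what makes every per-term flag byte redundant.

    import java.io.IOException;
    import org.apache.lucene.store.IndexOutput;
    import org.apache.lucene.util.CodecUtil;

    class PulsingHeaderWriteSketch {
      static void writeMaxPositions(IndexOutput termsOut, String codec, int version, int maxPositions) throws IOException {
        CodecUtil.writeHeader(termsOut, codec, version); // codec name + version, once per terms dict
        termsOut.writeVInt(maxPositions);                // the limit the reader compares term stats against
      }
    }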
@@ -186,7 +187,6 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
     pendingIsIndexTerm |= isIndexTerm;
     if (pendingCount == -1) {
-      termsOut.writeByte((byte) 0);
       wrappedPostingsWriter.finishTerm(stats, pendingIsIndexTerm);
       pendingIsIndexTerm = false;
     } else {
@@ -195,8 +195,6 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
       // term, so we fully inline our postings data into
       // terms dict, now:
-      termsOut.writeByte((byte) 1);
       // TODO: it'd be better to share this encoding logic
       // in some inner codec that knows how to write a
       // single doc / single position, etc. This way if a
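Putting the last two hunks together, a self-contained sketch of the finishTerm control flow after this patch (the interface and member names are invented, not PulsingPostingsWriterImpl's): neither branch writes a marker byte any more, because whether a term was inlined is implied by its stats against the maxPositions header value.

    import java.io.IOException;

    class FinishTermSketch {
      interface WrappedWriter { void finishTerm() throws IOException; }

      private final WrappedWriter wrapped;
      FinishTermSketch(WrappedWriter wrapped) { this.wrapped = wrapped; }

      void finishTerm(int pendingCount, Runnable flushInlinedPostings) throws IOException {
        if (pendingCount == -1) {
          // The term overflowed the pulsing limit earlier and its postings were already
          // streamed to the wrapped writer; let it close out the term.
          wrapped.finishTerm();
        } else {
          // Everything fit under the limit: serialize the buffered postings straight
          // into the terms dict. The old leading 0/1 byte is simply gone.
          flushInlinedPostings.run();
        }
      }
    }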