LUCENE-2862: use the stats instead of an extra byte in pulsing

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1059371 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2011-01-15 16:46:35 +00:00
parent bcbf1d462c
commit d08d464528
2 changed files with 10 additions and 9 deletions

PulsingPostingsReaderImpl.java

@@ -54,6 +54,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
   public void init(IndexInput termsIn) throws IOException {
     CodecUtil.checkHeader(termsIn, PulsingPostingsWriterImpl.CODEC,
       PulsingPostingsWriterImpl.VERSION_START, PulsingPostingsWriterImpl.VERSION_START);
+    maxPositions = termsIn.readVInt();
     wrappedPostingsReader.init(termsIn);
   }
@@ -115,8 +116,10 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
     termState.pendingIndexTerm |= isIndexTerm;
-    // TODO: wasteful to use whole byte for this (need just a 1 bit);
-    if (termsIn.readByte() == 1) {
+    // total TF, but in the omitTFAP case its computed based on docFreq.
+    long count = fieldInfo.omitTermFreqAndPositions ? termState.docFreq : termState.totalTermFreq;
+    if (count <= maxPositions) {
       // Inlined into terms dict -- just read the byte[] blob in,
       // but don't decode it now (we only decode when a DocsEnum

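The reader-side change above drops the per-term marker byte and instead checks the term's own stats against the maxPositions value read from the header in init(). A minimal sketch of that predicate, with simplified stand-in names rather than the real PulsingPostingsReaderImpl fields:

// Minimal sketch, not the actual Lucene reader: how "was this term inlined?"
// is now answered from per-term stats alone.
class PulsingInlineCheck {
  // Written by the writer into the terms-dict header, read back in init().
  private final int maxPositions;

  PulsingInlineCheck(int maxPositions) {
    this.maxPositions = maxPositions;
  }

  boolean isInlined(boolean omitTermFreqAndPositions, int docFreq, long totalTermFreq) {
    // With freqs/positions omitted, totalTermFreq is not tracked, so the
    // doc count stands in for the position count.
    long count = omitTermFreqAndPositions ? docFreq : totalTermFreq;
    return count <= maxPositions;
  }
}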
PulsingPostingsWriterImpl.java

@@ -27,10 +27,10 @@ import org.apache.lucene.store.RAMOutputStream;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CodecUtil;
-// TODO: we now pulse entirely according to docFreq of the
-// term; it might be better to eg pulse by "net bytes used"
-// so that a term that has only 1 doc but zillions of
-// positions would not be inlined. Though this is
+// TODO: we pulse based on total TF of the term,
+// it might be better to eg pulse by "net bytes used"
+// so that a term that has only 1 posting but a huge
+// payload would not be inlined. Though this is
 // presumably rare in practice...
 /** @lucene.experimental */
@@ -86,6 +86,7 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
   public void start(IndexOutput termsOut) throws IOException {
     this.termsOut = termsOut;
     CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
+    termsOut.writeVInt(pending.length); // encode maxPositions in header
     wrappedPostingsWriter.start(termsOut);
   }
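The VInt written here in start() is the same maxPositions value the reader picks up in init(); both ends must agree on where it sits in the header. A minimal, runnable sketch of that round trip, with java.io streams and a fixed-width int standing in for Lucene's IndexOutput/IndexInput and VInt encoding:

import java.io.*;

// Sketch only: java.io streams and a fixed-width int stand in for Lucene's
// IndexOutput/IndexInput and VInt encoding.
class PulsingHeaderSketch {
  static void writeStart(DataOutput termsOut, int maxPositions) throws IOException {
    termsOut.writeInt(maxPositions);   // writer: termsOut.writeVInt(pending.length)
  }

  static int readInit(DataInput termsIn) throws IOException {
    return termsIn.readInt();          // reader: maxPositions = termsIn.readVInt()
  }

  public static void main(String[] args) throws IOException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    writeStart(new DataOutputStream(bytes), 3);
    int maxPositions = readInit(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
    System.out.println("maxPositions = " + maxPositions); // prints 3
  }
}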
@@ -186,7 +187,6 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
     pendingIsIndexTerm |= isIndexTerm;
     if (pendingCount == -1) {
-      termsOut.writeByte((byte) 0);
       wrappedPostingsWriter.finishTerm(stats, pendingIsIndexTerm);
       pendingIsIndexTerm = false;
     } else {
@@ -195,8 +195,6 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
       // term, so we fully inline our postings data into
       // terms dict, now:
-      termsOut.writeByte((byte) 1);
       // TODO: it'd be better to share this encoding logic
       // in some inner codec that knows how to write a
       // single doc / single position, etc. This way if a
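The two removed writeByte calls are what the commit title refers to: whether a term spilled to the wrapped writer (pendingCount == -1) or stayed inlined is fully determined by whether its count exceeded the pending buffer size, which is exactly the predicate the reader now re-evaluates from docFreq/totalTermFreq. A hedged sketch, with hypothetical names rather than the real PulsingPostingsWriterImpl, of the spill behavior that makes this safe:

// Hedged sketch (hypothetical names): the writer gives up on inlining the
// moment its pending buffer of size maxPositions overflows, so
// "spilled vs. inlined" is a pure function of the term's final count --
// the same quantity the reader derives from the stats.
class PulsingSpillSketch {
  private final int[] pending;   // stands in for the buffered postings
  private int pendingCount;      // -1 once the term can no longer be inlined

  PulsingSpillSketch(int maxPositions) {
    this.pending = new int[maxPositions];
  }

  void addPosition(int position) {
    if (pendingCount == pending.length) {
      // Overflow: the real writer flushes the buffer to the wrapped codec
      // here; this sketch only tracks the resulting state.
      pendingCount = -1;
    }
    if (pendingCount != -1) {
      pending[pendingCount++] = position;
    }
  }

  // After finishTerm, spilled (pendingCount == -1) holds exactly when the
  // term's count exceeded maxPositions, which is what the reader checks.
  boolean isInlined() {
    return pendingCount != -1;
  }
}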