From d08d4645287598f87e56ce62980a13e64623107a Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Sat, 15 Jan 2011 16:46:35 +0000 Subject: [PATCH] LUCENE-2862: use the stats instead of an extra byte in pulsing git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1059371 13f79535-47bb-0310-9956-ffa450edef68 --- .../codecs/pulsing/PulsingPostingsReaderImpl.java | 7 +++++-- .../codecs/pulsing/PulsingPostingsWriterImpl.java | 12 +++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java index ac497a4e5a8..8a36a22082a 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java @@ -54,6 +54,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { public void init(IndexInput termsIn) throws IOException { CodecUtil.checkHeader(termsIn, PulsingPostingsWriterImpl.CODEC, PulsingPostingsWriterImpl.VERSION_START, PulsingPostingsWriterImpl.VERSION_START); + maxPositions = termsIn.readVInt(); wrappedPostingsReader.init(termsIn); } @@ -115,8 +116,10 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { termState.pendingIndexTerm |= isIndexTerm; - // TODO: wasteful to use whole byte for this (need just a 1 bit); - if (termsIn.readByte() == 1) { + // total TF, but in the omitTFAP case it's computed based on docFreq. + long count = fieldInfo.omitTermFreqAndPositions ? 
termState.docFreq : termState.totalTermFreq; + + if (count <= maxPositions) { // Inlined into terms dict -- just read the byte[] blob in, // but don't decode it now (we only decode when a DocsEnum diff --git a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java index a018122c567..94c39c12173 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsWriterImpl.java @@ -27,10 +27,10 @@ import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CodecUtil; -// TODO: we now pulse entirely according to docFreq of the -// term; it might be better to eg pulse by "net bytes used" -// so that a term that has only 1 doc but zillions of -// positions would not be inlined. Though this is +// TODO: we pulse based on total TF of the term, +// it might be better to eg pulse by "net bytes used" +// so that a term that has only 1 posting but a huge +// payload would not be inlined. Though this is // presumably rare in practice... 
/** @lucene.experimental */ @@ -86,6 +86,7 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase { public void start(IndexOutput termsOut) throws IOException { this.termsOut = termsOut; CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT); + termsOut.writeVInt(pending.length); // encode maxPositions in header wrappedPostingsWriter.start(termsOut); } @@ -186,7 +187,6 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase { pendingIsIndexTerm |= isIndexTerm; if (pendingCount == -1) { - termsOut.writeByte((byte) 0); wrappedPostingsWriter.finishTerm(stats, pendingIsIndexTerm); pendingIsIndexTerm = false; } else { @@ -195,8 +195,6 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase { // term, so we fully inline our postings data into // terms dict, now: - termsOut.writeByte((byte) 1); - // TODO: it'd be better to share this encoding logic // in some inner codec that knows how to write a // single doc / single position, etc. This way if a