LUCENE-2862: use the stats instead of an extra byte in pulsing

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1059371 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2011-01-15 16:46:35 +00:00
parent bcbf1d462c
commit d08d464528
2 changed files with 10 additions and 9 deletions

PulsingPostingsReaderImpl.java

@@ -54,6 +54,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
   public void init(IndexInput termsIn) throws IOException {
     CodecUtil.checkHeader(termsIn, PulsingPostingsWriterImpl.CODEC,
       PulsingPostingsWriterImpl.VERSION_START, PulsingPostingsWriterImpl.VERSION_START);
+    maxPositions = termsIn.readVInt();
     wrappedPostingsReader.init(termsIn);
   }
@@ -115,8 +116,10 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
     termState.pendingIndexTerm |= isIndexTerm;
-    // TODO: wasteful to use whole byte for this (need just a 1 bit);
-    if (termsIn.readByte() == 1) {
+    // total TF, but in the omitTFAP case its computed based on docFreq.
+    long count = fieldInfo.omitTermFreqAndPositions ? termState.docFreq : termState.totalTermFreq;
+    if (count <= maxPositions) {
       // Inlined into terms dict -- just read the byte[] blob in,
       // but don't decode it now (we only decode when a DocsEnum

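The reader-side change above drops the per-term marker byte and instead checks the term's own stats against the maxPositions value read from the header in init(). A minimal sketch of that predicate, with simplified stand-in names rather than the real PulsingPostingsReaderImpl fields:

// Minimal sketch, not the actual Lucene reader: how "was this term inlined?"
// is now answered from per-term stats alone.
class PulsingInlineCheck {
  // Written by the writer into the terms-dict header, read back in init().
  private final int maxPositions;

  PulsingInlineCheck(int maxPositions) {
    this.maxPositions = maxPositions;
  }

  boolean isInlined(boolean omitTermFreqAndPositions, int docFreq, long totalTermFreq) {
    // With freqs/positions omitted, totalTermFreq is not tracked, so the
    // doc count stands in for the position count.
    long count = omitTermFreqAndPositions ? docFreq : totalTermFreq;
    return count <= maxPositions;
  }
}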
PulsingPostingsWriterImpl.java

@@ -27,10 +27,10 @@ import org.apache.lucene.store.RAMOutputStream;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CodecUtil;
-// TODO: we now pulse entirely according to docFreq of the
-// term; it might be better to eg pulse by "net bytes used"
-// so that a term that has only 1 doc but zillions of
-// positions would not be inlined. Though this is
+// TODO: we pulse based on total TF of the term,
+// it might be better to eg pulse by "net bytes used"
+// so that a term that has only 1 posting but a huge
+// payload would not be inlined. Though this is
 // presumably rare in practice...
 /** @lucene.experimental */
@@ -86,6 +86,7 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
   public void start(IndexOutput termsOut) throws IOException {
     this.termsOut = termsOut;
     CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
+    termsOut.writeVInt(pending.length); // encode maxPositions in header
     wrappedPostingsWriter.start(termsOut);
   }
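The VInt written here in start() is the same maxPositions value the reader picks up in init(); both ends must agree on where it sits in the header. A minimal, runnable sketch of that round trip, with java.io streams and a fixed-width int standing in for Lucene's IndexOutput/IndexInput and VInt encoding:

import java.io.*;

// Sketch only: java.io streams and a fixed-width int stand in for Lucene's
// IndexOutput/IndexInput and VInt encoding.
class PulsingHeaderSketch {
  static void writeStart(DataOutput termsOut, int maxPositions) throws IOException {
    termsOut.writeInt(maxPositions);   // writer: termsOut.writeVInt(pending.length)
  }

  static int readInit(DataInput termsIn) throws IOException {
    return termsIn.readInt();          // reader: maxPositions = termsIn.readVInt()
  }

  public static void main(String[] args) throws IOException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    writeStart(new DataOutputStream(bytes), 3);
    int maxPositions = readInit(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
    System.out.println("maxPositions = " + maxPositions); // prints 3
  }
}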
@@ -186,7 +187,6 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
     pendingIsIndexTerm |= isIndexTerm;
     if (pendingCount == -1) {
-      termsOut.writeByte((byte) 0);
       wrappedPostingsWriter.finishTerm(stats, pendingIsIndexTerm);
       pendingIsIndexTerm = false;
     } else {
@@ -195,8 +195,6 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
       // term, so we fully inline our postings data into
       // terms dict, now:
-      termsOut.writeByte((byte) 1);
       // TODO: it'd be better to share this encoding logic
       // in some inner codec that knows how to write a
       // single doc / single position, etc. This way if a
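The two removed writeByte calls are what the commit title refers to: whether a term spilled to the wrapped writer (pendingCount == -1) or stayed inlined is fully determined by whether its count exceeded the pending buffer size, which is exactly the predicate the reader now re-evaluates from docFreq/totalTermFreq. A hedged sketch, with hypothetical names rather than the real PulsingPostingsWriterImpl, of the spill behavior that makes this safe:

// Hedged sketch (hypothetical names): the writer gives up on inlining the
// moment its pending buffer of size maxPositions overflows, so
// "spilled vs. inlined" is a pure function of the term's final count --
// the same quantity the reader derives from the stats.
class PulsingSpillSketch {
  private final int[] pending;   // stands in for the buffered postings
  private int pendingCount;      // -1 once the term can no longer be inlined

  PulsingSpillSketch(int maxPositions) {
    this.pending = new int[maxPositions];
  }

  void addPosition(int position) {
    if (pendingCount == pending.length) {
      // Overflow: the real writer flushes the buffer to the wrapped codec
      // here; this sketch only tracks the resulting state.
      pendingCount = -1;
    }
    if (pendingCount != -1) {
      pending[pendingCount++] = position;
    }
  }

  // After finishTerm, spilled (pendingCount == -1) holds exactly when the
  // term's count exceeded maxPositions, which is what the reader checks.
  boolean isInlined() {
    return pendingCount != -1;
  }
}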