LUCENE-2862: use the stats instead of an extra byte in pulsing

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1059371 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2011-01-15 16:46:35 +00:00
parent bcbf1d462c
commit d08d464528
2 changed files with 10 additions and 9 deletions
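The gist of the change, as a hedged sketch (illustrative Java, not the actual Lucene source; the class, method, and parameter names are stand-ins): the writer used to prefix each term's metadata with a 0/1 byte saying whether its postings were inlined into the terms dict, and the reader now re-derives that decision from the term stats it already has plus a maxPositions limit written once in the codec header.

    // Sketch only: how the reader-side decision can be recomputed from stats.
    class PulsingDecisionSketch {
      static boolean isInlined(boolean omitTFAP, int docFreq, long totalTermFreq, int maxPositions) {
        // When term freqs/positions are omitted, docFreq stands in for the total TF.
        long count = omitTFAP ? docFreq : totalTermFreq;
        return count <= maxPositions;
      }
    }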


@@ -54,6 +54,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
   public void init(IndexInput termsIn) throws IOException {
     CodecUtil.checkHeader(termsIn, PulsingPostingsWriterImpl.CODEC,
       PulsingPostingsWriterImpl.VERSION_START, PulsingPostingsWriterImpl.VERSION_START);
+    maxPositions = termsIn.readVInt();
     wrappedPostingsReader.init(termsIn);
   }
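A minimal sketch of the reader-side handshake the hunk above sets up, assuming the CodecUtil and IndexInput APIs already used in this diff (the class and method names are illustrative): validate the header, then read the pulsing threshold back as the vInt the writer appended to it.

    import java.io.IOException;
    import org.apache.lucene.store.IndexInput;
    import org.apache.lucene.util.CodecUtil;

    class PulsingHeaderReadSketch {
      // Returns the maxPositions threshold stored once per terms dict.
      static int readMaxPositions(IndexInput termsIn, String codec, int version) throws IOException {
        CodecUtil.checkHeader(termsIn, codec, version, version); // rejects a foreign or out-of-range header
        return termsIn.readVInt();                               // the threshold written right after the header
      }
    }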
@@ -115,8 +116,10 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
     termState.pendingIndexTerm |= isIndexTerm;
-    // TODO: wasteful to use whole byte for this (need just a 1 bit);
-    if (termsIn.readByte() == 1) {
+    // total TF, but in the omitTFAP case its computed based on docFreq.
+    long count = fieldInfo.omitTermFreqAndPositions ? termState.docFreq : termState.totalTermFreq;
+    if (count <= maxPositions) {
       // Inlined into terms dict -- just read the byte[] blob in,
       // but don't decode it now (we only decode when a DocsEnum


@@ -27,10 +27,10 @@ import org.apache.lucene.store.RAMOutputStream;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CodecUtil;
-// TODO: we now pulse entirely according to docFreq of the
-// term; it might be better to eg pulse by "net bytes used"
-// so that a term that has only 1 doc but zillions of
-// positions would not be inlined. Though this is
+// TODO: we pulse based on total TF of the term,
+// it might be better to eg pulse by "net bytes used"
+// so that a term that has only 1 posting but a huge
+// payload would not be inlined. Though this is
 // presumably rare in practice...
 /** @lucene.experimental */
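The rewritten TODO above is about the cutoff criterion, not the encoding: this patch pulses on total TF, while the comment floats "net bytes used" as a possibly better signal. A hedged sketch of the two predicates side by side (the names and the byte-based limit are invented for illustration):

    class PulsingCutoffSketch {
      // What this patch implements: inline when the total number of positions is small.
      static boolean pulseByTotalTf(long totalTermFreq, int maxPositions) {
        return totalTermFreq <= maxPositions;
      }
      // The alternative the TODO suggests: inline only when the encoded postings are small,
      // so a single posting with a huge payload would still stay out of the terms dict.
      static boolean pulseByNetBytes(long encodedPostingsBytes, long maxInlinedBytes) {
        return encodedPostingsBytes <= maxInlinedBytes;
      }
    }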
@@ -86,6 +86,7 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
   public void start(IndexOutput termsOut) throws IOException {
     this.termsOut = termsOut;
     CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
+    termsOut.writeVInt(pending.length); // encode maxPositions in header
     wrappedPostingsWriter.start(termsOut);
   }
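The writer-side mirror of the reader sketch earlier, again assuming the CodecUtil and IndexOutput APIs used in this diff (class and parameter names are illustrative): write the codec header, then the pulsing threshold as a vInt, which is what makes every per-term flag byte redundant.

    import java.io.IOException;
    import org.apache.lucene.store.IndexOutput;
    import org.apache.lucene.util.CodecUtil;

    class PulsingHeaderWriteSketch {
      static void writeMaxPositions(IndexOutput termsOut, String codec, int version, int maxPositions) throws IOException {
        CodecUtil.writeHeader(termsOut, codec, version); // codec name + version, once per terms dict
        termsOut.writeVInt(maxPositions);                // the limit the reader compares term stats against
      }
    }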
@@ -186,7 +187,6 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
     pendingIsIndexTerm |= isIndexTerm;
     if (pendingCount == -1) {
-      termsOut.writeByte((byte) 0);
       wrappedPostingsWriter.finishTerm(stats, pendingIsIndexTerm);
       pendingIsIndexTerm = false;
     } else {
@@ -195,8 +195,6 @@ public final class PulsingPostingsWriterImpl extends PostingsWriterBase {
       // term, so we fully inline our postings data into
       // terms dict, now:
-      termsOut.writeByte((byte) 1);
       // TODO: it'd be better to share this encoding logic
       // in some inner codec that knows how to write a
       // single doc / single position, etc. This way if a
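Putting the last two hunks together, a self-contained sketch of the finishTerm control flow after this patch (the interface and member names are invented, not PulsingPostingsWriterImpl's): neither branch writes a marker byte any more, because whether a term was inlined is implied by its stats against the maxPositions header value.

    import java.io.IOException;

    class FinishTermSketch {
      interface WrappedWriter { void finishTerm() throws IOException; }

      private final WrappedWriter wrapped;
      FinishTermSketch(WrappedWriter wrapped) { this.wrapped = wrapped; }

      void finishTerm(int pendingCount, Runnable flushInlinedPostings) throws IOException {
        if (pendingCount == -1) {
          // The term overflowed the pulsing limit earlier and its postings were already
          // streamed to the wrapped writer; let it close out the term.
          wrapped.finishTerm();
        } else {
          // Everything fit under the limit: serialize the buffered postings straight
          // into the terms dict. The old leading 0/1 byte is simply gone.
          flushInlinedPostings.run();
        }
      }
    }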