LUCENE-4498: pulse docFreq=1 in 4.1 codec

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1401284 13f79535-47bb-0310-9956-ffa450edef68
2012-10-23 13:10:09 +00:00 · 2012-10-23 13:10:09 +00:00 · 2c462fa3a6
parent db2e268bec
commit 2c462fa3a6
4 changed files with 102 additions and 31 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -108,6 +108,10 @@ Optimizations
 * LUCENE-4497: Don't write PosVIntCount to the positions file in 
  Lucene41PostingsFormat, as its always totalTermFreq % BLOCK_SIZE. (Robert Muir)

+* LUCENE-4498: In Lucene41PostingsFormat, when a term appears in only one document, 
+  Instead of writing a file pointer to a VIntBlock containing the doc id, just 
+  write the doc id.  (Mike McCandless, Robert Muir)
+
 Build

 * LUCENE-4451: Memory leak per unique thread caused by 
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java
@ -127,10 +127,10 @@ import org.apache.lucene.util.packed.PackedInts;
 *
 * <ul>
 *   <li>Postings Metadata --&gt; Header, PackedBlockSize</li>
- *   <li>Term Metadata --&gt; DocFPDelta, PosFPDelta?, PosVIntBlockFPDelta?, PayFPDelta?, 
+ *   <li>Term Metadata --&gt; (DocFPDelta|SingletonDocID), PosFPDelta?, PosVIntBlockFPDelta?, PayFPDelta?, 
 *                            SkipFPDelta?</li>
 *   <li>Header, --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
- *   <li>PackedBlockSize --&gt; {@link DataOutput#writeVInt VInt}</li>
+ *   <li>PackedBlockSize, SingletonDocID --&gt; {@link DataOutput#writeVInt VInt}</li>
 *   <li>DocFPDelta, PosFPDelta, PayFPDelta, PosVIntBlockFPDelta, SkipFPDelta --&gt; {@link DataOutput#writeVLong VLong}</li>
 * </ul>
 * <p>Notes:</p>
@ -162,6 +162,9 @@ import org.apache.lucene.util.packed.PackedInts;
 *        file. In particular, it is the length of the TermFreq data.
 *        SkipDelta is only stored if DocFreq is not smaller than SkipMinimum
 *        (i.e. 8 in Lucene41PostingsFormat).</li>
+ *    <li>SingletonDocID is an optimization when a term only appears in one document. In this case, instead
+ *        of writing a file pointer to the .doc file (DocFPDelta), and then a VIntBlock at that location, the 
+ *        single document ID is written to the term dictionary.</li>
 * </ul>
 * </dd>
 * </dl>
@ -277,7 +280,7 @@ import org.apache.lucene.util.packed.PackedInts;
 *   <li>VIntBlock --&gt; &lt;PositionDelta[, PayloadLength?], PayloadData?, 
 *                        OffsetDelta?, OffsetLength?&gt;<sup>PosVIntCount</sup>
 *   <li>PackedPosDeltaBlock --&gt; {@link PackedInts PackedInts}</li>
- *   <li>PosVIntCount, PositionDelta, OffsetDelta, OffsetLength --&gt; 
+ *   <li>PositionDelta, OffsetDelta, OffsetLength --&gt; 
 *       {@link DataOutput#writeVInt VInt}</li>
 *   <li>PayloadData --&gt; {@link DataOutput#writeByte byte}<sup>PayLength</sup></li>
 * </ul>
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
@ -148,6 +148,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
    long payStartFP;
    long skipOffset;
    long lastPosBlockOffset;
+    // docid when there is a single pulsed posting, otherwise -1
+    // freq is always implicitly totalTermFreq in this case.
+    int singletonDocID;

    // Only used by the "primary" TermState -- clones don't
    // copy this (basically they are "transient"):
@ -170,6 +173,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
      payStartFP = other.payStartFP;
      lastPosBlockOffset = other.lastPosBlockOffset;
      skipOffset = other.skipOffset;
+      singletonDocID = other.singletonDocID;

      // Do not copy bytes, bytesReader (else TermState is
      // very heavy, ie drags around the entire block's
@ -179,7 +183,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {

    @Override
    public String toString() {
-      return super.toString() + " docStartFP=" + docStartFP + " posStartFP=" + posStartFP + " payStartFP=" + payStartFP + " lastPosBlockOffset=" + lastPosBlockOffset;
+      return super.toString() + " docStartFP=" + docStartFP + " posStartFP=" + posStartFP + " payStartFP=" + payStartFP + " lastPosBlockOffset=" + lastPosBlockOffset + " singletonDocID=" + singletonDocID;
    }
  }

@ -223,7 +227,13 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {

    final DataInput in = termState.bytesReader;
    if (isFirstTerm) {
+      if (termState.docFreq == 1) {
+        termState.singletonDocID = in.readVInt();
+        termState.docStartFP = 0;
+      } else {
+        termState.singletonDocID = -1;
        termState.docStartFP = in.readVLong();
+      }
      if (fieldHasPositions) {
        termState.posStartFP = in.readVLong();
        if (termState.totalTermFreq > BLOCK_SIZE) {
@ -238,7 +248,12 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
        }
      }
    } else {
+      if (termState.docFreq == 1) {
+        termState.singletonDocID = in.readVInt();
+      } else {
+        termState.singletonDocID = -1;
        termState.docStartFP += in.readVLong();
+      }
      if (fieldHasPositions) {
        termState.posStartFP += in.readVLong();
        if (termState.totalTermFreq > BLOCK_SIZE) {
@ -327,13 +342,14 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {

    final IndexInput startDocIn;

-    final IndexInput docIn;
+    IndexInput docIn;
    final boolean indexHasFreq;
    final boolean indexHasPos;
    final boolean indexHasOffsets;
    final boolean indexHasPayloads;

    private int docFreq;                              // number of docs in this posting list
+    private long totalTermFreq;                       // sum of freqs in this posting list (or docFreq when omitted)
    private int docUpto;                              // how many docs we've read
    private int doc;                                  // doc we last read
    private int accum;                                // accumulator for doc deltas
@ -354,10 +370,11 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
    private Bits liveDocs;
    
    private boolean needsFreq; // true if the caller actually needs frequencies
+    private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1

    public BlockDocsEnum(FieldInfo fieldInfo) throws IOException {
      this.startDocIn = Lucene41PostingsReader.this.docIn;
-      this.docIn = startDocIn.clone();
+      this.docIn = null;
      indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
      indexHasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
      indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
@ -378,9 +395,17 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
      //   System.out.println("  FPR.reset: termState=" + termState);
      // }
      docFreq = termState.docFreq;
+      totalTermFreq = indexHasFreq ? termState.totalTermFreq : docFreq;
      docTermStartFP = termState.docStartFP;
-      docIn.seek(docTermStartFP);
      skipOffset = termState.skipOffset;
+      singletonDocID = termState.singletonDocID;
+      if (docFreq > 1) {
+        if (docIn == null) {
+          // lazy init
+          docIn = startDocIn.clone();
+        }
+        docIn.seek(docTermStartFP);
+      }

      doc = -1;
      this.needsFreq = (flags & DocsEnum.FLAG_FREQS) != 0;
@ -425,6 +450,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
            forUtil.skipBlock(docIn); // skip over freqs
          }
        }
+      } else if (docFreq == 1) {
+        docDeltaBuffer[0] = singletonDocID;
+        freqBuffer[0] = (int) totalTermFreq;
      } else {
        // Read vInts:
        // if (DEBUG) {
@ -590,7 +618,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {

    final IndexInput startDocIn;

-    final IndexInput docIn;
+    IndexInput docIn;
    final IndexInput posIn;

    final boolean indexHasOffsets;
@ -635,10 +663,11 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
    private int nextSkipDoc;

    private Bits liveDocs;
+    private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
    
    public BlockDocsAndPositionsEnum(FieldInfo fieldInfo) throws IOException {
      this.startDocIn = Lucene41PostingsReader.this.docIn;
-      this.docIn = startDocIn.clone();
+      this.docIn = null;
      this.posIn = Lucene41PostingsReader.this.posIn.clone();
      encoded = new byte[MAX_ENCODED_SIZE];
      indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
@ -660,9 +689,16 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
      docTermStartFP = termState.docStartFP;
      posTermStartFP = termState.posStartFP;
      payTermStartFP = termState.payStartFP;
-      docIn.seek(docTermStartFP);
      skipOffset = termState.skipOffset;
      totalTermFreq = termState.totalTermFreq;
+      singletonDocID = termState.singletonDocID;
+      if (docFreq > 1) {
+        if (docIn == null) {
+          // lazy init
+          docIn = startDocIn.clone();
+        }
+        docIn.seek(docTermStartFP);
+      }
      posPendingFP = posTermStartFP;
      posPendingCount = 0;
      if (termState.totalTermFreq < BLOCK_SIZE) {
@ -705,6 +741,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
        //   System.out.println("    fill freq block from fp=" + docIn.getFilePointer());
        // }
        forUtil.readBlock(docIn, encoded, freqBuffer);
+      } else if (docFreq == 1) {
+        docDeltaBuffer[0] = singletonDocID;
+        freqBuffer[0] = (int) totalTermFreq;
      } else {
        // Read vInts:
        // if (DEBUG) {
@ -1002,7 +1041,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {

    final IndexInput startDocIn;

-    final IndexInput docIn;
+    IndexInput docIn;
    final IndexInput posIn;
    final IndexInput payIn;
    final BytesRef payload;
@ -1056,10 +1095,11 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
    
    private boolean needsOffsets; // true if we actually need offsets
    private boolean needsPayloads; // true if we actually need payloads
+    private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
    
    public EverythingEnum(FieldInfo fieldInfo) throws IOException {
      this.startDocIn = Lucene41PostingsReader.this.docIn;
-      this.docIn = startDocIn.clone();
+      this.docIn = null;
      this.posIn = Lucene41PostingsReader.this.posIn.clone();
      this.payIn = Lucene41PostingsReader.this.payIn.clone();
      encoded = new byte[MAX_ENCODED_SIZE];
@ -1101,9 +1141,16 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
      docTermStartFP = termState.docStartFP;
      posTermStartFP = termState.posStartFP;
      payTermStartFP = termState.payStartFP;
-      docIn.seek(docTermStartFP);
      skipOffset = termState.skipOffset;
      totalTermFreq = termState.totalTermFreq;
+      singletonDocID = termState.singletonDocID;
+      if (docFreq > 1) {
+        if (docIn == null) {
+          // lazy init
+          docIn = startDocIn.clone();
+        }
+        docIn.seek(docTermStartFP);
+      }
      posPendingFP = posTermStartFP;
      payPendingFP = payTermStartFP;
      posPendingCount = 0;
@ -1150,6 +1197,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
        //   System.out.println("    fill freq block from fp=" + docIn.getFilePointer());
        // }
        forUtil.readBlock(docIn, encoded, freqBuffer);
+      } else if (docFreq == 1) {
+        docDeltaBuffer[0] = singletonDocID;
+        freqBuffer[0] = (int) totalTermFreq;
      } else {
        // if (DEBUG) {
        //   System.out.println("    fill last vInt doc block from fp=" + docIn.getFilePointer());
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java
@ -354,13 +354,15 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
    public final long payStartFP;
    public final long skipOffset;
    public final long lastPosBlockOffset;
+    public final int singletonDocID;

-    public PendingTerm(long docStartFP, long posStartFP, long payStartFP, long skipOffset, long lastPosBlockOffset) {
+    public PendingTerm(long docStartFP, long posStartFP, long payStartFP, long skipOffset, long lastPosBlockOffset, int singletonDocID) {
      this.docStartFP = docStartFP;
      this.posStartFP = posStartFP;
      this.payStartFP = payStartFP;
      this.skipOffset = skipOffset;
      this.lastPosBlockOffset = lastPosBlockOffset;
+      this.singletonDocID = singletonDocID;
    }
  }

@ -385,6 +387,13 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
    //   }
    // }
    
+    // docFreq == 1, don't write the single docid/freq to a separate file along with a pointer to it.
+    final int singletonDocID;
+    if (stats.docFreq == 1) {
+      // pulse the singleton docid into the term dictionary, freq is implicitly totalTermFreq
+      singletonDocID = docDeltaBuffer[0];
+    } else {
+      singletonDocID = -1;
      // vInt encode the remaining doc deltas and freqs:
      for(int i=0;i<docBufferUpto;i++) {
        final int docDelta = docDeltaBuffer[i];
@ -398,6 +407,7 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
          docOut.writeVInt(freq);
        }
      }
+    }

    final long lastPosBlockOffset;

@ -507,7 +517,7 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
    //   System.out.println("  payStartFP=" + payStartFP);
    // }

-    pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset));
+    pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset, singletonDocID));
    docBufferUpto = 0;
    posBufferUpto = 0;
    lastDocID = 0;
@ -535,8 +545,12 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
    for(int idx=limit-count; idx<limit; idx++) {
      PendingTerm term = pendingTerms.get(idx);

+      if (term.singletonDocID == -1) {
        bytesWriter.writeVLong(term.docStartFP - lastDocStartFP);
        lastDocStartFP = term.docStartFP;
+      } else {
+        bytesWriter.writeVInt(term.singletonDocID);
+      }

      if (fieldHasPositions) {
        bytesWriter.writeVLong(term.posStartFP - lastPosStartFP);