LUCENE-4498: pulse docFreq=1 in 4.1 codec

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1401284 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-10-23 13:10:09 +00:00
parent db2e268bec
commit 2c462fa3a6
4 changed files with 102 additions and 31 deletions

View File

@ -108,6 +108,10 @@ Optimizations
* LUCENE-4497: Don't write PosVIntCount to the positions file in * LUCENE-4497: Don't write PosVIntCount to the positions file in
Lucene41PostingsFormat, as it's always totalTermFreq % BLOCK_SIZE. (Robert Muir) Lucene41PostingsFormat, as it's always totalTermFreq % BLOCK_SIZE. (Robert Muir)
* LUCENE-4498: In Lucene41PostingsFormat, when a term appears in only one document,
instead of writing a file pointer to a VIntBlock containing the doc id, just
write the doc id. (Mike McCandless, Robert Muir)
Build Build
* LUCENE-4451: Memory leak per unique thread caused by * LUCENE-4451: Memory leak per unique thread caused by

View File

@ -127,10 +127,10 @@ import org.apache.lucene.util.packed.PackedInts;
* *
* <ul> * <ul>
* <li>Postings Metadata --&gt; Header, PackedBlockSize</li> * <li>Postings Metadata --&gt; Header, PackedBlockSize</li>
* <li>Term Metadata --&gt; DocFPDelta, PosFPDelta?, PosVIntBlockFPDelta?, PayFPDelta?, * <li>Term Metadata --&gt; (DocFPDelta|SingletonDocID), PosFPDelta?, PosVIntBlockFPDelta?, PayFPDelta?,
* SkipFPDelta?</li> * SkipFPDelta?</li>
* <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li> * <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>PackedBlockSize --&gt; {@link DataOutput#writeVInt VInt}</li> * <li>PackedBlockSize, SingletonDocID --&gt; {@link DataOutput#writeVInt VInt}</li>
* <li>DocFPDelta, PosFPDelta, PayFPDelta, PosVIntBlockFPDelta, SkipFPDelta --&gt; {@link DataOutput#writeVLong VLong}</li> * <li>DocFPDelta, PosFPDelta, PayFPDelta, PosVIntBlockFPDelta, SkipFPDelta --&gt; {@link DataOutput#writeVLong VLong}</li>
* </ul> * </ul>
* <p>Notes:</p> * <p>Notes:</p>
@ -162,6 +162,9 @@ import org.apache.lucene.util.packed.PackedInts;
* file. In particular, it is the length of the TermFreq data. * file. In particular, it is the length of the TermFreq data.
* SkipDelta is only stored if DocFreq is not smaller than SkipMinimum * SkipDelta is only stored if DocFreq is not smaller than SkipMinimum
* (i.e. 8 in Lucene41PostingsFormat).</li> * (i.e. 8 in Lucene41PostingsFormat).</li>
* <li>SingletonDocID is an optimization when a term only appears in one document. In this case, instead
* of writing a file pointer to the .doc file (DocFPDelta), and then a VIntBlock at that location, the
* single document ID is written to the term dictionary.</li>
* </ul> * </ul>
* </dd> * </dd>
* </dl> * </dl>
@ -277,7 +280,7 @@ import org.apache.lucene.util.packed.PackedInts;
* <li>VIntBlock --&gt; &lt;PositionDelta[, PayloadLength?], PayloadData?, * <li>VIntBlock --&gt; &lt;PositionDelta[, PayloadLength?], PayloadData?,
* OffsetDelta?, OffsetLength?&gt;<sup>PosVIntCount</sup> * OffsetDelta?, OffsetLength?&gt;<sup>PosVIntCount</sup>
* <li>PackedPosDeltaBlock --&gt; {@link PackedInts PackedInts}</li> * <li>PackedPosDeltaBlock --&gt; {@link PackedInts PackedInts}</li>
* <li>PosVIntCount, PositionDelta, OffsetDelta, OffsetLength --&gt; * <li>PositionDelta, OffsetDelta, OffsetLength --&gt;
* {@link DataOutput#writeVInt VInt}</li> * {@link DataOutput#writeVInt VInt}</li>
* <li>PayloadData --&gt; {@link DataOutput#writeByte byte}<sup>PayLength</sup></li> * <li>PayloadData --&gt; {@link DataOutput#writeByte byte}<sup>PayLength</sup></li>
* </ul> * </ul>

View File

@ -148,6 +148,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
long payStartFP; long payStartFP;
long skipOffset; long skipOffset;
long lastPosBlockOffset; long lastPosBlockOffset;
// docid when there is a single pulsed posting, otherwise -1
// freq is always implicitly totalTermFreq in this case.
int singletonDocID;
// Only used by the "primary" TermState -- clones don't // Only used by the "primary" TermState -- clones don't
// copy this (basically they are "transient"): // copy this (basically they are "transient"):
@ -170,6 +173,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
payStartFP = other.payStartFP; payStartFP = other.payStartFP;
lastPosBlockOffset = other.lastPosBlockOffset; lastPosBlockOffset = other.lastPosBlockOffset;
skipOffset = other.skipOffset; skipOffset = other.skipOffset;
singletonDocID = other.singletonDocID;
// Do not copy bytes, bytesReader (else TermState is // Do not copy bytes, bytesReader (else TermState is
// very heavy, ie drags around the entire block's // very heavy, ie drags around the entire block's
@ -179,7 +183,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
@Override @Override
public String toString() { public String toString() {
return super.toString() + " docStartFP=" + docStartFP + " posStartFP=" + posStartFP + " payStartFP=" + payStartFP + " lastPosBlockOffset=" + lastPosBlockOffset; return super.toString() + " docStartFP=" + docStartFP + " posStartFP=" + posStartFP + " payStartFP=" + payStartFP + " lastPosBlockOffset=" + lastPosBlockOffset + " singletonDocID=" + singletonDocID;
} }
} }
@ -223,7 +227,13 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
final DataInput in = termState.bytesReader; final DataInput in = termState.bytesReader;
if (isFirstTerm) { if (isFirstTerm) {
if (termState.docFreq == 1) {
termState.singletonDocID = in.readVInt();
termState.docStartFP = 0;
} else {
termState.singletonDocID = -1;
termState.docStartFP = in.readVLong(); termState.docStartFP = in.readVLong();
}
if (fieldHasPositions) { if (fieldHasPositions) {
termState.posStartFP = in.readVLong(); termState.posStartFP = in.readVLong();
if (termState.totalTermFreq > BLOCK_SIZE) { if (termState.totalTermFreq > BLOCK_SIZE) {
@ -238,7 +248,12 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
} }
} }
} else { } else {
if (termState.docFreq == 1) {
termState.singletonDocID = in.readVInt();
} else {
termState.singletonDocID = -1;
termState.docStartFP += in.readVLong(); termState.docStartFP += in.readVLong();
}
if (fieldHasPositions) { if (fieldHasPositions) {
termState.posStartFP += in.readVLong(); termState.posStartFP += in.readVLong();
if (termState.totalTermFreq > BLOCK_SIZE) { if (termState.totalTermFreq > BLOCK_SIZE) {
@ -327,13 +342,14 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
final IndexInput startDocIn; final IndexInput startDocIn;
final IndexInput docIn; IndexInput docIn;
final boolean indexHasFreq; final boolean indexHasFreq;
final boolean indexHasPos; final boolean indexHasPos;
final boolean indexHasOffsets; final boolean indexHasOffsets;
final boolean indexHasPayloads; final boolean indexHasPayloads;
private int docFreq; // number of docs in this posting list private int docFreq; // number of docs in this posting list
private long totalTermFreq; // sum of freqs in this posting list (or docFreq when omitted)
private int docUpto; // how many docs we've read private int docUpto; // how many docs we've read
private int doc; // doc we last read private int doc; // doc we last read
private int accum; // accumulator for doc deltas private int accum; // accumulator for doc deltas
@ -354,10 +370,11 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
private Bits liveDocs; private Bits liveDocs;
private boolean needsFreq; // true if the caller actually needs frequencies private boolean needsFreq; // true if the caller actually needs frequencies
private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
public BlockDocsEnum(FieldInfo fieldInfo) throws IOException { public BlockDocsEnum(FieldInfo fieldInfo) throws IOException {
this.startDocIn = Lucene41PostingsReader.this.docIn; this.startDocIn = Lucene41PostingsReader.this.docIn;
this.docIn = startDocIn.clone(); this.docIn = null;
indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
indexHasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; indexHasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
@ -378,9 +395,17 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
// System.out.println(" FPR.reset: termState=" + termState); // System.out.println(" FPR.reset: termState=" + termState);
// } // }
docFreq = termState.docFreq; docFreq = termState.docFreq;
totalTermFreq = indexHasFreq ? termState.totalTermFreq : docFreq;
docTermStartFP = termState.docStartFP; docTermStartFP = termState.docStartFP;
docIn.seek(docTermStartFP);
skipOffset = termState.skipOffset; skipOffset = termState.skipOffset;
singletonDocID = termState.singletonDocID;
if (docFreq > 1) {
if (docIn == null) {
// lazy init
docIn = startDocIn.clone();
}
docIn.seek(docTermStartFP);
}
doc = -1; doc = -1;
this.needsFreq = (flags & DocsEnum.FLAG_FREQS) != 0; this.needsFreq = (flags & DocsEnum.FLAG_FREQS) != 0;
@ -425,6 +450,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
forUtil.skipBlock(docIn); // skip over freqs forUtil.skipBlock(docIn); // skip over freqs
} }
} }
} else if (docFreq == 1) {
docDeltaBuffer[0] = singletonDocID;
freqBuffer[0] = (int) totalTermFreq;
} else { } else {
// Read vInts: // Read vInts:
// if (DEBUG) { // if (DEBUG) {
@ -590,7 +618,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
final IndexInput startDocIn; final IndexInput startDocIn;
final IndexInput docIn; IndexInput docIn;
final IndexInput posIn; final IndexInput posIn;
final boolean indexHasOffsets; final boolean indexHasOffsets;
@ -635,10 +663,11 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
private int nextSkipDoc; private int nextSkipDoc;
private Bits liveDocs; private Bits liveDocs;
private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
public BlockDocsAndPositionsEnum(FieldInfo fieldInfo) throws IOException { public BlockDocsAndPositionsEnum(FieldInfo fieldInfo) throws IOException {
this.startDocIn = Lucene41PostingsReader.this.docIn; this.startDocIn = Lucene41PostingsReader.this.docIn;
this.docIn = startDocIn.clone(); this.docIn = null;
this.posIn = Lucene41PostingsReader.this.posIn.clone(); this.posIn = Lucene41PostingsReader.this.posIn.clone();
encoded = new byte[MAX_ENCODED_SIZE]; encoded = new byte[MAX_ENCODED_SIZE];
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
@ -660,9 +689,16 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
docTermStartFP = termState.docStartFP; docTermStartFP = termState.docStartFP;
posTermStartFP = termState.posStartFP; posTermStartFP = termState.posStartFP;
payTermStartFP = termState.payStartFP; payTermStartFP = termState.payStartFP;
docIn.seek(docTermStartFP);
skipOffset = termState.skipOffset; skipOffset = termState.skipOffset;
totalTermFreq = termState.totalTermFreq; totalTermFreq = termState.totalTermFreq;
singletonDocID = termState.singletonDocID;
if (docFreq > 1) {
if (docIn == null) {
// lazy init
docIn = startDocIn.clone();
}
docIn.seek(docTermStartFP);
}
posPendingFP = posTermStartFP; posPendingFP = posTermStartFP;
posPendingCount = 0; posPendingCount = 0;
if (termState.totalTermFreq < BLOCK_SIZE) { if (termState.totalTermFreq < BLOCK_SIZE) {
@ -705,6 +741,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
// System.out.println(" fill freq block from fp=" + docIn.getFilePointer()); // System.out.println(" fill freq block from fp=" + docIn.getFilePointer());
// } // }
forUtil.readBlock(docIn, encoded, freqBuffer); forUtil.readBlock(docIn, encoded, freqBuffer);
} else if (docFreq == 1) {
docDeltaBuffer[0] = singletonDocID;
freqBuffer[0] = (int) totalTermFreq;
} else { } else {
// Read vInts: // Read vInts:
// if (DEBUG) { // if (DEBUG) {
@ -1002,7 +1041,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
final IndexInput startDocIn; final IndexInput startDocIn;
final IndexInput docIn; IndexInput docIn;
final IndexInput posIn; final IndexInput posIn;
final IndexInput payIn; final IndexInput payIn;
final BytesRef payload; final BytesRef payload;
@ -1056,10 +1095,11 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
private boolean needsOffsets; // true if we actually need offsets private boolean needsOffsets; // true if we actually need offsets
private boolean needsPayloads; // true if we actually need payloads private boolean needsPayloads; // true if we actually need payloads
private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
public EverythingEnum(FieldInfo fieldInfo) throws IOException { public EverythingEnum(FieldInfo fieldInfo) throws IOException {
this.startDocIn = Lucene41PostingsReader.this.docIn; this.startDocIn = Lucene41PostingsReader.this.docIn;
this.docIn = startDocIn.clone(); this.docIn = null;
this.posIn = Lucene41PostingsReader.this.posIn.clone(); this.posIn = Lucene41PostingsReader.this.posIn.clone();
this.payIn = Lucene41PostingsReader.this.payIn.clone(); this.payIn = Lucene41PostingsReader.this.payIn.clone();
encoded = new byte[MAX_ENCODED_SIZE]; encoded = new byte[MAX_ENCODED_SIZE];
@ -1101,9 +1141,16 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
docTermStartFP = termState.docStartFP; docTermStartFP = termState.docStartFP;
posTermStartFP = termState.posStartFP; posTermStartFP = termState.posStartFP;
payTermStartFP = termState.payStartFP; payTermStartFP = termState.payStartFP;
docIn.seek(docTermStartFP);
skipOffset = termState.skipOffset; skipOffset = termState.skipOffset;
totalTermFreq = termState.totalTermFreq; totalTermFreq = termState.totalTermFreq;
singletonDocID = termState.singletonDocID;
if (docFreq > 1) {
if (docIn == null) {
// lazy init
docIn = startDocIn.clone();
}
docIn.seek(docTermStartFP);
}
posPendingFP = posTermStartFP; posPendingFP = posTermStartFP;
payPendingFP = payTermStartFP; payPendingFP = payTermStartFP;
posPendingCount = 0; posPendingCount = 0;
@ -1150,6 +1197,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
// System.out.println(" fill freq block from fp=" + docIn.getFilePointer()); // System.out.println(" fill freq block from fp=" + docIn.getFilePointer());
// } // }
forUtil.readBlock(docIn, encoded, freqBuffer); forUtil.readBlock(docIn, encoded, freqBuffer);
} else if (docFreq == 1) {
docDeltaBuffer[0] = singletonDocID;
freqBuffer[0] = (int) totalTermFreq;
} else { } else {
// if (DEBUG) { // if (DEBUG) {
// System.out.println(" fill last vInt doc block from fp=" + docIn.getFilePointer()); // System.out.println(" fill last vInt doc block from fp=" + docIn.getFilePointer());

View File

@ -354,13 +354,15 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
public final long payStartFP; public final long payStartFP;
public final long skipOffset; public final long skipOffset;
public final long lastPosBlockOffset; public final long lastPosBlockOffset;
public final int singletonDocID;
public PendingTerm(long docStartFP, long posStartFP, long payStartFP, long skipOffset, long lastPosBlockOffset) { public PendingTerm(long docStartFP, long posStartFP, long payStartFP, long skipOffset, long lastPosBlockOffset, int singletonDocID) {
this.docStartFP = docStartFP; this.docStartFP = docStartFP;
this.posStartFP = posStartFP; this.posStartFP = posStartFP;
this.payStartFP = payStartFP; this.payStartFP = payStartFP;
this.skipOffset = skipOffset; this.skipOffset = skipOffset;
this.lastPosBlockOffset = lastPosBlockOffset; this.lastPosBlockOffset = lastPosBlockOffset;
this.singletonDocID = singletonDocID;
} }
} }
@ -385,6 +387,13 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
// } // }
// } // }
// docFreq == 1, don't write the single docid/freq to a separate file along with a pointer to it.
final int singletonDocID;
if (stats.docFreq == 1) {
// pulse the singleton docid into the term dictionary, freq is implicitly totalTermFreq
singletonDocID = docDeltaBuffer[0];
} else {
singletonDocID = -1;
// vInt encode the remaining doc deltas and freqs: // vInt encode the remaining doc deltas and freqs:
for(int i=0;i<docBufferUpto;i++) { for(int i=0;i<docBufferUpto;i++) {
final int docDelta = docDeltaBuffer[i]; final int docDelta = docDeltaBuffer[i];
@ -398,6 +407,7 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
docOut.writeVInt(freq); docOut.writeVInt(freq);
} }
} }
}
final long lastPosBlockOffset; final long lastPosBlockOffset;
@ -507,7 +517,7 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
// System.out.println(" payStartFP=" + payStartFP); // System.out.println(" payStartFP=" + payStartFP);
// } // }
pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset)); pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset, singletonDocID));
docBufferUpto = 0; docBufferUpto = 0;
posBufferUpto = 0; posBufferUpto = 0;
lastDocID = 0; lastDocID = 0;
@ -535,8 +545,12 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
for(int idx=limit-count; idx<limit; idx++) { for(int idx=limit-count; idx<limit; idx++) {
PendingTerm term = pendingTerms.get(idx); PendingTerm term = pendingTerms.get(idx);
if (term.singletonDocID == -1) {
bytesWriter.writeVLong(term.docStartFP - lastDocStartFP); bytesWriter.writeVLong(term.docStartFP - lastDocStartFP);
lastDocStartFP = term.docStartFP; lastDocStartFP = term.docStartFP;
} else {
bytesWriter.writeVInt(term.singletonDocID);
}
if (fieldHasPositions) { if (fieldHasPositions) {
bytesWriter.writeVLong(term.posStartFP - lastPosStartFP); bytesWriter.writeVLong(term.posStartFP - lastPosStartFP);