LUCENE-4473: encode low-freq terms offsets more efficiently in blockPF

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1396867 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-10-11 00:08:54 +00:00
parent c2c2717c3e
commit 295e46da1f
4 changed files with 28 additions and 10 deletions

View File

@ -75,6 +75,9 @@ Optimizations
failures in TestWeakIdentityMap disappear, too.
(Uwe Schindler, Mike McCandless, Robert Muir)
* LUCENE-4473: BlockPostingsFormat encodes offsets more efficiently
for low frequency terms (< 128 occurrences). (Robert Muir)
Build
* LUCENE-4451: Memory leak per unique thread caused by

View File

@ -306,10 +306,10 @@ import org.apache.lucene.util.packed.PackedInts;
* PayloadLength is stored at the current position, then it indicates the length
* of this payload. If PayloadLength is not stored, then this payload has the same
* length as the payload at the previous position.</li>
* <li>OffsetDelta is the difference between this position's startOffset from the
* <li>OffsetDelta/2 is the difference between this position's startOffset and that of the
* previous occurrence (or zero, if this is the first occurrence in this document).
* OffsetLength follows, encoding the difference between endOffset and startOffset.
* Offset data is only written for
* If OffsetDelta is odd, then the length (endOffset-startOffset) differs from that of the
* previous occurrence and an OffsetLength follows. Offset data is only written for
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.</li>
* </ul>
* </dd>

View File

@ -729,8 +729,10 @@ final class BlockPostingsReader extends PostingsReaderBase {
posDeltaBuffer[i] = code;
}
if (indexHasOffsets) {
posIn.readVInt();
posIn.readVInt();
if ((posIn.readVInt() & 1) != 0) {
// offset length changed
posIn.readVInt();
}
}
}
} else {
@ -1149,6 +1151,7 @@ final class BlockPostingsReader extends PostingsReaderBase {
// }
final int count = posIn.readVInt();
int payloadLength = 0;
int offsetLength = 0;
payloadByteUpto = 0;
for(int i=0;i<count;i++) {
int code = posIn.readVInt();
@ -1177,8 +1180,12 @@ final class BlockPostingsReader extends PostingsReaderBase {
// if (DEBUG) {
// System.out.println(" i=" + i + " read offsets from posIn.fp=" + posIn.getFilePointer());
// }
offsetStartDeltaBuffer[i] = posIn.readVInt();
offsetLengthBuffer[i] = posIn.readVInt();
int deltaCode = posIn.readVInt();
if ((deltaCode & 1) != 0) {
offsetLength = posIn.readVInt();
}
offsetStartDeltaBuffer[i] = deltaCode >>> 1;
offsetLengthBuffer[i] = offsetLength;
// if (DEBUG) {
// System.out.println(" startOffDelta=" + offsetStartDeltaBuffer[i] + " offsetLen=" + offsetLengthBuffer[i]);
// }

View File

@ -424,7 +424,8 @@ final class BlockPostingsWriter extends PostingsWriterBase {
// majority)
// vInt encode the remaining positions/payloads/offsets:
int lastPayloadLength = -1;
int lastPayloadLength = -1; // force first payload length to be written
int lastOffsetLength = -1; // force first offset length to be written
int payloadBytesReadUpto = 0;
for(int i=0;i<posBufferUpto;i++) {
final int posDelta = posDeltaBuffer[i];
@ -457,8 +458,15 @@ final class BlockPostingsWriter extends PostingsWriterBase {
// if (DEBUG) {
// System.out.println(" write offset @ pos.fp=" + posOut.getFilePointer());
// }
posOut.writeVInt(offsetStartDeltaBuffer[i]);
posOut.writeVInt(offsetLengthBuffer[i]);
int delta = offsetStartDeltaBuffer[i];
int length = offsetLengthBuffer[i];
if (length == lastOffsetLength) {
posOut.writeVInt(delta << 1);
} else {
posOut.writeVInt(delta << 1 | 1);
posOut.writeVInt(length);
lastOffsetLength = length;
}
}
}