diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java index 173a0febd46..59b40c47eec 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java @@ -190,14 +190,15 @@ import org.apache.lucene.util.fst.FST; // javadocs * sequence of VInts instead:
*7,4
*DocSkip records the document number before every SkipInterval th - * document in TermFreqs. If payloads are disabled for the term's field, then + * document in TermFreqs. If payloads and offsets are disabled for the term's field, then * DocSkip represents the difference from the previous value in the sequence. If - * payloads are enabled for the term's field, then DocSkip/2 represents the - * difference from the previous value in the sequence. If payloads are enabled and - * DocSkip is odd, then PayloadLength is stored indicating the length of the last - * payload before the SkipIntervalth document in TermPositions. If offsets - * are enabled, then OffsetLength is stored indicating the length of the last offset - * (endOffset-startOffset). + * payloads and/or offsets are enabled for the term's field, then DocSkip/2 represents the + * difference from the previous value in the sequence. In this case when + * DocSkip is odd, then PayloadLength and/or OffsetLength are stored indicating the length of + * the last payload/offset before the SkipIntervalth document in TermPositions.
+ *PayloadLength indicates the length of the last payload.
+ *OffsetLength indicates the length of the last offset (endOffset-startOffset).
+ ** FreqSkip and ProxSkip record the position of every SkipInterval th * entry in FreqFile and ProxFile, respectively. File positions are relative to * the start of TermFreqs and Positions, to the previous SkipDatum in the diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java index 75dbca28592..a07f1fb246d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java @@ -138,11 +138,11 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase { //if (DEBUG) System.out.println("SPW: startTerm freqOut.fp=" + freqStart); if (proxOut != null) { proxStart = proxOut.getFilePointer(); - // force first payload to write its length - lastPayloadLength = -1; - // force first offset to write its length - lastOffsetLength = -1; } + // force first payload to write its length + lastPayloadLength = -1; + // force first offset to write its length + lastOffsetLength = -1; skipListWriter.resetSkip(); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListReader.java index 754b839d7b2..741a8273549 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListReader.java @@ -111,25 +111,26 @@ public class Lucene40SkipListReader extends MultiLevelSkipListReader { @Override protected int readSkipData(int level, IndexInput skipStream) throws IOException { int delta; - if (currentFieldStoresPayloads) { - // the current field stores payloads. + if (currentFieldStoresPayloads || currentFieldStoresOffsets) { + // the current field stores payloads and/or offsets. // if the doc delta is odd then we have - // to read the current payload length - // because it differs from the length of the - // previous payload + // to read the current payload/offset lengths + // because it differs from the lengths of the + // previous payload/offset delta = skipStream.readVInt(); if ((delta & 1) != 0) { - payloadLength[level] = skipStream.readVInt(); + if (currentFieldStoresPayloads) { + payloadLength[level] = skipStream.readVInt(); + } + if (currentFieldStoresOffsets) { + offsetLength[level] = skipStream.readVInt(); + } } delta >>>= 1; } else { delta = skipStream.readVInt(); } - if (currentFieldStoresOffsets) { - offsetLength[level] = skipStream.readVInt(); - } - freqPointer[level] += skipStream.readVInt(); proxPointer[level] += skipStream.readVInt(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListWriter.java index d06a7214374..e7167fd2bb8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SkipListWriter.java @@ -32,6 +32,7 @@ import org.apache.lucene.codecs.MultiLevelSkipListWriter; public class Lucene40SkipListWriter extends MultiLevelSkipListWriter { private int[] lastSkipDoc; private int[] lastSkipPayloadLength; + private int[] lastSkipOffsetLength; private long[] lastSkipFreqPointer; private long[] lastSkipProxPointer; @@ -53,6 +54,7 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter { lastSkipDoc = new int[numberOfSkipLevels]; lastSkipPayloadLength = new int[numberOfSkipLevels]; + lastSkipOffsetLength = new int[numberOfSkipLevels]; lastSkipFreqPointer = new long[numberOfSkipLevels]; lastSkipProxPointer = new long[numberOfSkipLevels]; } @@ -61,6 +63,8 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter { * Sets the values for the current skip data. */ public void setSkipData(int doc, boolean storePayloads, int payloadLength, boolean storeOffsets, int offsetLength) { + assert storePayloads || payloadLength == -1; + assert storeOffsets || offsetLength == -1; this.curDoc = doc; this.curStorePayloads = storePayloads; this.curPayloadLength = payloadLength; @@ -76,6 +80,7 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter { super.resetSkip(); Arrays.fill(lastSkipDoc, 0); Arrays.fill(lastSkipPayloadLength, -1); // we don't have to write the first length in the skip list + Arrays.fill(lastSkipOffsetLength, -1); // we don't have to write the first length in the skip list Arrays.fill(lastSkipFreqPointer, freqOutput.getFilePointer()); if (proxOutput != null) Arrays.fill(lastSkipProxPointer, proxOutput.getFilePointer()); @@ -83,47 +88,53 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter { @Override protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException { - // To efficiently store payloads in the posting lists we do not store the length of - // every payload. Instead we omit the length for a payload if the previous payload had - // the same length. - // However, in order to support skipping the payload length at every skip point must be known. + // To efficiently store payloads/offsets in the posting lists we do not store the length of + // every payload/offset. Instead we omit the length if the previous lengths were the same + // + // However, in order to support skipping, the length at every skip point must be known. // So we use the same length encoding that we use for the posting lists for the skip data as well: - // Case 1: current field does not store payloads + // Case 1: current field does not store payloads/offsets // SkipDatum --> DocSkip, FreqSkip, ProxSkip // DocSkip,FreqSkip,ProxSkip --> VInt // DocSkip records the document number before every SkipInterval th document in TermFreqs. // Document numbers are represented as differences from the previous value in the sequence. - // Case 2: current field stores payloads - // SkipDatum --> DocSkip, PayloadLength?, FreqSkip,ProxSkip + // Case 2: current field stores payloads/offsets + // SkipDatum --> DocSkip, PayloadLength?,OffsetLength?,FreqSkip,ProxSkip // DocSkip,FreqSkip,ProxSkip --> VInt - // PayloadLength --> VInt + // PayloadLength,OffsetLength--> VInt // In this case DocSkip/2 is the difference between // the current and the previous value. If DocSkip // is odd, then a PayloadLength encoded as VInt follows, // if DocSkip is even, then it is assumed that the - // current payload length equals the length at the previous + // current payload/offset lengths equals the lengths at the previous // skip point - if (curStorePayloads) { - int delta = curDoc - lastSkipDoc[level]; - if (curPayloadLength == lastSkipPayloadLength[level]) { - // the current payload length equals the length at the previous skip point, - // so we don't store the length again - skipBuffer.writeVInt(delta * 2); + int delta = curDoc - lastSkipDoc[level]; + + if (curStorePayloads || curStoreOffsets) { + assert curStorePayloads || curPayloadLength == lastSkipPayloadLength[level]; + assert curStoreOffsets || curOffsetLength == lastSkipOffsetLength[level]; + + if (curPayloadLength == lastSkipPayloadLength[level] && curOffsetLength == lastSkipOffsetLength[level]) { + // the current payload/offset lengths equals the lengths at the previous skip point, + // so we don't store the lengths again + skipBuffer.writeVInt(delta << 1); } else { - // the payload length is different from the previous one. We shift the DocSkip, - // set the lowest bit and store the current payload length as VInt. - skipBuffer.writeVInt(delta * 2 + 1); - skipBuffer.writeVInt(curPayloadLength); - lastSkipPayloadLength[level] = curPayloadLength; + // the payload and/or offset length is different from the previous one. We shift the DocSkip, + // set the lowest bit and store the current payload and/or offset lengths as VInts. + skipBuffer.writeVInt(delta << 1 | 1); + + if (curStorePayloads) { + skipBuffer.writeVInt(curPayloadLength); + lastSkipPayloadLength[level] = curPayloadLength; + } + if (curStoreOffsets) { + skipBuffer.writeVInt(curOffsetLength); + lastSkipOffsetLength[level] = curOffsetLength; + } } } else { - // current field does not store payloads - skipBuffer.writeVInt(curDoc - lastSkipDoc[level]); - } - - // TODO: not sure it really helps to shove this somewhere else if its the same as the last skip - if (curStoreOffsets) { - skipBuffer.writeVInt(curOffsetLength); + // current field does not store payloads or offsets + skipBuffer.writeVInt(delta); } skipBuffer.writeVInt((int) (curFreqPointer - lastSkipFreqPointer[level]));