diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 4bce4455a78..8dbe8649d65 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -108,6 +108,10 @@ Optimizations
* LUCENE-4497: Don't write PosVIntCount to the positions file in
Lucene41PostingsFormat, as its always totalTermFreq % BLOCK_SIZE. (Robert Muir)
+* LUCENE-4498: In Lucene41PostingsFormat, when a term appears in only one document,
+ instead of writing a file pointer to a VIntBlock containing the doc id, just
+ write the doc id. (Mike McCandless, Robert Muir)
+
Build
* LUCENE-4451: Memory leak per unique thread caused by
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java
index df87cb61080..ce916865d03 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java
@@ -127,10 +127,10 @@ import org.apache.lucene.util.packed.PackedInts;
*
*
* - Postings Metadata --> Header, PackedBlockSize
- * - Term Metadata --> DocFPDelta, PosFPDelta?, PosVIntBlockFPDelta?, PayFPDelta?,
+ * - Term Metadata --> (DocFPDelta|SingletonDocID), PosFPDelta?, PosVIntBlockFPDelta?, PayFPDelta?,
* SkipFPDelta?
* - Header, --> {@link CodecUtil#writeHeader CodecHeader}
- * - PackedBlockSize --> {@link DataOutput#writeVInt VInt}
+ * - PackedBlockSize, SingletonDocID --> {@link DataOutput#writeVInt VInt}
* - DocFPDelta, PosFPDelta, PayFPDelta, PosVIntBlockFPDelta, SkipFPDelta --> {@link DataOutput#writeVLong VLong}
*
* Notes:
@@ -162,6 +162,9 @@ import org.apache.lucene.util.packed.PackedInts;
* file. In particular, it is the length of the TermFreq data.
* SkipDelta is only stored if DocFreq is not smaller than SkipMinimum
* (i.e. 8 in Lucene41PostingsFormat).
+ * SingletonDocID is an optimization for a term that appears in only one document. In this case, instead
+ * of writing a file pointer to the .doc file (DocFPDelta) and then a VIntBlock at that location, the
+ * single document ID is written directly to the term dictionary.
*
*
*
@@ -277,7 +280,7 @@ import org.apache.lucene.util.packed.PackedInts;
* VIntBlock --> <PositionDelta[, PayloadLength?], PayloadData?,
* OffsetDelta?, OffsetLength?>PosVIntCount
* PackedPosDeltaBlock --> {@link PackedInts PackedInts}
- * PosVIntCount, PositionDelta, OffsetDelta, OffsetLength -->
+ * PositionDelta, OffsetDelta, OffsetLength -->
* {@link DataOutput#writeVInt VInt}
* PayloadData --> {@link DataOutput#writeByte byte}PayLength
*
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
index 5979ad75116..02540ef5195 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
@@ -148,6 +148,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
long payStartFP;
long skipOffset;
long lastPosBlockOffset;
+ // docid when there is a single pulsed posting, otherwise -1
+ // freq is always implicitly totalTermFreq in this case.
+ int singletonDocID;
// Only used by the "primary" TermState -- clones don't
// copy this (basically they are "transient"):
@@ -170,6 +173,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
payStartFP = other.payStartFP;
lastPosBlockOffset = other.lastPosBlockOffset;
skipOffset = other.skipOffset;
+ singletonDocID = other.singletonDocID;
// Do not copy bytes, bytesReader (else TermState is
// very heavy, ie drags around the entire block's
@@ -179,7 +183,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
@Override
public String toString() {
- return super.toString() + " docStartFP=" + docStartFP + " posStartFP=" + posStartFP + " payStartFP=" + payStartFP + " lastPosBlockOffset=" + lastPosBlockOffset;
+ return super.toString() + " docStartFP=" + docStartFP + " posStartFP=" + posStartFP + " payStartFP=" + payStartFP + " lastPosBlockOffset=" + lastPosBlockOffset + " singletonDocID=" + singletonDocID;
}
}
@@ -223,7 +227,13 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
final DataInput in = termState.bytesReader;
if (isFirstTerm) {
- termState.docStartFP = in.readVLong();
+ if (termState.docFreq == 1) {
+ termState.singletonDocID = in.readVInt();
+ termState.docStartFP = 0;
+ } else {
+ termState.singletonDocID = -1;
+ termState.docStartFP = in.readVLong();
+ }
if (fieldHasPositions) {
termState.posStartFP = in.readVLong();
if (termState.totalTermFreq > BLOCK_SIZE) {
@@ -238,7 +248,12 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
}
}
} else {
- termState.docStartFP += in.readVLong();
+ if (termState.docFreq == 1) {
+ termState.singletonDocID = in.readVInt();
+ } else {
+ termState.singletonDocID = -1;
+ termState.docStartFP += in.readVLong();
+ }
if (fieldHasPositions) {
termState.posStartFP += in.readVLong();
if (termState.totalTermFreq > BLOCK_SIZE) {
@@ -327,13 +342,14 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
final IndexInput startDocIn;
- final IndexInput docIn;
+ IndexInput docIn;
final boolean indexHasFreq;
final boolean indexHasPos;
final boolean indexHasOffsets;
final boolean indexHasPayloads;
private int docFreq; // number of docs in this posting list
+ private long totalTermFreq; // sum of freqs in this posting list (or docFreq when omitted)
private int docUpto; // how many docs we've read
private int doc; // doc we last read
private int accum; // accumulator for doc deltas
@@ -354,10 +370,11 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
private Bits liveDocs;
private boolean needsFreq; // true if the caller actually needs frequencies
+ private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
public BlockDocsEnum(FieldInfo fieldInfo) throws IOException {
this.startDocIn = Lucene41PostingsReader.this.docIn;
- this.docIn = startDocIn.clone();
+ this.docIn = null;
indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
indexHasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
@@ -378,9 +395,17 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
// System.out.println(" FPR.reset: termState=" + termState);
// }
docFreq = termState.docFreq;
+ totalTermFreq = indexHasFreq ? termState.totalTermFreq : docFreq;
docTermStartFP = termState.docStartFP;
- docIn.seek(docTermStartFP);
skipOffset = termState.skipOffset;
+ singletonDocID = termState.singletonDocID;
+ if (docFreq > 1) {
+ if (docIn == null) {
+ // lazy init
+ docIn = startDocIn.clone();
+ }
+ docIn.seek(docTermStartFP);
+ }
doc = -1;
this.needsFreq = (flags & DocsEnum.FLAG_FREQS) != 0;
@@ -425,6 +450,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
forUtil.skipBlock(docIn); // skip over freqs
}
}
+ } else if (docFreq == 1) {
+ docDeltaBuffer[0] = singletonDocID;
+ freqBuffer[0] = (int) totalTermFreq;
} else {
// Read vInts:
// if (DEBUG) {
@@ -590,7 +618,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
final IndexInput startDocIn;
- final IndexInput docIn;
+ IndexInput docIn;
final IndexInput posIn;
final boolean indexHasOffsets;
@@ -635,10 +663,11 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
private int nextSkipDoc;
private Bits liveDocs;
+ private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
public BlockDocsAndPositionsEnum(FieldInfo fieldInfo) throws IOException {
this.startDocIn = Lucene41PostingsReader.this.docIn;
- this.docIn = startDocIn.clone();
+ this.docIn = null;
this.posIn = Lucene41PostingsReader.this.posIn.clone();
encoded = new byte[MAX_ENCODED_SIZE];
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
@@ -660,9 +689,16 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
docTermStartFP = termState.docStartFP;
posTermStartFP = termState.posStartFP;
payTermStartFP = termState.payStartFP;
- docIn.seek(docTermStartFP);
skipOffset = termState.skipOffset;
totalTermFreq = termState.totalTermFreq;
+ singletonDocID = termState.singletonDocID;
+ if (docFreq > 1) {
+ if (docIn == null) {
+ // lazy init
+ docIn = startDocIn.clone();
+ }
+ docIn.seek(docTermStartFP);
+ }
posPendingFP = posTermStartFP;
posPendingCount = 0;
if (termState.totalTermFreq < BLOCK_SIZE) {
@@ -705,6 +741,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
// System.out.println(" fill freq block from fp=" + docIn.getFilePointer());
// }
forUtil.readBlock(docIn, encoded, freqBuffer);
+ } else if (docFreq == 1) {
+ docDeltaBuffer[0] = singletonDocID;
+ freqBuffer[0] = (int) totalTermFreq;
} else {
// Read vInts:
// if (DEBUG) {
@@ -1002,7 +1041,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
final IndexInput startDocIn;
- final IndexInput docIn;
+ IndexInput docIn;
final IndexInput posIn;
final IndexInput payIn;
final BytesRef payload;
@@ -1056,10 +1095,11 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
private boolean needsOffsets; // true if we actually need offsets
private boolean needsPayloads; // true if we actually need payloads
+ private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
public EverythingEnum(FieldInfo fieldInfo) throws IOException {
this.startDocIn = Lucene41PostingsReader.this.docIn;
- this.docIn = startDocIn.clone();
+ this.docIn = null;
this.posIn = Lucene41PostingsReader.this.posIn.clone();
this.payIn = Lucene41PostingsReader.this.payIn.clone();
encoded = new byte[MAX_ENCODED_SIZE];
@@ -1101,9 +1141,16 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
docTermStartFP = termState.docStartFP;
posTermStartFP = termState.posStartFP;
payTermStartFP = termState.payStartFP;
- docIn.seek(docTermStartFP);
skipOffset = termState.skipOffset;
totalTermFreq = termState.totalTermFreq;
+ singletonDocID = termState.singletonDocID;
+ if (docFreq > 1) {
+ if (docIn == null) {
+ // lazy init
+ docIn = startDocIn.clone();
+ }
+ docIn.seek(docTermStartFP);
+ }
posPendingFP = posTermStartFP;
payPendingFP = payTermStartFP;
posPendingCount = 0;
@@ -1150,6 +1197,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
// System.out.println(" fill freq block from fp=" + docIn.getFilePointer());
// }
forUtil.readBlock(docIn, encoded, freqBuffer);
+ } else if (docFreq == 1) {
+ docDeltaBuffer[0] = singletonDocID;
+ freqBuffer[0] = (int) totalTermFreq;
} else {
// if (DEBUG) {
// System.out.println(" fill last vInt doc block from fp=" + docIn.getFilePointer());
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java
index 8dd4cd10e4c..9e8728ff979 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java
@@ -354,13 +354,15 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
public final long payStartFP;
public final long skipOffset;
public final long lastPosBlockOffset;
+ public final int singletonDocID;
- public PendingTerm(long docStartFP, long posStartFP, long payStartFP, long skipOffset, long lastPosBlockOffset) {
+ public PendingTerm(long docStartFP, long posStartFP, long payStartFP, long skipOffset, long lastPosBlockOffset, int singletonDocID) {
this.docStartFP = docStartFP;
this.posStartFP = posStartFP;
this.payStartFP = payStartFP;
this.skipOffset = skipOffset;
this.lastPosBlockOffset = lastPosBlockOffset;
+ this.singletonDocID = singletonDocID;
}
}
@@ -384,18 +386,26 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
// System.out.println(" write doc/freq vInt block (count=" + docBufferUpto + ") at fp=" + docOut.getFilePointer() + " docTermStartFP=" + docTermStartFP);
// }
// }
-
- // vInt encode the remaining doc deltas and freqs:
- for(int i=0;i