From 2c462fa3a6f1321d92b0817d7c3cd0e1278e7d00 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 23 Oct 2012 13:10:09 +0000 Subject: [PATCH] LUCENE-4498: pulse docFreq=1 in 4.1 codec git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1401284 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 4 + .../lucene41/Lucene41PostingsFormat.java | 9 ++- .../lucene41/Lucene41PostingsReader.java | 74 ++++++++++++++++--- .../lucene41/Lucene41PostingsWriter.java | 46 ++++++++---- 4 files changed, 102 insertions(+), 31 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 4bce4455a78..8dbe8649d65 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -108,6 +108,10 @@ Optimizations * LUCENE-4497: Don't write PosVIntCount to the positions file in Lucene41PostingsFormat, as its always totalTermFreq % BLOCK_SIZE. (Robert Muir) +* LUCENE-4498: In Lucene41PostingsFormat, when a term appears in only one document, + Instead of writing a file pointer to a VIntBlock containing the doc id, just + write the doc id. (Mike McCandless, Robert Muir) + Build * LUCENE-4451: Memory leak per unique thread caused by diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java index df87cb61080..ce916865d03 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java @@ -127,10 +127,10 @@ import org.apache.lucene.util.packed.PackedInts; * * *

Notes:

@@ -162,6 +162,9 @@ import org.apache.lucene.util.packed.PackedInts; * file. In particular, it is the length of the TermFreq data. * SkipDelta is only stored if DocFreq is not smaller than SkipMinimum * (i.e. 8 in Lucene41PostingsFormat). + *
  • SingletonDocID is an optimization when a term only appears in one document. In this case, instead + * of writing a file pointer to the .doc file (DocFPDelta), and then a VIntBlock at that location, the + * single document ID is written to the term dictionary.
  • * * * @@ -277,7 +280,7 @@ import org.apache.lucene.util.packed.PackedInts; *
  • VIntBlock --> <PositionDelta[, PayloadLength?], PayloadData?, * OffsetDelta?, OffsetLength?>PosVIntCount *
  • PackedPosDeltaBlock --> {@link PackedInts PackedInts}
  • - *
  • PosVIntCount, PositionDelta, OffsetDelta, OffsetLength --> + *
  • PositionDelta, OffsetDelta, OffsetLength --> * {@link DataOutput#writeVInt VInt}
  • *
  • PayloadData --> {@link DataOutput#writeByte byte}PayLength
  • * diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java index 5979ad75116..02540ef5195 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java @@ -148,6 +148,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { long payStartFP; long skipOffset; long lastPosBlockOffset; + // docid when there is a single pulsed posting, otherwise -1 + // freq is always implicitly totalTermFreq in this case. + int singletonDocID; // Only used by the "primary" TermState -- clones don't // copy this (basically they are "transient"): @@ -170,6 +173,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { payStartFP = other.payStartFP; lastPosBlockOffset = other.lastPosBlockOffset; skipOffset = other.skipOffset; + singletonDocID = other.singletonDocID; // Do not copy bytes, bytesReader (else TermState is // very heavy, ie drags around the entire block's @@ -179,7 +183,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { @Override public String toString() { - return super.toString() + " docStartFP=" + docStartFP + " posStartFP=" + posStartFP + " payStartFP=" + payStartFP + " lastPosBlockOffset=" + lastPosBlockOffset; + return super.toString() + " docStartFP=" + docStartFP + " posStartFP=" + posStartFP + " payStartFP=" + payStartFP + " lastPosBlockOffset=" + lastPosBlockOffset + " singletonDocID=" + singletonDocID; } } @@ -223,7 +227,13 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { final DataInput in = termState.bytesReader; if (isFirstTerm) { - termState.docStartFP = in.readVLong(); + if (termState.docFreq == 1) { + termState.singletonDocID = in.readVInt(); + termState.docStartFP = 0; + } else { + termState.singletonDocID = -1; + termState.docStartFP = in.readVLong(); + } if (fieldHasPositions) { termState.posStartFP = in.readVLong(); if (termState.totalTermFreq > BLOCK_SIZE) { @@ -238,7 +248,12 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { } } } else { - termState.docStartFP += in.readVLong(); + if (termState.docFreq == 1) { + termState.singletonDocID = in.readVInt(); + } else { + termState.singletonDocID = -1; + termState.docStartFP += in.readVLong(); + } if (fieldHasPositions) { termState.posStartFP += in.readVLong(); if (termState.totalTermFreq > BLOCK_SIZE) { @@ -327,13 +342,14 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { final IndexInput startDocIn; - final IndexInput docIn; + IndexInput docIn; final boolean indexHasFreq; final boolean indexHasPos; final boolean indexHasOffsets; final boolean indexHasPayloads; private int docFreq; // number of docs in this posting list + private long totalTermFreq; // sum of freqs in this posting list (or docFreq when omitted) private int docUpto; // how many docs we've read private int doc; // doc we last read private int accum; // accumulator for doc deltas @@ -354,10 +370,11 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { private Bits liveDocs; private boolean needsFreq; // true if the caller actually needs frequencies + private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 public BlockDocsEnum(FieldInfo fieldInfo) throws IOException { this.startDocIn = Lucene41PostingsReader.this.docIn; - this.docIn = startDocIn.clone(); + this.docIn = null; indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; indexHasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; @@ -378,9 +395,17 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { // System.out.println(" FPR.reset: termState=" + termState); // } docFreq = termState.docFreq; + totalTermFreq = indexHasFreq ? termState.totalTermFreq : docFreq; docTermStartFP = termState.docStartFP; - docIn.seek(docTermStartFP); skipOffset = termState.skipOffset; + singletonDocID = termState.singletonDocID; + if (docFreq > 1) { + if (docIn == null) { + // lazy init + docIn = startDocIn.clone(); + } + docIn.seek(docTermStartFP); + } doc = -1; this.needsFreq = (flags & DocsEnum.FLAG_FREQS) != 0; @@ -425,6 +450,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { forUtil.skipBlock(docIn); // skip over freqs } } + } else if (docFreq == 1) { + docDeltaBuffer[0] = singletonDocID; + freqBuffer[0] = (int) totalTermFreq; } else { // Read vInts: // if (DEBUG) { @@ -590,7 +618,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { final IndexInput startDocIn; - final IndexInput docIn; + IndexInput docIn; final IndexInput posIn; final boolean indexHasOffsets; @@ -635,10 +663,11 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { private int nextSkipDoc; private Bits liveDocs; + private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 public BlockDocsAndPositionsEnum(FieldInfo fieldInfo) throws IOException { this.startDocIn = Lucene41PostingsReader.this.docIn; - this.docIn = startDocIn.clone(); + this.docIn = null; this.posIn = Lucene41PostingsReader.this.posIn.clone(); encoded = new byte[MAX_ENCODED_SIZE]; indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; @@ -660,9 +689,16 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { docTermStartFP = termState.docStartFP; posTermStartFP = termState.posStartFP; payTermStartFP = termState.payStartFP; - docIn.seek(docTermStartFP); skipOffset = termState.skipOffset; totalTermFreq = termState.totalTermFreq; + singletonDocID = termState.singletonDocID; + if (docFreq > 1) { + if (docIn == null) { + // lazy init + docIn = startDocIn.clone(); + } + docIn.seek(docTermStartFP); + } posPendingFP = posTermStartFP; posPendingCount = 0; if (termState.totalTermFreq < BLOCK_SIZE) { @@ -705,6 +741,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { // System.out.println(" fill freq block from fp=" + docIn.getFilePointer()); // } forUtil.readBlock(docIn, encoded, freqBuffer); + } else if (docFreq == 1) { + docDeltaBuffer[0] = singletonDocID; + freqBuffer[0] = (int) totalTermFreq; } else { // Read vInts: // if (DEBUG) { @@ -1002,7 +1041,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { final IndexInput startDocIn; - final IndexInput docIn; + IndexInput docIn; final IndexInput posIn; final IndexInput payIn; final BytesRef payload; @@ -1056,10 +1095,11 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { private boolean needsOffsets; // true if we actually need offsets private boolean needsPayloads; // true if we actually need payloads + private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 public EverythingEnum(FieldInfo fieldInfo) throws IOException { this.startDocIn = Lucene41PostingsReader.this.docIn; - this.docIn = startDocIn.clone(); + this.docIn = null; this.posIn = Lucene41PostingsReader.this.posIn.clone(); this.payIn = Lucene41PostingsReader.this.payIn.clone(); encoded = new byte[MAX_ENCODED_SIZE]; @@ -1101,9 +1141,16 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { docTermStartFP = termState.docStartFP; posTermStartFP = termState.posStartFP; payTermStartFP = termState.payStartFP; - docIn.seek(docTermStartFP); skipOffset = termState.skipOffset; totalTermFreq = termState.totalTermFreq; + singletonDocID = termState.singletonDocID; + if (docFreq > 1) { + if (docIn == null) { + // lazy init + docIn = startDocIn.clone(); + } + docIn.seek(docTermStartFP); + } posPendingFP = posTermStartFP; payPendingFP = payTermStartFP; posPendingCount = 0; @@ -1150,6 +1197,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { // System.out.println(" fill freq block from fp=" + docIn.getFilePointer()); // } forUtil.readBlock(docIn, encoded, freqBuffer); + } else if (docFreq == 1) { + docDeltaBuffer[0] = singletonDocID; + freqBuffer[0] = (int) totalTermFreq; } else { // if (DEBUG) { // System.out.println(" fill last vInt doc block from fp=" + docIn.getFilePointer()); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java index 8dd4cd10e4c..9e8728ff979 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java @@ -354,13 +354,15 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase { public final long payStartFP; public final long skipOffset; public final long lastPosBlockOffset; + public final int singletonDocID; - public PendingTerm(long docStartFP, long posStartFP, long payStartFP, long skipOffset, long lastPosBlockOffset) { + public PendingTerm(long docStartFP, long posStartFP, long payStartFP, long skipOffset, long lastPosBlockOffset, int singletonDocID) { this.docStartFP = docStartFP; this.posStartFP = posStartFP; this.payStartFP = payStartFP; this.skipOffset = skipOffset; this.lastPosBlockOffset = lastPosBlockOffset; + this.singletonDocID = singletonDocID; } } @@ -384,18 +386,26 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase { // System.out.println(" write doc/freq vInt block (count=" + docBufferUpto + ") at fp=" + docOut.getFilePointer() + " docTermStartFP=" + docTermStartFP); // } // } - - // vInt encode the remaining doc deltas and freqs: - for(int i=0;i