From 733654068a5e46db5e89613a7744b43eb14f412a Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Mon, 22 Oct 2012 15:27:04 +0000
Subject: [PATCH] LUCENE-4496: don't decode unnecessary blocks in 4.1 codec

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1400915 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                                 |  15 +--
 .../lucene41/Lucene41PostingsReader.java           |  58 +++++++---
 .../lucene41/TestBlockPostingsFormat3.java         | 102 +++++++++---------
 3 files changed, 102 insertions(+), 73 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index ba1315f9ecc..b94a00d9a5a 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -67,8 +67,8 @@ Bug Fixes
 * LUCENE-1822: BaseFragListBuilder hard-coded 6 char margin is too naive.
   (Alex Vigdor, Arcadius Ahouansou, Koji Sekiguchi)
 
-* LUCENE-4468: Fix rareish integer overflows in Block and Lucene40 postings
-  formats (Robert Muir)
+* LUCENE-4468: Fix rareish integer overflows in Lucene41 postings
+  format. (Robert Muir)
 
 * LUCENE-4486: Add support for ConstantScoreQuery in Highlighter.
   (Simon Willnauer)
@@ -85,16 +85,15 @@ Bug Fixes
 
 Optimizations
 
-* LUCENE-4443: BlockPostingsFormat no longer writes unnecessary offsets
-  into the skipdata. You need to reindex any indexes created with
-  this experimental codec. (Robert Muir)
+* LUCENE-4443: Lucene41PostingsFormat no longer writes unnecessary offsets
+  into the skipdata. (Robert Muir)
 
 * LUCENE-4459: Improve WeakIdentityMap.keyIterator() to remove GCed keys
   from backing map early instead of waiting for reap(). This makes test
   failures in TestWeakIdentityMap disappear, too.
   (Uwe Schindler, Mike McCandless, Robert Muir)
 
-* LUCENE-4473: BlockPostingsFormat encodes offsets more efficiently
+* LUCENE-4473: Lucene41PostingsFormat encodes offsets more efficiently
   for low frequency terms (< 128 occurrences). (Robert Muir)
 
 * LUCENE-4462: DocumentsWriter now flushes deletes, segment infos and builds
@@ -102,6 +101,10 @@ Optimizations
   was a single threaded process while now all IO and CPU heavy computation is
   done concurrently in DocumentsWriterPerThread. (Simon Willnauer)
 
+* LUCENE-4496: Optimize Lucene41PostingsFormat when requesting a subset of
+  the postings data (via flags to TermsEnum.docs/docsAndPositions) to use
+  ForUtil.skipBlock. (Robert Muir)
+
 Build
 
 * LUCENE-4451: Memory leak per unique thread caused by
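
For context, the flags named in the LUCENE-4496 entry are the ones a consumer already passes when pulling postings from a TermsEnum; this commit makes Lucene41PostingsReader act on them. A minimal sketch of the consumer side, assuming the Lucene 4.x enum API (termsEnum and liveDocs stand in for real instances):

    // Docids only: the reader may now skipBlock() over encoded freq blocks.
    DocsEnum docs = termsEnum.docs(liveDocs, null, 0); // 0 = no optional data requested

    // Positions plus payloads, but no offsets: offset blocks can be skipped.
    DocsAndPositionsEnum postings =
        termsEnum.docsAndPositions(liveDocs, null, DocsAndPositionsEnum.FLAG_PAYLOADS);

The flags describe what the caller will actually consume: data that was not requested may never be decoded, so for example freq() is only meaningful when DocsEnum.FLAG_FREQS was passed.
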
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
index 6292b18e6a7..8c52f2723e4 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
@@ -275,10 +275,10 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
     } else {
       docsEnum = new BlockDocsEnum(fieldInfo);
     }
-    return docsEnum.reset(liveDocs, (IntBlockTermState) termState);
+    return docsEnum.reset(liveDocs, (IntBlockTermState) termState, flags);
   }
 
-  // TODO: specialize to liveDocs vs not, and freqs vs not
+  // TODO: specialize to liveDocs vs not
   @Override
   public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs,
@@ -310,7 +310,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
     } else {
       everythingEnum = new EverythingEnum(fieldInfo);
     }
-    return everythingEnum.reset(liveDocs, (IntBlockTermState) termState);
+    return everythingEnum.reset(liveDocs, (IntBlockTermState) termState, flags);
   }
 }
@@ -352,6 +352,8 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
     private int nextSkipDoc;
 
     private Bits liveDocs;
+
+    private boolean needsFreq; // true if the caller actually needs frequencies
 
     public BlockDocsEnum(FieldInfo fieldInfo) throws IOException {
       this.startDocIn = Lucene41PostingsReader.this.docIn;
@@ -370,7 +372,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
         indexHasPayloads == fieldInfo.hasPayloads();
     }
 
-    public DocsEnum reset(Bits liveDocs, IntBlockTermState termState) throws IOException {
+    public DocsEnum reset(Bits liveDocs, IntBlockTermState termState, int flags) throws IOException {
       this.liveDocs = liveDocs;
       // if (DEBUG) {
       //   System.out.println("  FPR.reset: termState=" + termState);
@@ -381,6 +383,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
       skipOffset = termState.skipOffset;
 
       doc = -1;
+      this.needsFreq = (flags & DocsEnum.FLAG_FREQS) != 0;
       if (!indexHasFreq) {
         Arrays.fill(freqBuffer, 1);
       }
@@ -416,7 +419,11 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
         // if (DEBUG) {
         //   System.out.println("    fill freq block from fp=" + docIn.getFilePointer());
         // }
-        forUtil.readBlock(docIn, encoded, freqBuffer);
+        if (needsFreq) {
+          forUtil.readBlock(docIn, encoded, freqBuffer);
+        } else {
+          forUtil.skipBlock(docIn); // skip over freqs
+        }
       }
     } else {
       // Read vInts:
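
The needsFreq branch above is cheap to take because every ForUtil block is self-describing: a one-byte header records the bits per value, and the encoded length of the 128-value block follows from that. A simplified sketch of what a skipBlock-style routine does (illustrative only; ALL_VALUES_EQUAL and encodedSize() are stand-ins for ForUtil internals, and the real implementation keeps a precomputed size table):

    // Sketch: skip one encoded block without decoding it.
    void skipBlock(IndexInput in) throws IOException {
      final int numBits = in.readByte();  // per-block header: bits per value
      if (numBits == ALL_VALUES_EQUAL) {  // degenerate block: a single vInt holds all values
        in.readVInt();
        return;
      }
      in.seek(in.getFilePointer() + encodedSize(numBits)); // jump past the packed data
    }

Skipping a block therefore costs one byte read and one seek, instead of decoding 128 packed integers.
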
@@ -1044,6 +1051,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
 
     private Bits liveDocs;
 
+    private boolean needsOffsets; // true if we actually need offsets
+    private boolean needsPayloads; // true if we actually need payloads
+
     public EverythingEnum(FieldInfo fieldInfo) throws IOException {
       this.startDocIn = Lucene41PostingsReader.this.docIn;
       this.docIn = startDocIn.clone();
@@ -1079,7 +1089,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
         indexHasPayloads == fieldInfo.hasPayloads();
     }
 
-    public EverythingEnum reset(Bits liveDocs, IntBlockTermState termState) throws IOException {
+    public EverythingEnum reset(Bits liveDocs, IntBlockTermState termState, int flags) throws IOException {
       this.liveDocs = liveDocs;
       // if (DEBUG) {
       //   System.out.println("  FPR.reset: termState=" + termState);
@@ -1101,6 +1111,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
         lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset;
       }
 
+      this.needsOffsets = (flags & DocsAndPositionsEnum.FLAG_OFFSETS) != 0;
+      this.needsPayloads = (flags & DocsAndPositionsEnum.FLAG_PAYLOADS) != 0;
+
       doc = -1;
       accum = 0;
       docUpto = 0;
@@ -1203,15 +1216,22 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
           // if (DEBUG) {
           //   System.out.println("        bulk payload block @ pay.fp=" + payIn.getFilePointer());
           // }
-          forUtil.readBlock(payIn, encoded, payloadLengthBuffer);
-          int numBytes = payIn.readVInt();
-          // if (DEBUG) {
-          //   System.out.println("        " + numBytes + " payload bytes @ pay.fp=" + payIn.getFilePointer());
-          // }
-          if (numBytes > payloadBytes.length) {
-            payloadBytes = ArrayUtil.grow(payloadBytes, numBytes);
+          if (needsPayloads) {
+            forUtil.readBlock(payIn, encoded, payloadLengthBuffer);
+            int numBytes = payIn.readVInt();
+            // if (DEBUG) {
+            //   System.out.println("        " + numBytes + " payload bytes @ pay.fp=" + payIn.getFilePointer());
+            // }
+            if (numBytes > payloadBytes.length) {
+              payloadBytes = ArrayUtil.grow(payloadBytes, numBytes);
+            }
+            payIn.readBytes(payloadBytes, 0, numBytes);
+          } else {
+            // this works, because when writing a vint block we always force the first length to be written
+            forUtil.skipBlock(payIn); // skip over lengths
+            int numBytes = payIn.readVInt(); // read length of payloadBytes
+            payIn.seek(payIn.getFilePointer() + numBytes); // skip over payloadBytes
           }
-          payIn.readBytes(payloadBytes, 0, numBytes);
 
           payloadByteUpto = 0;
         }
@@ -1219,8 +1239,14 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
           // if (DEBUG) {
          //   System.out.println("        bulk offset block @ pay.fp=" + payIn.getFilePointer());
           // }
-          forUtil.readBlock(payIn, encoded, offsetStartDeltaBuffer);
-          forUtil.readBlock(payIn, encoded, offsetLengthBuffer);
+          if (needsOffsets) {
+            forUtil.readBlock(payIn, encoded, offsetStartDeltaBuffer);
+            forUtil.readBlock(payIn, encoded, offsetLengthBuffer);
+          } else {
+            // this works, because when writing a vint block we always force the first length to be written
+            forUtil.skipBlock(payIn); // skip over starts
+            forUtil.skipBlock(payIn); // skip over lengths
+          }
         }
       }
     }
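
The two "this works" comments rely on a writer invariant: after the bit-packed block of payload lengths, the total byte size of the block's concatenated payloads is always written as a vInt. That is what lets the else branches navigate the .pay file without decoding anything. The per-block layout, as implied by the reader code above (the Lucene41PostingsFormat javadocs are the authoritative format description):

    // .pay file, per block of 128 positions:
    //   payload lengths   ForUtil block        -> skipBlock() if !needsPayloads
    //   numBytes          vInt                 -> always read
    //   payload bytes     numBytes raw bytes   -> seek() past if !needsPayloads
    //   offset starts     ForUtil block        -> skipBlock() if !needsOffsets
    //   offset lengths    ForUtil block        -> skipBlock() if !needsOffsets

So an EverythingEnum created without FLAG_PAYLOADS pays one skipBlock, one vInt read, and one seek per block, instead of decoding 128 lengths and copying the raw bytes.
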
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat3.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat3.java
index 34bd00789d3..b8bdfab5886 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat3.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat3.java
@@ -29,7 +29,6 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.MockVariableLengthPayloadFilter;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.lucene41.Lucene41Codec;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -61,12 +60,12 @@ import org.apache.lucene.util.automaton.CompiledAutomaton;
 import org.apache.lucene.util.automaton.RegExp;
 
 /**
- * Tests partial enumeration (only pulling a subset of the prox data)
+ * Tests partial enumeration (only pulling a subset of the indexed data)
  */
 public class TestBlockPostingsFormat3 extends LuceneTestCase {
   static final int MAXDOC = Lucene41PostingsFormat.BLOCK_SIZE * 20;
 
-  // creates 6 fields with different options and does "duels" of fields against each other
+  // creates 8 fields with different options and does "duels" of fields against each other
   public void test() throws Exception {
     Directory dir = newDirectory();
     Analyzer analyzer = new Analyzer(new Analyzer.PerFieldReuseStrategy()) {
@@ -85,35 +84,45 @@ public class TestBlockPostingsFormat3 extends LuceneTestCase {
       }
     };
     IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
-    iwc.setCodec(new Lucene41Codec() {
-      @Override
-      public PostingsFormat getPostingsFormatForField(String field) {
-        return PostingsFormat.forName("Lucene41");
-        // TODO: we could actually add more fields implemented with different PFs
-      }
-    });
+    iwc.setCodec(new Lucene41Codec());
+    // TODO we could actually add more fields implemented with different PFs
+    // or, just put this test into the usual rotation?
     RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
     Document doc = new Document();
-    FieldType bareType = new FieldType(TextField.TYPE_NOT_STORED);
+    FieldType docsOnlyType = new FieldType(TextField.TYPE_NOT_STORED);
+    // turn this on for a cross-check
+    docsOnlyType.setStoreTermVectors(true);
+    docsOnlyType.setIndexOptions(IndexOptions.DOCS_ONLY);
+
+    FieldType docsAndFreqsType = new FieldType(TextField.TYPE_NOT_STORED);
+    // turn this on for a cross-check
+    docsAndFreqsType.setStoreTermVectors(true);
+    docsAndFreqsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+
+    FieldType positionsType = new FieldType(TextField.TYPE_NOT_STORED);
     // turn these on for a cross-check
-    bareType.setStoreTermVectors(true);
-    bareType.setStoreTermVectorPositions(true);
-    bareType.setStoreTermVectorOffsets(true);
-    bareType.setStoreTermVectorPayloads(true);
-    FieldType offsetsType = new FieldType(bareType);
+    positionsType.setStoreTermVectors(true);
+    positionsType.setStoreTermVectorPositions(true);
+    positionsType.setStoreTermVectorOffsets(true);
+    positionsType.setStoreTermVectorPayloads(true);
+    FieldType offsetsType = new FieldType(positionsType);
     offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
-    Field field1 = new Field("field1bare", "", bareType);
-    Field field2 = new Field("field2offsets", "", offsetsType);
-    Field field3 = new Field("field3payloadsFixed", "", bareType);
-    Field field4 = new Field("field4payloadsVariable", "", bareType);
-    Field field5 = new Field("field5payloadsFixedOffsets", "", offsetsType);
-    Field field6 = new Field("field6payloadsVariableOffsets", "", offsetsType);
+    Field field1 = new Field("field1docs", "", docsOnlyType);
+    Field field2 = new Field("field2freqs", "", docsAndFreqsType);
+    Field field3 = new Field("field3positions", "", positionsType);
+    Field field4 = new Field("field4offsets", "", offsetsType);
+    Field field5 = new Field("field5payloadsFixed", "", positionsType);
+    Field field6 = new Field("field6payloadsVariable", "", positionsType);
+    Field field7 = new Field("field7payloadsFixedOffsets", "", offsetsType);
+    Field field8 = new Field("field8payloadsVariableOffsets", "", offsetsType);
     doc.add(field1);
     doc.add(field2);
     doc.add(field3);
     doc.add(field4);
     doc.add(field5);
     doc.add(field6);
+    doc.add(field7);
+    doc.add(field8);
     for (int i = 0; i < MAXDOC; i++) {
       String stringValue = Integer.toString(i) + " verycommon " + English.intToEnglish(i).replace('-', ' ') + " " + _TestUtil.randomSimpleString(random());
       field1.setStringValue(stringValue);
@@ -122,6 +131,8 @@ public class TestBlockPostingsFormat3 extends LuceneTestCase {
       field4.setStringValue(stringValue);
       field5.setStringValue(stringValue);
       field6.setStringValue(stringValue);
+      field7.setStringValue(stringValue);
+      field8.setStringValue(stringValue);
       iw.addDocument(doc);
     }
     iw.close();
@@ -139,11 +150,12 @@ public class TestBlockPostingsFormat3 extends LuceneTestCase {
     DirectoryReader ir = DirectoryReader.open(dir);
     for (AtomicReaderContext leaf : ir.leaves()) {
       AtomicReader leafReader = leaf.reader();
-      assertTerms(leafReader.terms("field1bare"), leafReader.terms("field2offsets"), true);
-      assertTerms(leafReader.terms("field2offsets"), leafReader.terms("field3payloadsFixed"), true);
-      assertTerms(leafReader.terms("field3payloadsFixed"), leafReader.terms("field4payloadsVariable"), true);
-      assertTerms(leafReader.terms("field4payloadsVariable"), leafReader.terms("field5payloadsFixedOffsets"), true);
-      assertTerms(leafReader.terms("field5payloadsFixedOffsets"), leafReader.terms("field6payloadsVariableOffsets"), true);
+      assertTerms(leafReader.terms("field1docs"), leafReader.terms("field2freqs"), true);
+      assertTerms(leafReader.terms("field3positions"), leafReader.terms("field4offsets"), true);
+      assertTerms(leafReader.terms("field4offsets"), leafReader.terms("field5payloadsFixed"), true);
+      assertTerms(leafReader.terms("field5payloadsFixed"), leafReader.terms("field6payloadsVariable"), true);
+      assertTerms(leafReader.terms("field6payloadsVariable"), leafReader.terms("field7payloadsFixedOffsets"), true);
+      assertTerms(leafReader.terms("field7payloadsFixedOffsets"), leafReader.terms("field8payloadsVariableOffsets"), true);
     }
     ir.close();
   }
@@ -334,39 +346,31 @@ public class TestBlockPostingsFormat3 extends LuceneTestCase {
 
         // with freqs:
         assertDocsEnum(leftDocs = leftTermsEnum.docs(null, leftDocs),
-                       rightDocs = rightTermsEnum.docs(null, rightDocs),
-                       true);
+                       rightDocs = rightTermsEnum.docs(null, rightDocs));
         assertDocsEnum(leftDocs = leftTermsEnum.docs(randomBits, leftDocs),
-                       rightDocs = rightTermsEnum.docs(randomBits, rightDocs),
-                       true);
+                       rightDocs = rightTermsEnum.docs(randomBits, rightDocs));
 
         // w/o freqs:
         assertDocsEnum(leftDocs = leftTermsEnum.docs(null, leftDocs, 0),
-                       rightDocs = rightTermsEnum.docs(null, rightDocs, 0),
-                       false);
+                       rightDocs = rightTermsEnum.docs(null, rightDocs, 0));
         assertDocsEnum(leftDocs = leftTermsEnum.docs(randomBits, leftDocs, 0),
-                       rightDocs = rightTermsEnum.docs(randomBits, rightDocs, 0),
-                       false);
+                       rightDocs = rightTermsEnum.docs(randomBits, rightDocs, 0));
 
         // with freqs:
         assertDocsSkipping(leftTermsEnum.docFreq(),
                            leftDocs = leftTermsEnum.docs(null, leftDocs),
-                           rightDocs = rightTermsEnum.docs(null, rightDocs),
-                           true);
+                           rightDocs = rightTermsEnum.docs(null, rightDocs));
         assertDocsSkipping(leftTermsEnum.docFreq(),
                            leftDocs = leftTermsEnum.docs(randomBits, leftDocs),
-                           rightDocs = rightTermsEnum.docs(randomBits, rightDocs),
-                           true);
+                           rightDocs = rightTermsEnum.docs(randomBits, rightDocs));
 
         // w/o freqs:
         assertDocsSkipping(leftTermsEnum.docFreq(),
                            leftDocs = leftTermsEnum.docs(null, leftDocs, 0),
-                           rightDocs = rightTermsEnum.docs(null, rightDocs, 0),
-                           false);
+                           rightDocs = rightTermsEnum.docs(null, rightDocs, 0));
         assertDocsSkipping(leftTermsEnum.docFreq(),
                            leftDocs = leftTermsEnum.docs(randomBits, leftDocs, 0),
-                           rightDocs = rightTermsEnum.docs(randomBits, rightDocs, 0),
-                           false);
+                           rightDocs = rightTermsEnum.docs(randomBits, rightDocs, 0));
       }
     }
     assertNull(rightTermsEnum.next());
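
Each assertTerms call above duels two fields that were fed identical text, checking that everything representable in both agrees. A condensed sketch of the lockstep walk the test performs underneath (names assumed; the real assertTermsEnum also duels seeking, skipping, and position data):

    TermsEnum leftEnum = leftTerms.iterator(null);
    TermsEnum rightEnum = rightTerms.iterator(null);
    BytesRef term;
    while ((term = leftEnum.next()) != null) {
      assertEquals(term, rightEnum.next());
      // ... then compare postings for this term, with and without optional flags
    }
    assertNull(rightEnum.next());
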
@@ -409,7 +413,7 @@ public class TestBlockPostingsFormat3 extends LuceneTestCase {
   /**
    * checks docs + freqs, sequentially
    */
-  public void assertDocsEnum(DocsEnum leftDocs, DocsEnum rightDocs, boolean hasFreqs) throws Exception {
+  public void assertDocsEnum(DocsEnum leftDocs, DocsEnum rightDocs) throws Exception {
     if (leftDocs == null) {
       assertNull(rightDocs);
       return;
@@ -419,9 +423,7 @@ public class TestBlockPostingsFormat3 extends LuceneTestCase {
     int docid;
     while ((docid = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
       assertEquals(docid, rightDocs.nextDoc());
-      if (hasFreqs) {
-        assertEquals(leftDocs.freq(), rightDocs.freq());
-      }
+      // we don't assert freqs, they are allowed to be different
     }
     assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc());
   }
@@ -429,7 +431,7 @@ public class TestBlockPostingsFormat3 extends LuceneTestCase {
   /**
    * checks advancing docs
    */
-  public void assertDocsSkipping(int docFreq, DocsEnum leftDocs, DocsEnum rightDocs, boolean hasFreqs) throws Exception {
+  public void assertDocsSkipping(int docFreq, DocsEnum leftDocs, DocsEnum rightDocs) throws Exception {
     if (leftDocs == null) {
       assertNull(rightDocs);
       return;
@@ -453,9 +455,7 @@ public class TestBlockPostingsFormat3 extends LuceneTestCase {
       if (docid == DocIdSetIterator.NO_MORE_DOCS) {
        return;
       }
-      if (hasFreqs) {
-        assertEquals(leftDocs.freq(), rightDocs.freq());
-      }
+      // we don't assert freqs, they are allowed to be different
     }
   }
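
The dropped freq() assertions follow directly from the optimization: a DocsEnum obtained without DocsEnum.FLAG_FREQS may now leave the freq block undecoded, so its freq() value is unspecified rather than guaranteed to match an enum that did request freqs. A hedged sketch of the resulting consumer contract:

    DocsEnum docs = termsEnum.docs(null, null, 0); // no FLAG_FREQS requested
    int doc;
    while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      // doc is valid here, but docs.freq() is unspecified:
      // pass DocsEnum.FLAG_FREQS if the value matters.
    }
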