LUCENE-4496: don't decode unnecessary blocks in 4.1 codec

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1400915 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-10-22 15:27:04 +00:00
parent dc49949558
commit 733654068a
3 changed files with 102 additions and 73 deletions

lucene/CHANGES.txt

@@ -67,8 +67,8 @@ Bug Fixes
 * LUCENE-1822: BaseFragListBuilder hard-coded 6 char margin is too naive.
   (Alex Vigdor, Arcadius Ahouansou, Koji Sekiguchi)
 
-* LUCENE-4468: Fix rareish integer overflows in Block and Lucene40 postings
-  formats (Robert Muir)
+* LUCENE-4468: Fix rareish integer overflows in Lucene41 postings
+  format. (Robert Muir)
 
 * LUCENE-4486: Add support for ConstantScoreQuery in Highlighter.
   (Simon Willnauer)
@@ -85,16 +85,15 @@ Bug Fixes
 
 Optimizations
 
-* LUCENE-4443: BlockPostingsFormat no longer writes unnecessary offsets
-  into the skipdata. You need to reindex any indexes created with
-  this experimental codec. (Robert Muir)
+* LUCENE-4443: Lucene41PostingsFormat no longer writes unnecessary offsets
+  into the skipdata. (Robert Muir)
 
 * LUCENE-4459: Improve WeakIdentityMap.keyIterator() to remove GCed keys
   from backing map early instead of waiting for reap(). This makes test
   failures in TestWeakIdentityMap disappear, too.
   (Uwe Schindler, Mike McCandless, Robert Muir)
 
-* LUCENE-4473: BlockPostingsFormat encodes offsets more efficiently
+* LUCENE-4473: Lucene41PostingsFormat encodes offsets more efficiently
   for low frequency terms (< 128 occurrences). (Robert Muir)
 
 * LUCENE-4462: DocumentsWriter now flushes deletes, segment infos and builds
@@ -102,6 +101,10 @@ Optimizations
   was a single threaded process while now all IO and CPU heavy computation is done
   concurrently in DocumentsWriterPerThread. (Simon Willnauer)
 
+* LUCENE-4496: Optimize Lucene41PostingsFormat when requesting a subset of
+  the postings data (via flags to TermsEnum.docs/docsAndPositions) to use
+  ForUtil.skipBlock. (Robert Muir)
+
 Build
 
 * LUCENE-4451: Memory leak per unique thread caused by
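
Note: for readers unfamiliar with the flags API this entry refers to, the sketch
below shows the caller side. It is a minimal illustration against the Lucene 4.1
API; the index reader and the "body" field name are assumptions, not part of
this commit. Passing 0 instead of DocsEnum.FLAG_FREQS declares that freq() will
never be called, which is what allows the reader change below to skip whole
frequency blocks rather than decode them.

    // Hedged caller-side sketch (Lucene 4.1 API); "body" is an illustrative field name.
    Terms terms = MultiFields.getTerms(indexReader, "body");
    if (terms != null) {
      TermsEnum termsEnum = terms.iterator(null);
      Bits liveDocs = MultiFields.getLiveDocs(indexReader);
      BytesRef term;
      while ((term = termsEnum.next()) != null) {
        // flags = 0: docs only, no freqs -> BlockDocsEnum can use ForUtil.skipBlock
        DocsEnum docs = termsEnum.docs(liveDocs, null, 0);
        int doc;
        while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          // consume doc ids only; freq() is not meaningful when freqs were not requested
        }
      }
    }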

lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java

@@ -275,10 +275,10 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
     } else {
       docsEnum = new BlockDocsEnum(fieldInfo);
     }
-    return docsEnum.reset(liveDocs, (IntBlockTermState) termState);
+    return docsEnum.reset(liveDocs, (IntBlockTermState) termState, flags);
   }
 
-  // TODO: specialize to liveDocs vs not, and freqs vs not
+  // TODO: specialize to liveDocs vs not
   @Override
   public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs,
@@ -310,7 +310,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
     } else {
       everythingEnum = new EverythingEnum(fieldInfo);
     }
-    return everythingEnum.reset(liveDocs, (IntBlockTermState) termState);
+    return everythingEnum.reset(liveDocs, (IntBlockTermState) termState, flags);
   }
 }
@@ -352,6 +352,8 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
     private int nextSkipDoc;
 
     private Bits liveDocs;
+
+    private boolean needsFreq; // true if the caller actually needs frequencies
 
     public BlockDocsEnum(FieldInfo fieldInfo) throws IOException {
       this.startDocIn = Lucene41PostingsReader.this.docIn;
@@ -370,7 +372,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
               indexHasPayloads == fieldInfo.hasPayloads();
     }
 
-    public DocsEnum reset(Bits liveDocs, IntBlockTermState termState) throws IOException {
+    public DocsEnum reset(Bits liveDocs, IntBlockTermState termState, int flags) throws IOException {
       this.liveDocs = liveDocs;
       // if (DEBUG) {
       //   System.out.println("  FPR.reset: termState=" + termState);
@@ -381,6 +383,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
       skipOffset = termState.skipOffset;
 
       doc = -1;
+      this.needsFreq = (flags & DocsEnum.FLAG_FREQS) != 0;
       if (!indexHasFreq) {
         Arrays.fill(freqBuffer, 1);
       }
@@ -416,7 +419,11 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
         // if (DEBUG) {
         //   System.out.println("    fill freq block from fp=" + docIn.getFilePointer());
         // }
-        forUtil.readBlock(docIn, encoded, freqBuffer);
+        if (needsFreq) {
+          forUtil.readBlock(docIn, encoded, freqBuffer);
+        } else {
+          forUtil.skipBlock(docIn); // skip over freqs
+        }
       }
     } else {
       // Read vInts:
@@ -1044,6 +1051,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
 
     private Bits liveDocs;
+
+    private boolean needsOffsets; // true if we actually need offsets
+    private boolean needsPayloads; // true if we actually need payloads
 
     public EverythingEnum(FieldInfo fieldInfo) throws IOException {
       this.startDocIn = Lucene41PostingsReader.this.docIn;
       this.docIn = startDocIn.clone();
@@ -1079,7 +1089,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
               indexHasPayloads == fieldInfo.hasPayloads();
     }
 
-    public EverythingEnum reset(Bits liveDocs, IntBlockTermState termState) throws IOException {
+    public EverythingEnum reset(Bits liveDocs, IntBlockTermState termState, int flags) throws IOException {
       this.liveDocs = liveDocs;
       // if (DEBUG) {
       //   System.out.println("  FPR.reset: termState=" + termState);
@@ -1101,6 +1111,9 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
         lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset;
       }
 
+      this.needsOffsets = (flags & DocsAndPositionsEnum.FLAG_OFFSETS) != 0;
+      this.needsPayloads = (flags & DocsAndPositionsEnum.FLAG_PAYLOADS) != 0;
+
       doc = -1;
       accum = 0;
       docUpto = 0;
@@ -1203,15 +1216,22 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
           // if (DEBUG) {
           //   System.out.println("        bulk payload block @ pay.fp=" + payIn.getFilePointer());
           // }
-          forUtil.readBlock(payIn, encoded, payloadLengthBuffer);
-          int numBytes = payIn.readVInt();
-          // if (DEBUG) {
-          //   System.out.println("        " + numBytes + " payload bytes @ pay.fp=" + payIn.getFilePointer());
-          // }
-          if (numBytes > payloadBytes.length) {
-            payloadBytes = ArrayUtil.grow(payloadBytes, numBytes);
-          }
-          payIn.readBytes(payloadBytes, 0, numBytes);
+          if (needsPayloads) {
+            forUtil.readBlock(payIn, encoded, payloadLengthBuffer);
+            int numBytes = payIn.readVInt();
+            // if (DEBUG) {
+            //   System.out.println("        " + numBytes + " payload bytes @ pay.fp=" + payIn.getFilePointer());
+            // }
+            if (numBytes > payloadBytes.length) {
+              payloadBytes = ArrayUtil.grow(payloadBytes, numBytes);
+            }
+            payIn.readBytes(payloadBytes, 0, numBytes);
+          } else {
+            // this works, because when writing a vint block we always force the first length to be written
+            forUtil.skipBlock(payIn); // skip over lengths
+            int numBytes = payIn.readVInt(); // read length of payloadBytes
+            payIn.seek(payIn.getFilePointer() + numBytes); // skip over payloadBytes
+          }
           payloadByteUpto = 0;
         }
 
@@ -1219,8 +1239,14 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
          // if (DEBUG) {
          //   System.out.println("        bulk offset block @ pay.fp=" + payIn.getFilePointer());
          // }
-          forUtil.readBlock(payIn, encoded, offsetStartDeltaBuffer);
-          forUtil.readBlock(payIn, encoded, offsetLengthBuffer);
+          if (needsOffsets) {
+            forUtil.readBlock(payIn, encoded, offsetStartDeltaBuffer);
+            forUtil.readBlock(payIn, encoded, offsetLengthBuffer);
+          } else {
+            // this works, because when writing a vint block we always force the first length to be written
+            forUtil.skipBlock(payIn); // skip over starts
+            forUtil.skipBlock(payIn); // skip over lengths
+          }
         }
       }
     }
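
Note: the skip paths above all bottom out in ForUtil.skipBlock. A hedged sketch
of the idea follows (the real method lives in
org.apache.lucene.codecs.lucene41.ForUtil; ALL_VALUES_EQUAL and encodedSizes
are named after that class, but the body here is an approximation, not the
verbatim implementation). Each 128-value block is prefixed by a header byte
giving the bits per value, so the encoded length is known without decoding and
the block can be stepped over with a single seek:

    // Approximation of ForUtil.skipBlock in the 4.1 codec.
    void skipBlock(IndexInput in) throws IOException {
      final int numBits = in.readByte();   // header: bits per value for this block
      if (numBits == ALL_VALUES_EQUAL) {   // degenerate case: one vInt stands for all values
        in.readVInt();
        return;
      }
      // encodedSizes[numBits] caches the byte length of a packed 128-value block
      in.seek(in.getFilePointer() + encodedSizes[numBits]);
    }

Because skipping is a single seek, its cost is constant no matter how densely
the skipped freqs, offsets or payload lengths were packed.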

lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat3.java

@@ -29,7 +29,6 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.MockVariableLengthPayloadFilter;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.lucene41.Lucene41Codec;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -61,12 +60,12 @@ import org.apache.lucene.util.automaton.CompiledAutomaton;
 import org.apache.lucene.util.automaton.RegExp;
 
 /**
- * Tests partial enumeration (only pulling a subset of the prox data)
+ * Tests partial enumeration (only pulling a subset of the indexed data)
  */
 public class TestBlockPostingsFormat3 extends LuceneTestCase {
   static final int MAXDOC = Lucene41PostingsFormat.BLOCK_SIZE * 20;
 
-  // creates 6 fields with different options and does "duels" of fields against each other
+  // creates 8 fields with different options and does "duels" of fields against each other
   public void test() throws Exception {
     Directory dir = newDirectory();
     Analyzer analyzer = new Analyzer(new Analyzer.PerFieldReuseStrategy()) {
@@ -85,35 +84,45 @@ public class TestBlockPostingsFormat3 extends LuceneTestCase {
       }
     };
     IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
-    iwc.setCodec(new Lucene41Codec() {
-      @Override
-      public PostingsFormat getPostingsFormatForField(String field) {
-        return PostingsFormat.forName("Lucene41");
-        // TODO: we could actually add more fields implemented with different PFs
-      }
-    });
+    iwc.setCodec(new Lucene41Codec());
+    // TODO: we could actually add more fields implemented with different PFs
+    // or, just put this test into the usual rotation?
     RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
     Document doc = new Document();
-    FieldType bareType = new FieldType(TextField.TYPE_NOT_STORED);
+    FieldType docsOnlyType = new FieldType(TextField.TYPE_NOT_STORED);
+    // turn this on for a cross-check
+    docsOnlyType.setStoreTermVectors(true);
+    docsOnlyType.setIndexOptions(IndexOptions.DOCS_ONLY);
+
+    FieldType docsAndFreqsType = new FieldType(TextField.TYPE_NOT_STORED);
+    // turn this on for a cross-check
+    docsAndFreqsType.setStoreTermVectors(true);
+    docsAndFreqsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+
+    FieldType positionsType = new FieldType(TextField.TYPE_NOT_STORED);
     // turn these on for a cross-check
-    bareType.setStoreTermVectors(true);
-    bareType.setStoreTermVectorPositions(true);
-    bareType.setStoreTermVectorOffsets(true);
-    bareType.setStoreTermVectorPayloads(true);
-    FieldType offsetsType = new FieldType(bareType);
+    positionsType.setStoreTermVectors(true);
+    positionsType.setStoreTermVectorPositions(true);
+    positionsType.setStoreTermVectorOffsets(true);
+    positionsType.setStoreTermVectorPayloads(true);
+    FieldType offsetsType = new FieldType(positionsType);
     offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
-    Field field1 = new Field("field1bare", "", bareType);
-    Field field2 = new Field("field2offsets", "", offsetsType);
-    Field field3 = new Field("field3payloadsFixed", "", bareType);
-    Field field4 = new Field("field4payloadsVariable", "", bareType);
-    Field field5 = new Field("field5payloadsFixedOffsets", "", offsetsType);
-    Field field6 = new Field("field6payloadsVariableOffsets", "", offsetsType);
+    Field field1 = new Field("field1docs", "", docsOnlyType);
+    Field field2 = new Field("field2freqs", "", docsAndFreqsType);
+    Field field3 = new Field("field3positions", "", positionsType);
+    Field field4 = new Field("field4offsets", "", offsetsType);
+    Field field5 = new Field("field5payloadsFixed", "", positionsType);
+    Field field6 = new Field("field6payloadsVariable", "", positionsType);
+    Field field7 = new Field("field7payloadsFixedOffsets", "", offsetsType);
+    Field field8 = new Field("field8payloadsVariableOffsets", "", offsetsType);
    doc.add(field1);
     doc.add(field2);
     doc.add(field3);
     doc.add(field4);
     doc.add(field5);
     doc.add(field6);
+    doc.add(field7);
+    doc.add(field8);
     for (int i = 0; i < MAXDOC; i++) {
       String stringValue = Integer.toString(i) + " verycommon " + English.intToEnglish(i).replace('-', ' ') + " " + _TestUtil.randomSimpleString(random());
       field1.setStringValue(stringValue);
@@ -122,6 +131,8 @@ public class TestBlockPostingsFormat3 extends LuceneTestCase {
       field4.setStringValue(stringValue);
       field5.setStringValue(stringValue);
       field6.setStringValue(stringValue);
+      field7.setStringValue(stringValue);
+      field8.setStringValue(stringValue);
       iw.addDocument(doc);
     }
     iw.close();
@@ -139,11 +150,12 @@ public class TestBlockPostingsFormat3 extends LuceneTestCase {
     DirectoryReader ir = DirectoryReader.open(dir);
     for (AtomicReaderContext leaf : ir.leaves()) {
       AtomicReader leafReader = leaf.reader();
-      assertTerms(leafReader.terms("field1bare"), leafReader.terms("field2offsets"), true);
-      assertTerms(leafReader.terms("field2offsets"), leafReader.terms("field3payloadsFixed"), true);
-      assertTerms(leafReader.terms("field3payloadsFixed"), leafReader.terms("field4payloadsVariable"), true);
-      assertTerms(leafReader.terms("field4payloadsVariable"), leafReader.terms("field5payloadsFixedOffsets"), true);
-      assertTerms(leafReader.terms("field5payloadsFixedOffsets"), leafReader.terms("field6payloadsVariableOffsets"), true);
+      assertTerms(leafReader.terms("field1docs"), leafReader.terms("field2freqs"), true);
+      assertTerms(leafReader.terms("field3positions"), leafReader.terms("field4offsets"), true);
+      assertTerms(leafReader.terms("field4offsets"), leafReader.terms("field5payloadsFixed"), true);
+      assertTerms(leafReader.terms("field5payloadsFixed"), leafReader.terms("field6payloadsVariable"), true);
+      assertTerms(leafReader.terms("field6payloadsVariable"), leafReader.terms("field7payloadsFixedOffsets"), true);
+      assertTerms(leafReader.terms("field7payloadsFixedOffsets"), leafReader.terms("field8payloadsVariableOffsets"), true);
     }
     ir.close();
   }
@@ -334,39 +346,31 @@ public class TestBlockPostingsFormat3 extends LuceneTestCase {
 
         // with freqs:
         assertDocsEnum(leftDocs = leftTermsEnum.docs(null, leftDocs),
-                       rightDocs = rightTermsEnum.docs(null, rightDocs),
-                       true);
+                       rightDocs = rightTermsEnum.docs(null, rightDocs));
         assertDocsEnum(leftDocs = leftTermsEnum.docs(randomBits, leftDocs),
-                       rightDocs = rightTermsEnum.docs(randomBits, rightDocs),
-                       true);
+                       rightDocs = rightTermsEnum.docs(randomBits, rightDocs));
 
         // w/o freqs:
         assertDocsEnum(leftDocs = leftTermsEnum.docs(null, leftDocs, 0),
-                       rightDocs = rightTermsEnum.docs(null, rightDocs, 0),
-                       false);
+                       rightDocs = rightTermsEnum.docs(null, rightDocs, 0));
         assertDocsEnum(leftDocs = leftTermsEnum.docs(randomBits, leftDocs, 0),
-                       rightDocs = rightTermsEnum.docs(randomBits, rightDocs, 0),
-                       false);
+                       rightDocs = rightTermsEnum.docs(randomBits, rightDocs, 0));
 
         // with freqs:
         assertDocsSkipping(leftTermsEnum.docFreq(),
                            leftDocs = leftTermsEnum.docs(null, leftDocs),
-                           rightDocs = rightTermsEnum.docs(null, rightDocs),
-                           true);
+                           rightDocs = rightTermsEnum.docs(null, rightDocs));
         assertDocsSkipping(leftTermsEnum.docFreq(),
                            leftDocs = leftTermsEnum.docs(randomBits, leftDocs),
-                           rightDocs = rightTermsEnum.docs(randomBits, rightDocs),
-                           true);
+                           rightDocs = rightTermsEnum.docs(randomBits, rightDocs));
 
         // w/o freqs:
         assertDocsSkipping(leftTermsEnum.docFreq(),
                            leftDocs = leftTermsEnum.docs(null, leftDocs, 0),
-                           rightDocs = rightTermsEnum.docs(null, rightDocs, 0),
-                           false);
+                           rightDocs = rightTermsEnum.docs(null, rightDocs, 0));
         assertDocsSkipping(leftTermsEnum.docFreq(),
                            leftDocs = leftTermsEnum.docs(randomBits, leftDocs, 0),
-                           rightDocs = rightTermsEnum.docs(randomBits, rightDocs, 0),
-                           false);
+                           rightDocs = rightTermsEnum.docs(randomBits, rightDocs, 0));
       }
     }
     assertNull(rightTermsEnum.next());
@@ -409,7 +413,7 @@ public class TestBlockPostingsFormat3 extends LuceneTestCase {
   /**
    * checks docs + freqs, sequentially
    */
-  public void assertDocsEnum(DocsEnum leftDocs, DocsEnum rightDocs, boolean hasFreqs) throws Exception {
+  public void assertDocsEnum(DocsEnum leftDocs, DocsEnum rightDocs) throws Exception {
     if (leftDocs == null) {
       assertNull(rightDocs);
       return;
@@ -419,9 +423,7 @@ public class TestBlockPostingsFormat3 extends LuceneTestCase {
     int docid;
     while ((docid = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
       assertEquals(docid, rightDocs.nextDoc());
-      if (hasFreqs) {
-        assertEquals(leftDocs.freq(), rightDocs.freq());
-      }
+      // we don't assert freqs, they are allowed to be different
     }
     assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc());
   }
@@ -429,7 +431,7 @@ public class TestBlockPostingsFormat3 extends LuceneTestCase {
   /**
    * checks advancing docs
    */
-  public void assertDocsSkipping(int docFreq, DocsEnum leftDocs, DocsEnum rightDocs, boolean hasFreqs) throws Exception {
+  public void assertDocsSkipping(int docFreq, DocsEnum leftDocs, DocsEnum rightDocs) throws Exception {
     if (leftDocs == null) {
       assertNull(rightDocs);
       return;
@@ -453,9 +455,7 @@ public class TestBlockPostingsFormat3 extends LuceneTestCase {
     if (docid == DocIdSetIterator.NO_MORE_DOCS) {
       return;
     }
-    if (hasFreqs) {
-      assertEquals(leftDocs.freq(), rightDocs.freq());
-    }
+    // we don't assert freqs, they are allowed to be different
   }
 }
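
Note: a companion sketch for the positions side this test exercises. Requesting
only payloads through the flags argument lets EverythingEnum skip the two
offset blocks per chunk. The flag constants are the real 4.1 API; termsEnum and
liveDocs are assumed to be set up as in the docs example after the CHANGES
entry above.

    // Hedged sketch: partial enumeration of positions data (Lucene 4.1 API).
    DocsAndPositionsEnum postings =
        termsEnum.docsAndPositions(liveDocs, null, DocsAndPositionsEnum.FLAG_PAYLOADS);
    if (postings != null) { // null when the field was not indexed with positions
      while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        final int freq = postings.freq();
        for (int i = 0; i < freq; i++) {
          int position = postings.nextPosition();
          BytesRef payload = postings.getPayload(); // may be null for this position
          // offsets were not requested, so startOffset()/endOffset() must not be relied on
        }
      }
    }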