mirror of https://github.com/apache/lucene.git
LUCENE-9827: Speed up merging of stored fields and term vectors for small segments
Stored fields and term vectors are block-compressed. Decompressing and recompressing every document on every merge is too slow, so we try to avoid it unless it will actually improve the compression ratio; when we can get away with it, we just bulk-copy the existing compressed blocks to the new segment.

Previously, small segments were always considered dirty and recompressed: the optimized bulk-merge path would not kick in until segments were relatively large. But as block sizes and compression ratios have grown (shared dictionaries, etc.), "relatively large" has become a much bigger number.

So avoid doing wasted work: if there is only one dirty chunk (an incompletely filled compression block), don't recompress, since that will likely still leave one dirty chunk and only costs CPU. Require at least two dirty chunks before recompressing, so the recompression actually buys something (it reduces two dirty chunks to one). The change also means that bulk merge now happens often in the unit test suite, increasing coverage.
This commit is contained in:
parent 12999d30f2
commit 1b36406ec4
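To make the heuristic above concrete, here is a small standalone sketch (illustrative only, not the actual Lucene code; the real change is in the tooDirty() methods shown in the diffs below) comparing the old and new decisions for a tiny segment:

// Illustrative sketch of the "too dirty" heuristic described in the commit message.
// The real logic lives in Lucene90CompressingStoredFieldsWriter.tooDirty() and
// Lucene90CompressingTermVectorsWriter.tooDirty(); tooDirty == true means the segment
// gets recompressed instead of bulk-copied.
public class TooDirtySketch {

  // Old behavior: dirty docs beyond 1% of the segment always force recompression,
  // which penalizes small segments that typically have exactly one partly-filled chunk.
  static boolean tooDirtyOld(long numDirtyChunks, long numDirtyDocs, long numDocs) {
    return numDirtyChunks > 1024 || numDirtyDocs * 100 > numDocs;
  }

  // New behavior: additionally require at least 2 dirty chunks, so recompression
  // only happens when it can actually reduce the number of dirty chunks.
  static boolean tooDirtyNew(long numDirtyChunks, long numDirtyDocs, long numDocs) {
    return numDirtyChunks > 1024 || (numDirtyChunks > 1 && numDirtyDocs * 100 > numDocs);
  }

  public static void main(String[] args) {
    // A tiny segment: 5 docs, all in a single partly-filled (dirty) chunk.
    System.out.println(tooDirtyOld(1, 5, 5)); // true  -> old code recompresses
    System.out.println(tooDirtyNew(1, 5, 5)); // false -> new code bulk-copies the chunk
    // A segment with two dirty chunks is still recompressed (two chunks collapse into one).
    System.out.println(tooDirtyNew(2, 5, 5)); // true
  }
}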
@@ -196,6 +196,9 @@ Improvements

 * LUCENE-9794: Speed up implementations of DataInput.skipBytes(). (Greg Miller)

+* LUCENE-9827: Speed up merging of stored fields and term vectors for smaller segments.
+  (Daniel Mitterdorfer, Dimitrios Liapis, Adrien Grand, Robert Muir)
+
 Bug fixes
@@ -711,7 +711,8 @@ public final class Lucene90CompressingStoredFieldsWriter extends StoredFieldsWriter
   boolean tooDirty(Lucene90CompressingStoredFieldsReader candidate) {
     // more than 1% dirty, or more than hard limit of 1024 dirty chunks
     return candidate.getNumDirtyChunks() > 1024
-        || candidate.getNumDirtyDocs() * 100 > candidate.getNumDocs();
+        || (candidate.getNumDirtyChunks() > 1
+            && candidate.getNumDirtyDocs() * 100 > candidate.getNumDocs());
   }

   private static class CompressingStoredFieldsMergeSub extends DocIDMerger.Sub {
@@ -939,7 +939,8 @@ public final class Lucene90CompressingTermVectorsWriter extends TermVectorsWriter
   boolean tooDirty(Lucene90CompressingTermVectorsReader candidate) {
     // more than 1% dirty, or more than hard limit of 1024 dirty chunks
     return candidate.getNumDirtyChunks() > 1024
-        || candidate.getNumDirtyDocs() * 100 > candidate.getNumDocs();
+        || (candidate.getNumDirtyChunks() > 1
+            && candidate.getNumDirtyDocs() * 100 > candidate.getNumDocs());
   }

   @Override
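The unchanged comment's "more than 1% dirty" refers to the ratio of dirty docs to total docs in the candidate segment. A tiny illustrative calculation (hypothetical numbers, not taken from the patch) plugging values into the new predicate:

// Illustrative only: hypothetical per-segment statistics plugged into the new predicate,
// to make the "more than 1% dirty" comment concrete. Not part of the patch.
public class DirtyRatioExample {
  static boolean tooDirty(long numDirtyChunks, long numDirtyDocs, long numDocs) {
    // more than 1% dirty, or more than hard limit of 1024 dirty chunks
    return numDirtyChunks > 1024
        || (numDirtyChunks > 1 && numDirtyDocs * 100 > numDocs);
  }

  public static void main(String[] args) {
    // 2 dirty chunks, 15 dirty docs out of 1000: 15 * 100 = 1500 > 1000 (1.5% dirty) -> recompress
    System.out.println(tooDirty(2, 15, 1000)); // true
    // 2 dirty chunks, 9 dirty docs out of 1000: 9 * 100 = 900 <= 1000 (0.9% dirty) -> bulk copy
    System.out.println(tooDirty(2, 9, 1000)); // false
    // more than 1024 dirty chunks always counts as too dirty, regardless of the ratio
    System.out.println(tooDirty(2000, 1, 1_000_000)); // true
  }
}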
@@ -307,6 +307,11 @@ public class TestCompressingStoredFieldsFormat extends BaseStoredFieldsFormatTestCase
     }
     iw.getConfig().setMergePolicy(newLogMergePolicy());
     iw.forceMerge(1);
+    // add a single doc and merge again
+    Document doc = new Document();
+    doc.add(new StoredField("text", "not very long at all"));
+    iw.addDocument(doc);
+    iw.forceMerge(1);
     DirectoryReader ir2 = DirectoryReader.openIfChanged(ir);
     assertNotNull(ir2);
     ir.close();
@@ -314,8 +319,8 @@ public class TestCompressingStoredFieldsFormat extends BaseStoredFieldsFormatTestCase
     CodecReader sr = (CodecReader) getOnlyLeafReader(ir);
     Lucene90CompressingStoredFieldsReader reader =
         (Lucene90CompressingStoredFieldsReader) sr.getFieldsReader();
-    // we could get lucky, and have zero, but typically one.
-    assertTrue(reader.getNumDirtyChunks() <= 1);
+    // at most 2: the 5 chunks from 5 doc segment will be collapsed into a single chunk
+    assertTrue(reader.getNumDirtyChunks() <= 2);
     ir.close();
     iw.close();
     dir.close();
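For context, the modified test exercises this scenario: several tiny segments are force-merged, then one more document is added and the index is force-merged again, which is exactly the case where the old code would recompress a single dirty chunk for no gain. Below is a minimal standalone sketch of that scenario using the public IndexWriter API (illustrative only; the field names and setup details are assumptions, not taken from the actual test, which relies on LuceneTestCase helpers such as newLogMergePolicy() and getOnlyLeafReader()):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class SmallSegmentMergeSketch {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new ByteBuffersDirectory();
        IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
      // Create a few tiny segments, each holding a single partly-filled compression block.
      for (int seg = 0; seg < 5; seg++) {
        Document doc = new Document();
        doc.add(new StoredField("text", "not very long at all"));
        iw.addDocument(doc);
        iw.commit(); // flush a new tiny segment
      }
      iw.forceMerge(1); // first merge collapses the tiny segments

      // Add one more doc and merge again; with this change the already-compressed
      // blocks should be bulk-copied rather than recompressed.
      Document doc = new Document();
      doc.add(new StoredField("text", "not very long at all"));
      iw.addDocument(doc);
      iw.forceMerge(1);

      try (DirectoryReader reader = DirectoryReader.open(iw)) {
        System.out.println("docs after merge: " + reader.numDocs()); // 6
      }
    }
  }
}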
@@ -109,6 +109,13 @@ public class TestCompressingTermVectorsFormat extends BaseTermVectorsFormatTestCase
     }
     iw.getConfig().setMergePolicy(newLogMergePolicy());
     iw.forceMerge(1);
+    // add one more doc and merge again
+    Document doc = new Document();
+    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
+    ft.setStoreTermVectors(true);
+    doc.add(new Field("text", "not very long at all", ft));
+    iw.addDocument(doc);
+    iw.forceMerge(1);
     DirectoryReader ir2 = DirectoryReader.openIfChanged(ir);
     assertNotNull(ir2);
     ir.close();
@@ -116,8 +123,8 @@ public class TestCompressingTermVectorsFormat extends BaseTermVectorsFormatTestCase
     CodecReader sr = (CodecReader) getOnlyLeafReader(ir);
     Lucene90CompressingTermVectorsReader reader =
         (Lucene90CompressingTermVectorsReader) sr.getTermVectorsReader();
-    // we could get lucky, and have zero, but typically one.
-    assertTrue(reader.getNumDirtyChunks() <= 1);
+    // at most 2: the 5 chunks from 5 doc segment will be collapsed into a single chunk
+    assertTrue(reader.getNumDirtyChunks() <= 2);
     ir.close();
     iw.close();
     dir.close();