diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 8243635bbd0..4aa3164a886 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -196,6 +196,9 @@ Improvements
 
 * LUCENE-9794: Speed up implementations of DataInput.skipBytes(). (Greg Miller)
 
+* LUCENE-9827: Speed up merging of stored fields and term vectors for smaller segments.
+  (Daniel Mitterdorfer, Dimitrios Liapis, Adrien Grand, Robert Muir)
+
 Bug fixes
 ---------------------
 
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsWriter.java
index 815bbc0eaa8..3e32545b86c 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsWriter.java
@@ -711,7 +711,8 @@ public final class Lucene90CompressingStoredFieldsWriter extends StoredFieldsWri
   boolean tooDirty(Lucene90CompressingStoredFieldsReader candidate) {
     // more than 1% dirty, or more than hard limit of 1024 dirty chunks
     return candidate.getNumDirtyChunks() > 1024
-        || candidate.getNumDirtyDocs() * 100 > candidate.getNumDocs();
+        || (candidate.getNumDirtyChunks() > 1
+            && candidate.getNumDirtyDocs() * 100 > candidate.getNumDocs());
   }
 
   private static class CompressingStoredFieldsMergeSub extends DocIDMerger.Sub {
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsWriter.java
index 9908ccf78de..b33e3534651 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsWriter.java
@@ -939,7 +939,8 @@ public final class Lucene90CompressingTermVectorsWriter extends TermVectorsWrite
   boolean tooDirty(Lucene90CompressingTermVectorsReader candidate) {
     // more than 1% dirty, or more than hard limit of 1024 dirty chunks
     return candidate.getNumDirtyChunks() > 1024
-        || candidate.getNumDirtyDocs() * 100 > candidate.getNumDocs();
+        || (candidate.getNumDirtyChunks() > 1
+            && candidate.getNumDirtyDocs() * 100 > candidate.getNumDocs());
   }
 
   @Override
diff --git a/lucene/test-framework/src/test/org/apache/lucene/codecs/lucene90/compressing/TestCompressingStoredFieldsFormat.java b/lucene/test-framework/src/test/org/apache/lucene/codecs/lucene90/compressing/TestCompressingStoredFieldsFormat.java
index c96c9734af1..d89223d8d35 100644
--- a/lucene/test-framework/src/test/org/apache/lucene/codecs/lucene90/compressing/TestCompressingStoredFieldsFormat.java
+++ b/lucene/test-framework/src/test/org/apache/lucene/codecs/lucene90/compressing/TestCompressingStoredFieldsFormat.java
@@ -307,6 +307,11 @@ public class TestCompressingStoredFieldsFormat extends BaseStoredFieldsFormatTes
     }
     iw.getConfig().setMergePolicy(newLogMergePolicy());
     iw.forceMerge(1);
+    // add a single doc and merge again
+    Document doc = new Document();
+    doc.add(new StoredField("text", "not very long at all"));
+    iw.addDocument(doc);
+    iw.forceMerge(1);
     DirectoryReader ir2 = DirectoryReader.openIfChanged(ir);
     assertNotNull(ir2);
     ir.close();
@@ -314,8 +319,8 @@ public class TestCompressingStoredFieldsFormat extends BaseStoredFieldsFormatTes
     CodecReader sr = (CodecReader) getOnlyLeafReader(ir);
     Lucene90CompressingStoredFieldsReader reader =
         (Lucene90CompressingStoredFieldsReader) sr.getFieldsReader();
-    // we could get lucky, and have zero, but typically one.
-    assertTrue(reader.getNumDirtyChunks() <= 1);
+    // at most 2: the 5 chunks from the 5-doc segment will be collapsed into a single chunk
+    assertTrue(reader.getNumDirtyChunks() <= 2);
     ir.close();
     iw.close();
     dir.close();
diff --git a/lucene/test-framework/src/test/org/apache/lucene/codecs/lucene90/compressing/TestCompressingTermVectorsFormat.java b/lucene/test-framework/src/test/org/apache/lucene/codecs/lucene90/compressing/TestCompressingTermVectorsFormat.java
index 02924be5ae2..93e26157479 100644
--- a/lucene/test-framework/src/test/org/apache/lucene/codecs/lucene90/compressing/TestCompressingTermVectorsFormat.java
+++ b/lucene/test-framework/src/test/org/apache/lucene/codecs/lucene90/compressing/TestCompressingTermVectorsFormat.java
@@ -109,6 +109,13 @@ public class TestCompressingTermVectorsFormat extends BaseTermVectorsFormatTestC
     }
     iw.getConfig().setMergePolicy(newLogMergePolicy());
     iw.forceMerge(1);
+    // add one more doc and merge again
+    Document doc = new Document();
+    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
+    ft.setStoreTermVectors(true);
+    doc.add(new Field("text", "not very long at all", ft));
+    iw.addDocument(doc);
+    iw.forceMerge(1);
     DirectoryReader ir2 = DirectoryReader.openIfChanged(ir);
     assertNotNull(ir2);
     ir.close();
@@ -116,8 +123,8 @@ public class TestCompressingTermVectorsFormat extends BaseTermVectorsFormatTestC
     CodecReader sr = (CodecReader) getOnlyLeafReader(ir);
     Lucene90CompressingTermVectorsReader reader =
         (Lucene90CompressingTermVectorsReader) sr.getTermVectorsReader();
-    // we could get lucky, and have zero, but typically one.
-    assertTrue(reader.getNumDirtyChunks() <= 1);
+    // at most 2: the 5 chunks from the 5-doc segment will be collapsed into a single chunk
+    assertTrue(reader.getNumDirtyChunks() <= 2);
     ir.close();
     iw.close();
     dir.close();
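
For context, the revised heuristic can be read on its own, away from the surrounding writer classes. The sketch below is a minimal, self-contained illustration, not Lucene source: the class name TooDirtySketch and the plain long parameters are hypothetical stand-ins for the reader accessors (getNumDirtyChunks(), getNumDirtyDocs(), getNumDocs()) that both tooDirty(...) methods in the patch call.

// TooDirtySketch.java -- standalone sketch; compile with `javac TooDirtySketch.java`.
public class TooDirtySketch {

  /**
   * Mirrors the patched check. A "dirty" chunk is one that was flushed before
   * it was full, so it compresses worse than a complete chunk would.
   */
  static boolean tooDirty(long numDirtyChunks, long numDirtyDocs, long numDocs) {
    // Hard limit: more than 1024 dirty chunks is always too dirty. Otherwise
    // require BOTH more than one dirty chunk AND more than 1% dirty docs
    // (numDirtyDocs * 100 > numDocs is the integer form of the 1% test).
    return numDirtyChunks > 1024
        || (numDirtyChunks > 1 && numDirtyDocs * 100 > numDocs);
  }

  public static void main(String[] args) {
    // A tiny segment always ends with one partially filled, hence dirty,
    // chunk. Under the old check (numDirtyDocs * 100 > numDocs, i.e.
    // 500 > 5) it was flagged and recompressed on every merge; the new
    // single-dirty-chunk guard lets it take the cheaper merge path.
    System.out.println(tooDirty(1, 5, 5)); // false (true before this patch)

    // Two dirty chunks covering 2% of all docs: still considered too dirty.
    System.out.println(tooDirty(2, 20, 1000)); // true

    // Above the hard limit: always too dirty.
    System.out.println(tooDirty(1025, 0, 1_000_000)); // true
  }
}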
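The relaxed test assertions follow the same arithmetic: after the second forceMerge(1), the 5 chunks of the original 5-doc segment collapse into a single chunk, and the extra one-doc segment can contribute at most one more dirty chunk, so getNumDirtyChunks() may legitimately reach 2 where the old expectation was at most 1.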