From 21735161dcbdfcad52220d0389637c43f0d7989d Mon Sep 17 00:00:00 2001
From: Mike McCandless <mikemccand@apache.org>
Date: Tue, 22 Nov 2016 14:10:16 -0500
Subject: [PATCH] LUCENE-7568: Optimize merging when index sorting is used but
 the index is already sorted

---
 lucene/CHANGES.txt                                 |   5 +
 .../lucene/codecs/DocValuesConsumer.java           |  10 +-
 .../apache/lucene/codecs/NormsConsumer.java        |   2 +-
 .../lucene/codecs/StoredFieldsWriter.java          |   2 +-
 .../lucene/codecs/TermVectorsWriter.java           |   2 +-
 .../CompressingStoredFieldsWriter.java             |   2 +-
 .../CompressingTermVectorsWriter.java              |   2 +-
 .../codecs/lucene60/Lucene60PointsWriter.java      |   2 +-
 .../index/MappingMultiPostingsEnum.java            |   2 +-
 .../org/apache/lucene/index/MergeState.java        |  84 ++++----
 .../org/apache/lucene/index/MultiSorter.java       |  14 +-
 .../apache/lucene/index/TestIndexSorting.java      | 184 +++++++++++++++++-
 12 files changed, 261 insertions(+), 50 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index ca39e65d889..e62a99d1eb0 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -117,6 +117,11 @@ Improvements
   control how text is analyzed and converted into a query (Matt Weber
   via Mike McCandless)
 
+Optimizations
+
+* LUCENE-7568: Optimize merging when index sorting is used but the
+  index is already sorted (Jim Ferenczi via Mike McCandless)
+
 Other
 
 * LUCENE-7546: Fixed references to benchmark wikipedia data and the
   Jenkins line-docs file

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java
index e61724fbab1..ba2f2aa501c 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java
@@ -198,7 +198,7 @@ public abstract class DocValuesConsumer implements Closeable {
       }
     }
 
-    final DocIDMerger<NumericDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
+    final DocIDMerger<NumericDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.needsIndexSort);
 
     final long finalCost = cost;
 
@@ -296,7 +296,7 @@ public abstract class DocValuesConsumer implements Closeable {
       }
     }
 
-    final DocIDMerger<BinaryDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
+    final DocIDMerger<BinaryDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.needsIndexSort);
 
     final long finalCost = cost;
     return new BinaryDocValues() {
@@ -397,7 +397,7 @@ public abstract class DocValuesConsumer implements Closeable {
 
     final long finalCost = cost;
 
-    final DocIDMerger<SortedNumericDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
+    final DocIDMerger<SortedNumericDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.needsIndexSort);
 
     return new SortedNumericDocValues() {
 
@@ -555,7 +555,7 @@ public abstract class DocValuesConsumer implements Closeable {
 
     final long finalCost = cost;
 
-    final DocIDMerger<SortedDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
+    final DocIDMerger<SortedDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.needsIndexSort);
 
     return new SortedDocValues() {
       private int docID = -1;
@@ -721,7 +721,7 @@ public abstract class DocValuesConsumer implements Closeable {
       subs.add(new SortedSetDocValuesSub(mergeState.docMaps[i], values, map.getGlobalOrds(i)));
     }
 
-    final DocIDMerger<SortedSetDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
+    final DocIDMerger<SortedSetDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.needsIndexSort);
 
     final long finalCost = cost;
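Every hunk above makes the same one-line substitution: the per-field doc-values mergers now consult mergeState.needsIndexSort instead of checking whether an index sort is merely configured, so DocIDMerger can stay on its cheap sequential path whenever the merged segment comes out already sorted. The toy model below illustrates that two-mode design; it is a sketch only, not the Lucene class: Sub, the int[] queue entries, and merge() are invented here for illustration.

    import java.util.Arrays;
    import java.util.Comparator;
    import java.util.List;
    import java.util.PriorityQueue;

    // Toy two-mode doc ID merger: sequential concatenation when no sort is
    // needed, a heap-based interleave when it is.
    final class ToyDocIDMerger {
      // Each sub reports the merged-segment docID of its next live document,
      // or -1 once exhausted (deletions are assumed already mapped out).
      interface Sub { int nextMappedDocID(); }

      // Returns merged docIDs in merged-segment order.
      static int[] merge(List<Sub> subs, int maxDocs, boolean needsIndexSort) {
        int[] out = new int[maxDocs];
        int upto = 0;
        if (needsIndexSort == false) {
          // Fast path: each sub's docs are already contiguous in merged order.
          for (Sub sub : subs) {
            for (int doc = sub.nextMappedDocID(); doc != -1; doc = sub.nextMappedDocID()) {
              out[upto++] = doc;
            }
          }
        } else {
          // Slow path: interleave subs on a priority queue keyed by mapped docID.
          PriorityQueue<int[]> queue =
              new PriorityQueue<>(Comparator.<int[]>comparingInt(e -> e[0]));
          for (int i = 0; i < subs.size(); i++) {
            int doc = subs.get(i).nextMappedDocID();
            if (doc != -1) queue.add(new int[] {doc, i}); // {mapped docID, sub index}
          }
          while (queue.isEmpty() == false) {
            int[] top = queue.poll();
            out[upto++] = top[0];
            int doc = subs.get(top[1]).nextMappedDocID();
            if (doc != -1) queue.add(new int[] {doc, top[1]});
          }
        }
        return Arrays.copyOf(out, upto);
      }
    }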
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/NormsConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/NormsConsumer.java
index 51abb69b91e..7ad7a7c52a4 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/NormsConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/NormsConsumer.java
@@ -130,7 +130,7 @@ public abstract class NormsConsumer implements Closeable {
       }
     }
 
-    final DocIDMerger<NumericDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
+    final DocIDMerger<NumericDocValuesSub> docIDMerger = new DocIDMerger<>(subs, mergeState.needsIndexSort);
 
     return new NumericDocValues() {
       private int docID = -1;

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java
index 1e912713cf6..80a9c498d55 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java
@@ -117,7 +117,7 @@ public abstract class StoredFieldsWriter implements Closeable {
       subs.add(new StoredFieldsMergeSub(new MergeVisitor(mergeState, i), mergeState.docMaps[i], storedFieldsReader, mergeState.maxDocs[i]));
     }
 
-    final DocIDMerger<StoredFieldsMergeSub> docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
+    final DocIDMerger<StoredFieldsMergeSub> docIDMerger = new DocIDMerger<>(subs, mergeState.needsIndexSort);
 
     int docCount = 0;
     while (true) {

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java
index 50cfca8a97d..c8ad9f6099f 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/TermVectorsWriter.java
@@ -205,7 +205,7 @@ public abstract class TermVectorsWriter implements Closeable {
       subs.add(new TermVectorsMergeSub(mergeState.docMaps[i], reader, mergeState.maxDocs[i]));
    }
 
-    final DocIDMerger<TermVectorsMergeSub> docIDMerger = new DocIDMerger<>(subs, mergeState.segmentInfo.getIndexSort() != null);
+    final DocIDMerger<TermVectorsMergeSub> docIDMerger = new DocIDMerger<>(subs, mergeState.needsIndexSort);
 
     int docCount = 0;
     while (true) {
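StoredFieldsWriter and TermVectorsWriter drive the same merger from a document-at-a-time loop, so the only change they need is the new flag. The shape of that loop, reduced to a sketch (DocSource and copyCurrentDoc() are illustrative stand-ins, not the Lucene API):

    // Sketch of the generic document-at-a-time merge loop: pull the next doc
    // in merged order, copy it into the new segment, and count it.
    final class ToyMergeLoop {
      interface DocSource {
        boolean next();        // advance to the next doc, false when exhausted
        void copyCurrentDoc(); // re-serialize the current doc into the new segment
      }

      static int merge(DocSource merged) {
        int docCount = 0;
        while (merged.next()) {
          merged.copyCurrentDoc();
          docCount++;
        }
        return docCount;
      }
    }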
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
index d5bf4ad8e88..1956ab70683 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
@@ -486,7 +486,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
 
   @Override
   public int merge(MergeState mergeState) throws IOException {
-    if (mergeState.segmentInfo.getIndexSort() != null) {
+    if (mergeState.needsIndexSort) {
       // TODO: can we gain back some optos even if index is sorted?  E.g. if sort results in large chunks of contiguous docs from one sub
       // being copied over...?
       return super.merge(mergeState);

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
index 9f8f44ef36d..46a289a97b5 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
@@ -730,7 +730,7 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter {
 
   @Override
   public int merge(MergeState mergeState) throws IOException {
-    if (mergeState.segmentInfo.getIndexSort() != null) {
+    if (mergeState.needsIndexSort) {
       // TODO: can we gain back some optos even if index is sorted?  E.g. if sort results in large chunks of contiguous docs from one sub
       // being copied over...?
       return super.merge(mergeState);

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java
index 0f3af856980..6e404a15d25 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java
@@ -134,7 +134,7 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable {
 
   @Override
   public void merge(MergeState mergeState) throws IOException {
-    if (mergeState.segmentInfo.getIndexSort() != null) {
+    if (mergeState.needsIndexSort) {
       // TODO: can we gain back some optos even if index is sorted?  E.g. if sort results in large chunks of contiguous docs from one sub
       // being copied over...?
       super.merge(mergeState);

diff --git a/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java b/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java
index adadc4036a5..d93c7715e93 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java
@@ -62,7 +62,7 @@ final class MappingMultiPostingsEnum extends PostingsEnum {
     for(int i=0;i<allSubs.length;i++) {
       allSubs[i] = new MappingPostingsSub(mergeState.docMaps[i]);
     }
-    this.docIDMerger = new DocIDMerger<MappingPostingsSub>(subs, allSubs.length, mergeState.segmentInfo.getIndexSort() != null);
+    this.docIDMerger = new DocIDMerger<MappingPostingsSub>(subs, allSubs.length, mergeState.needsIndexSort);
   }
 
   MappingMultiPostingsEnum reset(MultiPostingsEnum postingsEnum) throws IOException {
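The compressing stored-fields/term-vectors writers and the points writer have the opposite concern: they own a bulk-copy fast path that only works when documents keep their relative order, so they bail out to the generic superclass merge exactly when reordering is required. Under the old check, a configured index sort always forced the slow path; with the new flag, already-sorted merges keep the bulk copy. A sketch of that gating pattern (the types here are stand-ins, not the Lucene signatures):

    // Gating pattern shared by the three writers above: fall back to the
    // generic per-document path only when docs must actually be reordered.
    abstract class ToyBulkCopyWriter {
      static final class ToyMergeState { boolean needsIndexSort; }

      int merge(ToyMergeState mergeState) {
        if (mergeState.needsIndexSort) {
          // Docs from different subs interleave: per-document merge required.
          return genericMerge(mergeState);
        }
        // Docs stay in concatenation order: raw compressed chunks can be copied.
        return bulkCopyMerge(mergeState);
      }

      abstract int genericMerge(ToyMergeState mergeState);
      abstract int bulkCopyMerge(ToyMergeState mergeState);
    }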
diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeState.java b/lucene/core/src/java/org/apache/lucene/index/MergeState.java
index fcaad517d16..fdedf3e93c9 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MergeState.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MergeState.java
@@ -42,7 +42,7 @@ public class MergeState {
   /** Maps document IDs from old segments to document IDs in the new segment */
   public final DocMap[] docMaps;
 
-  // Only used by IW when it must remap deletes that arrived against the merging segmetns while a merge was running:
+  // Only used by IW when it must remap deletes that arrived against the merging segments while a merge was running:
   final DocMap[] leafDocMaps;
 
   /** {@link SegmentInfo} of the newly merged segment. */
@@ -81,6 +81,9 @@ public class MergeState {
   /** InfoStream for debugging messages. */
   public final InfoStream infoStream;
 
+  /** Indicates if the index needs to be sorted **/
+  public boolean needsIndexSort;
+
   /** Sole constructor. */
   MergeState(List<CodecReader> originalReaders, SegmentInfo segmentInfo, InfoStream infoStream) throws IOException {
 
@@ -143,50 +146,58 @@ public class MergeState {
     this.docMaps = buildDocMaps(readers, indexSort);
   }
 
-  private DocMap[] buildDocMaps(List<CodecReader> readers, Sort indexSort) throws IOException {
+  // Remap docIDs around deletions
+  private DocMap[] buildDeletionDocMaps(List<CodecReader> readers) {
+    int totalDocs = 0;
     int numReaders = readers.size();
+    DocMap[] docMaps = new DocMap[numReaders];
+
+    for (int i = 0; i < numReaders; i++) {
+      LeafReader reader = readers.get(i);
+      Bits liveDocs = reader.getLiveDocs();
+
+      final PackedLongValues delDocMap;
+      if (liveDocs != null) {
+        delDocMap = removeDeletes(reader.maxDoc(), liveDocs);
+      } else {
+        delDocMap = null;
+      }
+
+      final int docBase = totalDocs;
+      docMaps[i] = new DocMap() {
+        @Override
+        public int get(int docID) {
+          if (liveDocs == null) {
+            return docBase + docID;
+          } else if (liveDocs.get(docID)) {
+            return docBase + (int) delDocMap.get(docID);
+          } else {
+            return -1;
+          }
+        }
+      };
+      totalDocs += reader.numDocs();
+    }
+
+    return docMaps;
+  }
+
+  private DocMap[] buildDocMaps(List<CodecReader> readers, Sort indexSort) throws IOException {
 
     if (indexSort == null) {
       // no index sort ... we only must map around deletions, and rebase to the merged segment's docID space
-
-      int totalDocs = 0;
-      DocMap[] docMaps = new DocMap[numReaders];
-
-      // Remap docIDs around deletions:
-      for (int i = 0; i < numReaders; i++) {
-        LeafReader reader = readers.get(i);
-        Bits liveDocs = reader.getLiveDocs();
-
-        final PackedLongValues delDocMap;
-        if (liveDocs != null) {
-          delDocMap = removeDeletes(reader.maxDoc(), liveDocs);
-        } else {
-          delDocMap = null;
-        }
-
-        final int docBase = totalDocs;
-        docMaps[i] = new DocMap() {
-          @Override
-          public int get(int docID) {
-            if (liveDocs == null) {
-              return docBase + docID;
-            } else if (liveDocs.get(docID)) {
-              return docBase + (int) delDocMap.get(docID);
-            } else {
-              return -1;
-            }
-          }
-        };
-        totalDocs += reader.numDocs();
-      }
-
-      return docMaps;
-
+      return buildDeletionDocMaps(readers);
     } else {
       // do a merge sort of the incoming leaves:
       long t0 = System.nanoTime();
       DocMap[] result = MultiSorter.sort(indexSort, readers);
+      if (result == null) {
+        // already sorted so we can switch back to map around deletions
+        return buildDeletionDocMaps(readers);
+      } else {
+        needsIndexSort = true;
+      }
       long t1 = System.nanoTime();
       if (infoStream.isEnabled("SM")) {
         infoStream.message("SM", String.format(Locale.ROOT, "%.2f msec to build merge sorted DocMaps", (t1-t0)/1000000.0));
@@ -233,6 +244,7 @@ public class MergeState {
       if (infoStream.isEnabled("SM")) {
         infoStream.message("SM", String.format(Locale.ROOT, "segment %s is not sorted; wrapping for sort %s now (%.2f msec to sort)", leaf, indexSort, msec));
       }
+      needsIndexSort = true;
       leaf = SlowCodecReaderWrapper.wrap(SortingLeafReader.wrap(new MergeReaderWrapper(leaf), sortDocMap));
       leafDocMaps[readers.size()] = new DocMap() {
         @Override
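Factoring the deletion-only remapping out into buildDeletionDocMaps is what lets the already-sorted case reuse it: when MultiSorter.sort returns null, buildDocMaps falls back to plain rebase-around-deletions maps and needsIndexSort stays false. A standalone sketch of that deletion map, assuming a plain int[] in place of Lucene's PackedLongValues (the class and field names are invented for illustration):

    import java.util.BitSet;

    // Per-segment docID map around deletions: old docID -> merged docID,
    // rebased by the docBase of all segments appended before this one.
    final class ToyDeletionDocMap {
      private final int docBase;     // first merged docID owned by this segment
      private final int[] newDocIDs; // old docID -> segment-local new docID, -1 = deleted

      ToyDeletionDocMap(int docBase, BitSet liveDocs, int maxDoc) {
        this.docBase = docBase;
        this.newDocIDs = new int[maxDoc];
        int newDocID = 0;
        for (int docID = 0; docID < maxDoc; docID++) {
          newDocIDs[docID] = liveDocs.get(docID) ? newDocID++ : -1;
        }
      }

      // Merged-segment docID, or -1 if the document was deleted.
      int get(int docID) {
        int mapped = newDocIDs[docID];
        return mapped == -1 ? -1 : docBase + mapped;
      }
    }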
The documents for each incoming leaf reader must already be sorted by the same sort! */ + * documents into the merged segment. The documents for each incoming leaf reader must already be sorted by the same sort! + * Returns null if the merge sort is not needed (segments are already in index sort order). + **/ static MergeState.DocMap[] sort(Sort sort, List readers) throws IOException { // TODO: optimize if only 1 reader is incoming, though that's a rare case @@ -80,8 +82,15 @@ final class MultiSorter { // merge sort: int mappedDocID = 0; + int lastReaderIndex = 0; + boolean isSorted = true; while (queue.size() != 0) { LeafAndDocID top = queue.top(); + if (lastReaderIndex > top.readerIndex) { + // merge sort is needed + isSorted = false; + } + lastReaderIndex = top.readerIndex; builders[top.readerIndex].add(mappedDocID); if (top.liveDocs == null || top.liveDocs.get(top.docID)) { mappedDocID++; @@ -97,6 +106,9 @@ final class MultiSorter { queue.pop(); } } + if (isSorted) { + return null; + } MergeState.DocMap[] docMaps = new MergeState.DocMap[leafCount]; for(int i=0;i defaultValueConsumer, Consumer randomValueConsumer) throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + AssertingNeedsIndexSortCodec codec = new AssertingNeedsIndexSortCodec(); + iwc.setCodec(codec); + Sort indexSort = new Sort(sortField, + new SortField("id", SortField.Type.INT)); + iwc.setIndexSort(indexSort); + iwc.setMergePolicy(newLogMergePolicy()); + + // add already sorted documents + codec.numCalls = 0; + codec.needsIndexSort = false; + IndexWriter w = new IndexWriter(dir, iwc); + boolean withValues = random().nextBoolean(); + for (int i = 100; i < 200; i++) { + Document doc = new Document(); + doc.add(new StringField("id", Integer.toString(i), Store.YES)); + doc.add(new NumericDocValuesField("id", i)); + doc.add(new IntPoint("point", random().nextInt())); + if (withValues) { + defaultValueConsumer.accept(doc); + } + w.addDocument(doc); + if (i % 10 == 0) { + w.commit(); + } + } + Set deletedDocs = new HashSet<> (); + int num = random().nextInt(20); + for (int i = 0; i < num; i++) { + int nextDoc = random().nextInt(100); + w.deleteDocuments(new Term("id", Integer.toString(nextDoc))); + deletedDocs.add(nextDoc); + } + w.commit(); + w.waitForMerges(); + w.forceMerge(1); + assertTrue(codec.numCalls > 0); + + + // merge sort is needed + codec.numCalls = 0; + codec.needsIndexSort = true; + for (int i = 10; i >= 0; i--) { + Document doc = new Document(); + doc.add(new StringField("id", Integer.toString(i), Store.YES)); + doc.add(new NumericDocValuesField("id", i)); + doc.add(new IntPoint("point", random().nextInt())); + if (withValues) { + defaultValueConsumer.accept(doc); + } + w.addDocument(doc); + w.commit(); + } + w.commit(); + w.waitForMerges(); + w.forceMerge(1); + assertTrue(codec.numCalls > 0); + + // segment sort is needed + codec.needsIndexSort = true; + codec.numCalls = 0; + for (int i = 200; i < 300; i++) { + Document doc = new Document(); + doc.add(new StringField("id", Integer.toString(i), Store.YES)); + doc.add(new NumericDocValuesField("id", i)); + doc.add(new IntPoint("point", random().nextInt())); + randomValueConsumer.accept(doc); + w.addDocument(doc); + if (i % 10 == 0) { + w.commit(); + } + } + w.commit(); + w.waitForMerges(); + w.forceMerge(1); + assertTrue(codec.numCalls > 0); + + w.close(); + dir.close(); + } + + public void testNumericAlreadySorted() throws Exception { + assertNeedsIndexSortMerge(new SortField("foo", 
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
+  private void assertNeedsIndexSortMerge(SortField sortField, Consumer<Document> defaultValueConsumer, Consumer<Document> randomValueConsumer) throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
+    AssertingNeedsIndexSortCodec codec = new AssertingNeedsIndexSortCodec();
+    iwc.setCodec(codec);
+    Sort indexSort = new Sort(sortField,
+                              new SortField("id", SortField.Type.INT));
+    iwc.setIndexSort(indexSort);
+    iwc.setMergePolicy(newLogMergePolicy());
+
+    // add already sorted documents
+    codec.numCalls = 0;
+    codec.needsIndexSort = false;
+    IndexWriter w = new IndexWriter(dir, iwc);
+    boolean withValues = random().nextBoolean();
+    for (int i = 100; i < 200; i++) {
+      Document doc = new Document();
+      doc.add(new StringField("id", Integer.toString(i), Store.YES));
+      doc.add(new NumericDocValuesField("id", i));
+      doc.add(new IntPoint("point", random().nextInt()));
+      if (withValues) {
+        defaultValueConsumer.accept(doc);
+      }
+      w.addDocument(doc);
+      if (i % 10 == 0) {
+        w.commit();
+      }
+    }
+    Set<Integer> deletedDocs = new HashSet<>();
+    int num = random().nextInt(20);
+    for (int i = 0; i < num; i++) {
+      int nextDoc = random().nextInt(100);
+      w.deleteDocuments(new Term("id", Integer.toString(nextDoc)));
+      deletedDocs.add(nextDoc);
+    }
+    w.commit();
+    w.waitForMerges();
+    w.forceMerge(1);
+    assertTrue(codec.numCalls > 0);
+
+    // merge sort is needed
+    codec.numCalls = 0;
+    codec.needsIndexSort = true;
+    for (int i = 10; i >= 0; i--) {
+      Document doc = new Document();
+      doc.add(new StringField("id", Integer.toString(i), Store.YES));
+      doc.add(new NumericDocValuesField("id", i));
+      doc.add(new IntPoint("point", random().nextInt()));
+      if (withValues) {
+        defaultValueConsumer.accept(doc);
+      }
+      w.addDocument(doc);
+      w.commit();
+    }
+    w.commit();
+    w.waitForMerges();
+    w.forceMerge(1);
+    assertTrue(codec.numCalls > 0);
+
+    // segment sort is needed
+    codec.needsIndexSort = true;
+    codec.numCalls = 0;
+    for (int i = 200; i < 300; i++) {
+      Document doc = new Document();
+      doc.add(new StringField("id", Integer.toString(i), Store.YES));
+      doc.add(new NumericDocValuesField("id", i));
+      doc.add(new IntPoint("point", random().nextInt()));
+      randomValueConsumer.accept(doc);
+      w.addDocument(doc);
+      if (i % 10 == 0) {
+        w.commit();
+      }
+    }
+    w.commit();
+    w.waitForMerges();
+    w.forceMerge(1);
+    assertTrue(codec.numCalls > 0);
+
+    w.close();
+    dir.close();
+  }
+
+  public void testNumericAlreadySorted() throws Exception {
+    assertNeedsIndexSortMerge(new SortField("foo", SortField.Type.INT),
+        (doc) -> doc.add(new NumericDocValuesField("foo", 0)),
+        (doc) -> doc.add(new NumericDocValuesField("foo", random().nextInt())));
+  }
+
+  public void testStringAlreadySorted() throws Exception {
+    assertNeedsIndexSortMerge(new SortField("foo", SortField.Type.STRING),
+        (doc) -> doc.add(new SortedDocValuesField("foo", new BytesRef("default"))),
+        (doc) -> doc.add(new SortedDocValuesField("foo", TestUtil.randomBinaryTerm(random()))));
+  }
+
+  public void testMultiValuedNumericAlreadySorted() throws Exception {
+    assertNeedsIndexSortMerge(new SortedNumericSortField("foo", SortField.Type.INT),
+        (doc) -> {
+          doc.add(new SortedNumericDocValuesField("foo", Integer.MIN_VALUE));
+          int num = random().nextInt(5);
+          for (int j = 0; j < num; j++) {
+            doc.add(new SortedNumericDocValuesField("foo", random().nextInt()));
+          }
+        },
+        (doc) -> {
+          int num = random().nextInt(5);
+          for (int j = 0; j < num; j++) {
+            doc.add(new SortedNumericDocValuesField("foo", random().nextInt()));
+          }
+        });
+  }
+
+  public void testMultiValuedStringAlreadySorted() throws Exception {
+    assertNeedsIndexSortMerge(new SortedSetSortField("foo", false),
+        (doc) -> {
+          doc.add(new SortedSetDocValuesField("foo", new BytesRef("")));
+          int num = random().nextInt(5);
+          for (int j = 0; j < num; j++) {
+            doc.add(new SortedSetDocValuesField("foo", TestUtil.randomBinaryTerm(random())));
+          }
+        },
+        (doc) -> {
+          int num = random().nextInt(5);
+          for (int j = 0; j < num; j++) {
+            doc.add(new SortedSetDocValuesField("foo", TestUtil.randomBinaryTerm(random())));
+          }
+        });
+  }
 
   public void testBasicString() throws Exception {
     Directory dir = newDirectory();