LUCENE-8757: Improving Default Segments To Thread Mapping Algorithm

The current slicing algorithm assigns a thread per segment, which
can be detrimental to performance in case the distribution has
a large number of small segments. The patch introduces a slicing
algorithm which coalesces smaller segments to a single thread,
thus reducing the impact of context switching by limiting the
number of threads

Signed-off-by: Adrien Grand <jpountz@gmail.com>
This commit is contained in:
Atri Sharma 2019-04-22 19:44:52 +05:30 committed by Adrien Grand
parent 5a694ea26f
commit 87e936f1bb
5 changed files with 346 additions and 8 deletions

View File

@ -56,6 +56,10 @@ Improvements
* LUCENE-8770: BlockMaxConjunctionScorer now leverages two-phase iterators in order to avoid * LUCENE-8770: BlockMaxConjunctionScorer now leverages two-phase iterators in order to avoid
executing the second phase when scorers don't intersect. (Adrien Grand, Jim Ferenczi) executing the second phase when scorers don't intersect. (Adrien Grand, Jim Ferenczi)
* LUCENE-8757: When provided with an ExecutorService to run queries across
multiple threads, IndexSearcher now groups small segments together, up to
250k docs per slice. (Atri Sharma via Adrien Grand)
======================= Lucene 8.1.1 ======================= ======================= Lucene 8.1.1 =======================
(No Changes) (No Changes)

View File

@ -22,6 +22,7 @@ import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collection; import java.util.Collection;
import java.util.Collections; import java.util.Collections;
import java.util.Comparator;
import java.util.List; import java.util.List;
import java.util.Objects; import java.util.Objects;
import java.util.Set; import java.util.Set;
@ -100,6 +101,13 @@ public class IndexSearcher {
*/ */
private static final int TOTAL_HITS_THRESHOLD = 1000; private static final int TOTAL_HITS_THRESHOLD = 1000;
/**
* Thresholds for index slice allocation logic. To change the default, extend
* <code> IndexSearcher</code> and use custom values
*/
private static final int MAX_DOCS_PER_SLICE = 250_000;
private static final int MAX_SEGMENTS_PER_SLICE = 5;
final IndexReader reader; // package private for testing! final IndexReader reader; // package private for testing!
// NOTE: these members might change in incompatible ways // NOTE: these members might change in incompatible ways
@ -268,14 +276,57 @@ public class IndexSearcher {
/** /**
* Expert: Creates an array of leaf slices each holding a subset of the given leaves. * Expert: Creates an array of leaf slices each holding a subset of the given leaves.
* Each {@link LeafSlice} is executed in a single thread. By default there * Each {@link LeafSlice} is executed in a single thread. By default, segments with more than
* will be one {@link LeafSlice} per leaf ({@link org.apache.lucene.index.LeafReaderContext}). * MAX_DOCS_PER_SLICE will get their own thread
*/ */
protected LeafSlice[] slices(List<LeafReaderContext> leaves) { protected LeafSlice[] slices(List<LeafReaderContext> leaves) {
LeafSlice[] slices = new LeafSlice[leaves.size()]; return slices(leaves, MAX_DOCS_PER_SLICE, MAX_SEGMENTS_PER_SLICE);
for (int i = 0; i < slices.length; i++) {
slices[i] = new LeafSlice(leaves.get(i));
} }
/**
* Static method to segregate LeafReaderContexts amongst multiple slices
*/
public static LeafSlice[] slices (List<LeafReaderContext> leaves, int maxDocsPerSlice,
int maxSegmentsPerSlice) {
// Make a copy so we can sort:
List<LeafReaderContext> sortedLeaves = new ArrayList<>(leaves);
// Sort by maxDoc, descending:
Collections.sort(sortedLeaves,
Collections.reverseOrder(Comparator.comparingInt(l -> l.reader().maxDoc())));
final List<List<LeafReaderContext>> groupedLeaves = new ArrayList<>();
long docSum = 0;
List<LeafReaderContext> group = null;
for (LeafReaderContext ctx : sortedLeaves) {
if (ctx.reader().maxDoc() > maxDocsPerSlice) {
assert group == null;
groupedLeaves.add(Collections.singletonList(ctx));
} else {
if (group == null) {
group = new ArrayList<>();
group.add(ctx);
groupedLeaves.add(group);
} else {
group.add(ctx);
}
docSum += ctx.reader().maxDoc();
if (group.size() >= maxSegmentsPerSlice || docSum > maxDocsPerSlice) {
group = null;
docSum = 0;
}
}
}
LeafSlice[] slices = new LeafSlice[groupedLeaves.size()];
int upto = 0;
for (List<LeafReaderContext> currentLeaf : groupedLeaves) {
slices[upto] = new LeafSlice(currentLeaf);
++upto;
}
return slices; return slices;
} }
@ -743,8 +794,9 @@ public class IndexSearcher {
* @lucene.experimental */ * @lucene.experimental */
public final LeafReaderContext[] leaves; public final LeafReaderContext[] leaves;
public LeafSlice(LeafReaderContext... leaves) { public LeafSlice(List<LeafReaderContext> leavesList) {
this.leaves = leaves; Collections.sort(leavesList, Comparator.comparingInt(l -> l.docBase));
this.leaves = leavesList.toArray(new LeafReaderContext[0]);
} }
} }

View File

@ -0,0 +1,268 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.NamedThreadFactory;
import org.apache.lucene.util.Version;
public class TestSegmentToThreadMapping extends LuceneTestCase {
public LeafReader dummyIndexReader(final int maxDoc) {
return new LeafReader() {
@Override
public int maxDoc() {
return maxDoc;
}
@Override
public int numDocs() {
return maxDoc;
}
@Override
public FieldInfos getFieldInfos() {
return FieldInfos.EMPTY;
}
@Override
public Bits getLiveDocs() {
return null;
}
@Override
public Terms terms(String field) throws IOException {
return null;
}
@Override
public Fields getTermVectors(int doc) {
return null;
}
@Override
public NumericDocValues getNumericDocValues(String field) {
return null;
}
@Override
public BinaryDocValues getBinaryDocValues(String field) {
return null;
}
@Override
public SortedDocValues getSortedDocValues(String field) {
return null;
}
@Override
public SortedNumericDocValues getSortedNumericDocValues(String field) {
return null;
}
@Override
public SortedSetDocValues getSortedSetDocValues(String field) {
return null;
}
@Override
public NumericDocValues getNormValues(String field) {
return null;
}
@Override
public PointValues getPointValues(String field) {
return null;
}
@Override
protected void doClose() {
}
@Override
public void document(int doc, StoredFieldVisitor visitor) {
}
@Override
public void checkIntegrity() throws IOException {
}
@Override
public LeafMetaData getMetaData() {
return new LeafMetaData(Version.LATEST.major, Version.LATEST, null);
}
@Override
public CacheHelper getCoreCacheHelper() {
return null;
}
@Override
public CacheHelper getReaderCacheHelper() {
return null;
}
};
}
public void testSingleSlice() {
LeafReader largeSegmentReader = dummyIndexReader(50_000);
LeafReader firstMediumSegmentReader = dummyIndexReader(30_000);
LeafReader secondMediumSegmentReader = dummyIndexReader(30__000);
LeafReader thirdMediumSegmentReader = dummyIndexReader(30_000);
List<LeafReaderContext> leafReaderContexts = new ArrayList<>();
leafReaderContexts.add(new LeafReaderContext(largeSegmentReader));
leafReaderContexts.add(new LeafReaderContext(firstMediumSegmentReader));
leafReaderContexts.add(new LeafReaderContext(secondMediumSegmentReader));
leafReaderContexts.add(new LeafReaderContext(thirdMediumSegmentReader));
IndexSearcher.LeafSlice[] resultSlices = IndexSearcher.slices(leafReaderContexts, 250_000, 5);
assertTrue(resultSlices.length == 1);
final LeafReaderContext[] leaves = resultSlices[0].leaves;
assertTrue(leaves.length == 4);
}
public void testSmallSegments() {
LeafReader firstMediumSegmentReader = dummyIndexReader(10_000);
LeafReader secondMediumSegmentReader = dummyIndexReader(10_000);
LeafReader thirdMediumSegmentReader = dummyIndexReader(10_000);
LeafReader fourthMediumSegmentReader = dummyIndexReader(10_000);
LeafReader fifthMediumSegmentReader = dummyIndexReader(10_000);
LeafReader sixthMediumSegmentReader = dummyIndexReader(10_000);
LeafReader seventhLargeSegmentReader = dummyIndexReader(130_000);
LeafReader eigthLargeSegmentReader = dummyIndexReader(130_000);
List<LeafReaderContext> leafReaderContexts = new ArrayList<>();
leafReaderContexts.add(new LeafReaderContext(firstMediumSegmentReader));
leafReaderContexts.add(new LeafReaderContext(secondMediumSegmentReader));
leafReaderContexts.add(new LeafReaderContext(thirdMediumSegmentReader));
leafReaderContexts.add(new LeafReaderContext(fourthMediumSegmentReader));
leafReaderContexts.add(new LeafReaderContext(fifthMediumSegmentReader));
leafReaderContexts.add(new LeafReaderContext(sixthMediumSegmentReader));
leafReaderContexts.add(new LeafReaderContext(seventhLargeSegmentReader));
leafReaderContexts.add(new LeafReaderContext(eigthLargeSegmentReader));
IndexSearcher.LeafSlice[] resultSlices = IndexSearcher.slices(leafReaderContexts, 250_000, 5);
assertTrue(resultSlices.length == 3);
final LeafReaderContext[] firstSliceleaves = resultSlices[0].leaves;
final LeafReaderContext[] secondSliceleaves = resultSlices[1].leaves;
final LeafReaderContext[] thirdSliceleaves = resultSlices[2].leaves;
assertTrue(firstSliceleaves.length == 2);
assertTrue(secondSliceleaves.length == 5);
assertTrue(thirdSliceleaves.length == 1);
}
public void testLargeSlices() {
LeafReader largeSegmentReader = dummyIndexReader(290_900);
LeafReader firstMediumSegmentReader = dummyIndexReader(170_000);
LeafReader secondMediumSegmentReader = dummyIndexReader(170_000);
LeafReader thirdMediumSegmentReader = dummyIndexReader(170_000);
List<LeafReaderContext> leafReaderContexts = new ArrayList<>();
leafReaderContexts.add(new LeafReaderContext(largeSegmentReader));
leafReaderContexts.add(new LeafReaderContext(firstMediumSegmentReader));
leafReaderContexts.add(new LeafReaderContext(secondMediumSegmentReader));
leafReaderContexts.add(new LeafReaderContext(thirdMediumSegmentReader));
IndexSearcher.LeafSlice[] resultSlices = IndexSearcher.slices(leafReaderContexts, 250_000, 5);
assertTrue(resultSlices.length == 3);
final LeafReaderContext[] firstSliceleaves = resultSlices[0].leaves;
final LeafReaderContext[] secondSliceleaves = resultSlices[1].leaves;
final LeafReaderContext[] thirdSliceleaves = resultSlices[2].leaves;
assertTrue(firstSliceleaves.length == 1);
assertTrue(secondSliceleaves.length == 2);
assertTrue(thirdSliceleaves.length == 1);
}
public void testIntraSliceDocIDOrder() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
w.addDocument(new Document());
w.addDocument(new Document());
w.commit();
w.addDocument(new Document());
w.addDocument(new Document());
w.commit();
IndexReader r = w.getReader();
w.close();
ExecutorService service = new ThreadPoolExecutor(4, 4, 0L, TimeUnit.MILLISECONDS,
new LinkedBlockingQueue<Runnable>(),
new NamedThreadFactory("TestSegmentToThreadMapping"));
IndexSearcher s = new IndexSearcher(r, service);
Query query = new MatchAllDocsQuery();
s.search(query, Integer.MAX_VALUE);
IndexSearcher.LeafSlice[] slices = s.getSlices();
assertNotNull(slices);
for (IndexSearcher.LeafSlice leafSlice : slices) {
LeafReaderContext[] leafReaderContexts = leafSlice.leaves;
int previousDocBase = leafReaderContexts[0].docBase;
for (LeafReaderContext leafReaderContext : leafReaderContexts) {
assertTrue(previousDocBase <= leafReaderContext.docBase);
previousDocBase = leafReaderContext.docBase;
}
}
service.shutdown();
IOUtils.close(r, dir);
}
public void testRandom() {
List<LeafReaderContext> leafReaderContexts = new ArrayList<>();
int max = 500_000;
int min = 10_000;
int numSegments = random().nextInt(50);
for (int i = 0; i < numSegments; i++) {
leafReaderContexts.add(new LeafReaderContext(dummyIndexReader(random().nextInt((max - min) + 1) + min)));
}
IndexSearcher.LeafSlice[] resultSlices = IndexSearcher.slices(leafReaderContexts, 250_000, 5);
assertTrue(resultSlices.length > 0);
}
}

View File

@ -26,6 +26,7 @@ import org.apache.lucene.index.LeafReaderContext;
class AssertingCollector extends FilterCollector { class AssertingCollector extends FilterCollector {
private int maxDoc = -1; private int maxDoc = -1;
private int previousLeafMaxDoc = 0;
/** Wrap the given collector in order to add assertions. */ /** Wrap the given collector in order to add assertions. */
public static Collector wrap(Collector in) { public static Collector wrap(Collector in) {
@ -41,6 +42,9 @@ class AssertingCollector extends FilterCollector {
@Override @Override
public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException { public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
assert context.docBase >= previousLeafMaxDoc;
previousLeafMaxDoc = context.docBase + context.reader().maxDoc();
final LeafCollector in = super.getLeafCollector(context); final LeafCollector in = super.getLeafCollector(context);
final int docBase = context.docBase; final int docBase = context.docBase;
return new AssertingLeafCollector(in, 0, DocIdSetIterator.NO_MORE_DOCS) { return new AssertingLeafCollector(in, 0, DocIdSetIterator.NO_MORE_DOCS) {
@ -50,6 +54,7 @@ class AssertingCollector extends FilterCollector {
// not only per segment // not only per segment
assert docBase + doc >= maxDoc : "collection is not in order: current doc=" assert docBase + doc >= maxDoc : "collection is not in order: current doc="
+ (docBase + doc) + " while " + maxDoc + " has already been collected"; + (docBase + doc) + " while " + maxDoc + " has already been collected";
super.collect(doc); super.collect(doc);
maxDoc = docBase + doc; maxDoc = docBase + doc;
} }

View File

@ -1932,6 +1932,15 @@ public abstract class LuceneTestCase extends Assert {
ret = random.nextBoolean() ret = random.nextBoolean()
? new AssertingIndexSearcher(random, r, ex) ? new AssertingIndexSearcher(random, r, ex)
: new AssertingIndexSearcher(random, r.getContext(), ex); : new AssertingIndexSearcher(random, r.getContext(), ex);
} else if (random.nextBoolean()) {
int maxDocPerSlice = 1 + random.nextInt(100000);
int maxSegmentsPerSlice = 1 + random.nextInt(20);
ret = new IndexSearcher(r, ex) {
@Override
protected LeafSlice[] slices(List<LeafReaderContext> leaves) {
return slices(leaves, maxDocPerSlice, maxSegmentsPerSlice);
}
};
} else { } else {
ret = random.nextBoolean() ret = random.nextBoolean()
? new IndexSearcher(r, ex) ? new IndexSearcher(r, ex)