Tune the amount of memory that is allocated to sorting postings upon flushing. (#12011)

When flushing segments that have an index sort configured, postings lists get
loaded into arrays and get reordered according to the index sort.

This reordering is implemented with `TimSorter`, a variant of merge sort. Like
merge sort, an important part of `TimSorter` consists of merging two contiguous
sorted slices of the array into a combined sorted slice. This merging can be
done either with external memory, which is the classical approach, or in-place,
which still runs in linear time but with a much higher factor. Until now we
were allocating a fixed budget of `maxDoc/64` for doing these merges with
external memory. If this is not enough, sorted slices would be merged in place.

I've been looking at some profiles recently for an index where a non-negligible
chunk of the time was spent on in-place merges. So I would like to propose the
following change:
 - Increase the maximum RAM budget to `maxDoc / 8`. This should help avoid
   in-place merges for all postings up to `docFreq = maxDoc / 4`.
 - Make this RAM budget lazily allocated, rather than eagerly like today. This
   would help not allocate memory in O(maxDoc) for fields like primary keys
   that only have a couple postings per term.

So overall memory usage would never be more than 50% higher than what it is
today: `TimSorter` only ever needs X temporary slots once the postings list
holds at least 2*X entries, and those 2*X entries already get loaded into
memory today. And for fields that have short postings, memory usage should
actually be lower.
This commit is contained in:
Adrien Grand 2022-12-27 11:11:18 +01:00 committed by GitHub
parent f5ea0412eb
commit ddd63d2da3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 24 additions and 8 deletions

View File

@ -250,8 +250,12 @@ Optimizations
* GITHUB#12006: Do ints compare instead of ArrayUtil#compareUnsigned4 in LatlonPointQueries. (Guo Feng)
* GITHUB#12011: Minor speedup to flushing long postings lists when an index
sort is configured. (Adrien Grand)
* GITHUB#12017: Aggressive count in BooleanWeight. (Lu Xugang)
Other
---------------------

View File

@ -34,6 +34,8 @@ import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IntBlockPool;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LongsRef;
import org.apache.lucene.util.TimSorter;
import org.apache.lucene.util.automaton.CompiledAutomaton;
@ -228,12 +230,12 @@ final class FreqProxTermsWriter extends TermsHash {
private int[] docs;
private int[] freqs;
private final int[] tmpDocs;
private int[] tmpDocs;
private int[] tmpFreqs;
DocFreqSorter(int maxDoc) {
super(maxDoc / 64);
this.tmpDocs = new int[maxDoc / 64];
super(maxDoc / 8);
this.tmpDocs = IntsRef.EMPTY_INTS;
}
public void reset(int[] docs, int[] freqs) {
@ -272,6 +274,12 @@ final class FreqProxTermsWriter extends TermsHash {
@Override
protected void save(int i, int len) {
if (tmpDocs.length < len) {
tmpDocs = new int[ArrayUtil.oversize(len, Integer.BYTES)];
if (freqs != null) {
tmpFreqs = new int[tmpDocs.length];
}
}
System.arraycopy(docs, i, tmpDocs, 0, len);
if (freqs != null) {
System.arraycopy(freqs, i, tmpFreqs, 0, len);
@ -423,13 +431,13 @@ final class FreqProxTermsWriter extends TermsHash {
private int[] docs;
private long[] offsets;
private final int[] tmpDocs;
private final long[] tmpOffsets;
private int[] tmpDocs;
private long[] tmpOffsets;
public DocOffsetSorter(int maxDoc) {
super(maxDoc / 64);
this.tmpDocs = new int[maxDoc / 64];
this.tmpOffsets = new long[maxDoc / 64];
super(maxDoc / 8);
this.tmpDocs = IntsRef.EMPTY_INTS;
this.tmpOffsets = LongsRef.EMPTY_LONGS;
}
public void reset(int[] docs, long[] offsets) {
@ -461,6 +469,10 @@ final class FreqProxTermsWriter extends TermsHash {
@Override
protected void save(int i, int len) {
// Grow the temporary buffers lazily, only when a merge actually needs them;
// oversize so repeated merges of growing slices don't reallocate every time.
if (tmpDocs.length < len) {
tmpDocs = new int[ArrayUtil.oversize(len, Integer.BYTES)];
// Keep the offsets buffer the same length as the docs buffer so both can
// hold the saved slice.
tmpOffsets = new long[tmpDocs.length];
}
// Stash the slice [i, i+len) of docs and their parallel offsets so the merge
// can overwrite the original range.
System.arraycopy(docs, i, tmpDocs, 0, len);
System.arraycopy(offsets, i, tmpOffsets, 0, len);
}