Optimize flush of doc-value fields that are effectively single-valued when an index sort is configured. (#12037)

This iterates on #399 to also optimize the case when an index sort is
configured. When cutting over the NYC taxis benchmark to the new numeric
fields,
[flush times](http://people.apache.org/~mikemccand/lucenebench/sparseResults.html#flush_times)
stayed mostly the same when index sorting was disabled but increased by 7-8%
when index sorting was enabled. I expect this change to address that slowdown.
This commit is contained in:
Adrien Grand 2022-12-27 11:12:56 +01:00 committed by GitHub
parent ddd63d2da3
commit 6f477e5831
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 114 additions and 79 deletions

View File

@ -20,6 +20,7 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import java.io.IOException;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.Counter;
@ -99,30 +100,38 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
if (finalValues == null) {
finalValues = pending.build();
}
dvConsumer.addNumericField(
fieldInfo, getDocValuesProducer(fieldInfo, finalValues, docsWithField, sortMap));
}
static DocValuesProducer getDocValuesProducer(
FieldInfo writerFieldInfo,
PackedLongValues values,
DocsWithFieldSet docsWithField,
Sorter.DocMap sortMap)
throws IOException {
final NumericDVs sorted;
if (sortMap != null) {
NumericDocValues oldValues =
new BufferedNumericDocValues(finalValues, docsWithField.iterator());
sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap, oldValues);
NumericDocValues oldValues = new BufferedNumericDocValues(values, docsWithField.iterator());
sorted = sortDocValues(sortMap.size(), sortMap, oldValues);
} else {
sorted = null;
}
dvConsumer.addNumericField(
fieldInfo,
new EmptyDocValuesProducer() {
return new EmptyDocValuesProducer() {
@Override
public NumericDocValues getNumeric(FieldInfo fieldInfo) {
if (fieldInfo != NumericDocValuesWriter.this.fieldInfo) {
if (fieldInfo != writerFieldInfo) {
throw new IllegalArgumentException("wrong fieldInfo");
}
if (sorted == null) {
return new BufferedNumericDocValues(finalValues, docsWithField.iterator());
return new BufferedNumericDocValues(values, docsWithField.iterator());
} else {
return new SortingNumericDocValues(sorted);
}
}
});
};
}
// iterates over the values we have in ram

View File

@ -22,6 +22,7 @@ import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
@ -109,24 +110,28 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
bytesUsed = newBytesUsed;
}
@Override
SortedDocValues getDocValues() {
int valueCount = hash.size();
private void finish() {
if (finalSortedValues == null) {
int valueCount = hash.size();
updateBytesUsed();
assert finalOrdMap == null && finalOrds == null;
finalSortedValues = hash.sort();
finalOrds = pending.build();
finalOrdMap = new int[valueCount];
}
for (int ord = 0; ord < valueCount; ord++) {
finalOrdMap[finalSortedValues[ord]] = ord;
}
}
}
@Override
SortedDocValues getDocValues() {
finish();
return new BufferedSortedDocValues(
hash, finalOrds, finalSortedValues, finalOrdMap, docsWithField.iterator());
}
private int[] sortDocValues(int maxDoc, Sorter.DocMap sortMap, SortedDocValues oldValues)
private static int[] sortDocValues(int maxDoc, Sorter.DocMap sortMap, SortedDocValues oldValues)
throws IOException {
int[] ords = new int[maxDoc];
Arrays.fill(ords, -1);
@ -141,45 +146,48 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
@Override
public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer dvConsumer)
throws IOException {
final int valueCount = hash.size();
if (finalOrds == null) {
updateBytesUsed();
finalSortedValues = hash.sort();
finalOrds = pending.build();
finalOrdMap = new int[valueCount];
for (int ord = 0; ord < valueCount; ord++) {
finalOrdMap[finalSortedValues[ord]] = ord;
}
finish();
dvConsumer.addSortedField(
fieldInfo,
getDocValuesProducer(
fieldInfo, hash, finalOrds, finalSortedValues, finalOrdMap, docsWithField, sortMap));
}
static DocValuesProducer getDocValuesProducer(
FieldInfo writerFieldInfo,
BytesRefHash hash,
PackedLongValues ords,
int[] sortedValues,
int[] ordMap,
DocsWithFieldSet docsWithField,
Sorter.DocMap sortMap)
throws IOException {
final int[] sorted;
if (sortMap != null) {
sorted =
sortDocValues(
state.segmentInfo.maxDoc(),
sortMap.size(),
sortMap,
new BufferedSortedDocValues(
hash, finalOrds, finalSortedValues, finalOrdMap, docsWithField.iterator()));
hash, ords, sortedValues, ordMap, docsWithField.iterator()));
} else {
sorted = null;
}
dvConsumer.addSortedField(
fieldInfo,
new EmptyDocValuesProducer() {
return new EmptyDocValuesProducer() {
@Override
public SortedDocValues getSorted(FieldInfo fieldInfoIn) {
if (fieldInfoIn != fieldInfo) {
if (fieldInfoIn != writerFieldInfo) {
throw new IllegalArgumentException("wrong fieldInfo");
}
final SortedDocValues buf =
new BufferedSortedDocValues(
hash, finalOrds, finalSortedValues, finalOrdMap, docsWithField.iterator());
new BufferedSortedDocValues(hash, ords, sortedValues, ordMap, docsWithField.iterator());
if (sorted == null) {
return buf;
}
return new SortingSortedDocValues(buf, sorted);
}
});
};
}
static class BufferedSortedDocValues extends SortedDocValues {

View File

@ -21,6 +21,7 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.NumericDocValuesWriter.BufferedNumericDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.ArrayUtil;
@ -175,6 +176,20 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
valueCounts = finalValuesCount;
}
if (valueCounts == null) {
DocValuesProducer singleValueProducer =
NumericDocValuesWriter.getDocValuesProducer(fieldInfo, values, docsWithField, sortMap);
dvConsumer.addSortedNumericField(
fieldInfo,
new EmptyDocValuesProducer() {
@Override
public SortedNumericDocValues getSortedNumeric(FieldInfo fieldInfo) throws IOException {
return DocValues.singleton(singleValueProducer.getNumeric(fieldInfo));
}
});
return;
}
final LongValues sorted;
if (sortMap != null) {
sorted =

View File

@ -22,6 +22,7 @@ import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.SortedDocValuesWriter.BufferedSortedDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.ArrayUtil;
@ -162,8 +163,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
bytesUsed = newBytesUsed;
}
@Override
SortedSetDocValues getDocValues() {
private void finish() {
if (finalOrds == null) {
assert finalOrdCounts == null && finalSortedValues == null && finalOrdMap == null;
finishCurrentDoc();
@ -172,10 +172,15 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
finalOrdCounts = pendingCounts == null ? null : pendingCounts.build();
finalSortedValues = hash.sort();
finalOrdMap = new int[valueCount];
}
for (int ord = 0; ord < finalOrdMap.length; ord++) {
finalOrdMap[finalSortedValues[ord]] = ord;
}
}
}
@Override
SortedSetDocValues getDocValues() {
finish();
return getValues(
finalSortedValues, finalOrdMap, hash, finalOrds, finalOrdCounts, maxCount, docsWithField);
}
@ -200,27 +205,25 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
@Override
public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer dvConsumer)
throws IOException {
final int valueCount = hash.size();
final PackedLongValues ords;
final PackedLongValues ordCounts;
final int[] sortedValues;
final int[] ordMap;
finish();
final PackedLongValues ords = finalOrds;
final PackedLongValues ordCounts = finalOrdCounts;
final int[] sortedValues = finalSortedValues;
final int[] ordMap = finalOrdMap;
if (finalOrds == null) {
assert finalOrdCounts == null && finalSortedValues == null && finalOrdMap == null;
finishCurrentDoc();
ords = pending.build();
ordCounts = pendingCounts == null ? null : pendingCounts.build();
sortedValues = hash.sort();
ordMap = new int[valueCount];
for (int ord = 0; ord < valueCount; ord++) {
ordMap[sortedValues[ord]] = ord;
if (ordCounts == null) {
DocValuesProducer singleValueProducer =
SortedDocValuesWriter.getDocValuesProducer(
fieldInfo, hash, ords, sortedValues, ordMap, docsWithField, sortMap);
dvConsumer.addSortedSetField(
fieldInfo,
new EmptyDocValuesProducer() {
@Override
public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException {
return DocValues.singleton(singleValueProducer.getSorted(fieldInfo));
}
} else {
ords = finalOrds;
ordCounts = finalOrdCounts;
sortedValues = finalSortedValues;
ordMap = finalOrdMap;
});
return;
}
final DocOrds docOrds;