LUCENE-10189: Optimize flush of doc-value fields that are effectively single-valued. (#399)

This commit is contained in:
Adrien Grand 2021-10-20 19:05:40 +02:00 committed by GitHub
parent 0e1f9fcf31
commit 3a11983de2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 82 additions and 66 deletions

View File

@@ -32,7 +32,7 @@ final class DocsWithFieldSet extends DocIdSet {
RamUsageEstimator.shallowSizeOfInstance(DocsWithFieldSet.class); RamUsageEstimator.shallowSizeOfInstance(DocsWithFieldSet.class);
private FixedBitSet set; private FixedBitSet set;
private int cost = 0; private int cardinality = 0;
private int lastDocId = -1; private int lastDocId = -1;
void add(int docID) { void add(int docID) {
@@ -43,14 +43,14 @@ final class DocsWithFieldSet extends DocIdSet {
if (set != null) { if (set != null) {
set = FixedBitSet.ensureCapacity(set, docID); set = FixedBitSet.ensureCapacity(set, docID);
set.set(docID); set.set(docID);
} else if (docID != cost) { } else if (docID != cardinality) {
// migrate to a sparse encoding using a bit set // migrate to a sparse encoding using a bit set
set = new FixedBitSet(docID + 1); set = new FixedBitSet(docID + 1);
set.set(0, cost); set.set(0, cardinality);
set.set(docID); set.set(docID);
} }
lastDocId = docID; lastDocId = docID;
cost++; cardinality++;
} }
@Override @Override
@@ -60,6 +60,11 @@ final class DocsWithFieldSet extends DocIdSet {
@Override @Override
public DocIdSetIterator iterator() { public DocIdSetIterator iterator() {
return set != null ? new BitSetIterator(set, cost) : DocIdSetIterator.all(cost); return set != null ? new BitSetIterator(set, cardinality) : DocIdSetIterator.all(cardinality);
}
/** Return the number of documents of this set. */
int cardinality() {
return cardinality;
} }
} }

View File

@@ -30,7 +30,7 @@ import org.apache.lucene.util.packed.PackedLongValues;
/** Buffers up pending long per doc, then flushes when segment flushes. */ /** Buffers up pending long per doc, then flushes when segment flushes. */
class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> { class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
private PackedLongValues.Builder pending; private final PackedLongValues.Builder pending;
private PackedLongValues finalValues; private PackedLongValues finalValues;
private final Counter iwBytesUsed; private final Counter iwBytesUsed;
private long bytesUsed; private long bytesUsed;
@@ -126,7 +126,7 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
} }
// iterates over the values we have in ram // iterates over the values we have in ram
private static class BufferedNumericDocValues extends NumericDocValues { static class BufferedNumericDocValues extends NumericDocValues {
final PackedLongValues.Iterator iter; final PackedLongValues.Iterator iter;
final DocIdSetIterator docsWithField; final DocIdSetIterator docsWithField;
private long value; private long value;

View File

@@ -37,8 +37,8 @@ import org.apache.lucene.util.packed.PackedLongValues;
*/ */
class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> { class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
final BytesRefHash hash; final BytesRefHash hash;
private PackedLongValues.Builder pending; private final PackedLongValues.Builder pending;
private DocsWithFieldSet docsWithField; private final DocsWithFieldSet docsWithField;
private final Counter iwBytesUsed; private final Counter iwBytesUsed;
private long bytesUsed; // this currently only tracks differences in 'pending' private long bytesUsed; // this currently only tracks differences in 'pending'
private final FieldInfo fieldInfo; private final FieldInfo fieldInfo;
@@ -123,7 +123,7 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
finalOrdMap[finalSortedValues[ord]] = ord; finalOrdMap[finalSortedValues[ord]] = ord;
} }
return new BufferedSortedDocValues( return new BufferedSortedDocValues(
hash, valueCount, finalOrds, finalSortedValues, finalOrdMap, docsWithField.iterator()); hash, finalOrds, finalSortedValues, finalOrdMap, docsWithField.iterator());
} }
private int[] sortDocValues(int maxDoc, Sorter.DocMap sortMap, SortedDocValues oldValues) private int[] sortDocValues(int maxDoc, Sorter.DocMap sortMap, SortedDocValues oldValues)
@@ -159,12 +159,7 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
state.segmentInfo.maxDoc(), state.segmentInfo.maxDoc(),
sortMap, sortMap,
new BufferedSortedDocValues( new BufferedSortedDocValues(
hash, hash, finalOrds, finalSortedValues, finalOrdMap, docsWithField.iterator()));
valueCount,
finalOrds,
finalSortedValues,
finalOrdMap,
docsWithField.iterator()));
} else { } else {
sorted = null; sorted = null;
} }
@@ -178,12 +173,7 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
} }
final SortedDocValues buf = final SortedDocValues buf =
new BufferedSortedDocValues( new BufferedSortedDocValues(
hash, hash, finalOrds, finalSortedValues, finalOrdMap, docsWithField.iterator());
valueCount,
finalOrds,
finalSortedValues,
finalOrdMap,
docsWithField.iterator());
if (sorted == null) { if (sorted == null) {
return buf; return buf;
} }
@@ -192,25 +182,22 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
}); });
} }
private static class BufferedSortedDocValues extends SortedDocValues { static class BufferedSortedDocValues extends SortedDocValues {
final BytesRefHash hash; final BytesRefHash hash;
final BytesRef scratch = new BytesRef(); final BytesRef scratch = new BytesRef();
final int[] sortedValues; final int[] sortedValues;
final int[] ordMap; final int[] ordMap;
final int valueCount;
private int ord; private int ord;
final PackedLongValues.Iterator iter; final PackedLongValues.Iterator iter;
final DocIdSetIterator docsWithField; final DocIdSetIterator docsWithField;
public BufferedSortedDocValues( public BufferedSortedDocValues(
BytesRefHash hash, BytesRefHash hash,
int valueCount,
PackedLongValues docToOrd, PackedLongValues docToOrd,
int[] sortedValues, int[] sortedValues,
int[] ordMap, int[] ordMap,
DocIdSetIterator docsWithField) { DocIdSetIterator docsWithField) {
this.hash = hash; this.hash = hash;
this.valueCount = valueCount;
this.sortedValues = sortedValues; this.sortedValues = sortedValues;
this.iter = docToOrd.iterator(); this.iter = docToOrd.iterator();
this.ordMap = ordMap; this.ordMap = ordMap;
@@ -262,7 +249,7 @@ class SortedDocValuesWriter extends DocValuesWriter<SortedDocValues> {
@Override @Override
public int getValueCount() { public int getValueCount() {
return valueCount; return hash.size();
} }
} }

View File

@@ -21,6 +21,7 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.index.NumericDocValuesWriter.BufferedNumericDocValues;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Counter; import org.apache.lucene.util.Counter;
@@ -30,9 +31,9 @@ import org.apache.lucene.util.packed.PackedLongValues;
/** Buffers up pending long[] per doc, sorts, then flushes when segment flushes. */ /** Buffers up pending long[] per doc, sorts, then flushes when segment flushes. */
class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValues> { class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValues> {
private PackedLongValues.Builder pending; // stream of all values private final PackedLongValues.Builder pending; // stream of all values
private PackedLongValues.Builder pendingCounts; // count of values per doc private PackedLongValues.Builder pendingCounts; // count of values per doc
private DocsWithFieldSet docsWithField; private final DocsWithFieldSet docsWithField;
private final Counter iwBytesUsed; private final Counter iwBytesUsed;
private long bytesUsed; // this only tracks differences in 'pending' and 'pendingCounts' private long bytesUsed; // this only tracks differences in 'pending' and 'pendingCounts'
private final FieldInfo fieldInfo; private final FieldInfo fieldInfo;
@@ -47,11 +48,9 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
this.fieldInfo = fieldInfo; this.fieldInfo = fieldInfo;
this.iwBytesUsed = iwBytesUsed; this.iwBytesUsed = iwBytesUsed;
pending = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT); pending = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
pendingCounts = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
docsWithField = new DocsWithFieldSet(); docsWithField = new DocsWithFieldSet();
bytesUsed = bytesUsed =
pending.ramBytesUsed() pending.ramBytesUsed()
+ pendingCounts.ramBytesUsed()
+ docsWithField.ramBytesUsed() + docsWithField.ramBytesUsed()
+ RamUsageEstimator.sizeOf(currentValues); + RamUsageEstimator.sizeOf(currentValues);
iwBytesUsed.addAndGet(bytesUsed); iwBytesUsed.addAndGet(bytesUsed);
@@ -78,7 +77,15 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
pending.add(currentValues[i]); pending.add(currentValues[i]);
} }
// record the number of values for this doc // record the number of values for this doc
pendingCounts.add(currentUpto); if (pendingCounts != null) {
pendingCounts.add(currentUpto);
} else if (currentUpto != 1) {
pendingCounts = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
for (int i = 0; i < docsWithField.cardinality(); ++i) {
pendingCounts.add(1);
}
pendingCounts.add(currentUpto);
}
currentUpto = 0; currentUpto = 0;
docsWithField.add(currentDoc); docsWithField.add(currentDoc);
@@ -96,7 +103,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
private void updateBytesUsed() { private void updateBytesUsed() {
final long newBytesUsed = final long newBytesUsed =
pending.ramBytesUsed() pending.ramBytesUsed()
+ pendingCounts.ramBytesUsed() + (pendingCounts == null ? 0 : pendingCounts.ramBytesUsed())
+ docsWithField.ramBytesUsed() + docsWithField.ramBytesUsed()
+ RamUsageEstimator.sizeOf(currentValues); + RamUsageEstimator.sizeOf(currentValues);
iwBytesUsed.addAndGet(newBytesUsed - bytesUsed); iwBytesUsed.addAndGet(newBytesUsed - bytesUsed);
@@ -109,10 +116,9 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
assert finalValuesCount == null; assert finalValuesCount == null;
finishCurrentDoc(); finishCurrentDoc();
finalValues = pending.build(); finalValues = pending.build();
finalValuesCount = pendingCounts.build(); finalValuesCount = pendingCounts == null ? null : pendingCounts.build();
} }
return new BufferedSortedNumericDocValues( return getValues(finalValues, finalValuesCount, docsWithField);
finalValues, finalValuesCount, docsWithField.iterator());
} }
static final class LongValues { static final class LongValues {
@@ -144,6 +150,15 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
} }
} }
private SortedNumericDocValues getValues(
PackedLongValues values, PackedLongValues valueCounts, DocsWithFieldSet docsWithField) {
if (valueCounts == null) {
return DocValues.singleton(new BufferedNumericDocValues(values, docsWithField.iterator()));
} else {
return new BufferedSortedNumericDocValues(values, valueCounts, docsWithField.iterator());
}
}
@Override @Override
public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer dvConsumer) public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer dvConsumer)
throws IOException { throws IOException {
@@ -152,7 +167,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
if (finalValues == null) { if (finalValues == null) {
finishCurrentDoc(); finishCurrentDoc();
values = pending.build(); values = pending.build();
valueCounts = pendingCounts.build(); valueCounts = pendingCounts == null ? null : pendingCounts.build();
} else { } else {
values = finalValues; values = finalValues;
valueCounts = finalValuesCount; valueCounts = finalValuesCount;
@@ -164,7 +179,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
new LongValues( new LongValues(
state.segmentInfo.maxDoc(), state.segmentInfo.maxDoc(),
sortMap, sortMap,
new BufferedSortedNumericDocValues(values, valueCounts, docsWithField.iterator()), getValues(values, valueCounts, docsWithField),
PackedInts.FASTEST); PackedInts.FASTEST);
} else { } else {
sorted = null; sorted = null;
@@ -178,8 +193,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
if (fieldInfoIn != fieldInfo) { if (fieldInfoIn != fieldInfo) {
throw new IllegalArgumentException("wrong fieldInfo"); throw new IllegalArgumentException("wrong fieldInfo");
} }
final SortedNumericDocValues buf = final SortedNumericDocValues buf = getValues(values, valueCounts, docsWithField);
new BufferedSortedNumericDocValues(values, valueCounts, docsWithField.iterator());
if (sorted == null) { if (sorted == null) {
return buf; return buf;
} else { } else {

View File

@@ -23,6 +23,7 @@ import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.index.SortedDocValuesWriter.BufferedSortedDocValues;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.ByteBlockPool;
@@ -40,9 +41,9 @@ import org.apache.lucene.util.packed.PackedLongValues;
*/ */
class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> { class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
final BytesRefHash hash; final BytesRefHash hash;
private PackedLongValues.Builder pending; // stream of all termIDs private final PackedLongValues.Builder pending; // stream of all termIDs
private PackedLongValues.Builder pendingCounts; // termIDs per doc private PackedLongValues.Builder pendingCounts; // termIDs per doc
private DocsWithFieldSet docsWithField; private final DocsWithFieldSet docsWithField;
private final Counter iwBytesUsed; private final Counter iwBytesUsed;
private long bytesUsed; // this only tracks differences in 'pending' and 'pendingCounts' private long bytesUsed; // this only tracks differences in 'pending' and 'pendingCounts'
private final FieldInfo fieldInfo; private final FieldInfo fieldInfo;
@@ -65,11 +66,9 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
BytesRefHash.DEFAULT_CAPACITY, BytesRefHash.DEFAULT_CAPACITY,
new DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed)); new DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed));
pending = PackedLongValues.packedBuilder(PackedInts.COMPACT); pending = PackedLongValues.packedBuilder(PackedInts.COMPACT);
pendingCounts = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
docsWithField = new DocsWithFieldSet(); docsWithField = new DocsWithFieldSet();
bytesUsed = bytesUsed =
pending.ramBytesUsed() pending.ramBytesUsed()
+ pendingCounts.ramBytesUsed()
+ docsWithField.ramBytesUsed() + docsWithField.ramBytesUsed()
+ RamUsageEstimator.sizeOf(currentValues); + RamUsageEstimator.sizeOf(currentValues);
iwBytesUsed.addAndGet(bytesUsed); iwBytesUsed.addAndGet(bytesUsed);
@@ -116,7 +115,15 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
lastValue = termID; lastValue = termID;
} }
// record the number of unique term ids for this doc // record the number of unique term ids for this doc
pendingCounts.add(count); if (pendingCounts != null) {
pendingCounts.add(count);
} else if (count != 1) {
pendingCounts = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
for (int i = 0; i < docsWithField.cardinality(); ++i) {
pendingCounts.add(1);
}
pendingCounts.add(count);
}
maxCount = Math.max(maxCount, count); maxCount = Math.max(maxCount, count);
currentUpto = 0; currentUpto = 0;
docsWithField.add(currentDoc); docsWithField.add(currentDoc);
@@ -146,7 +153,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
private void updateBytesUsed() { private void updateBytesUsed() {
final long newBytesUsed = final long newBytesUsed =
pending.ramBytesUsed() pending.ramBytesUsed()
+ pendingCounts.ramBytesUsed() + (pendingCounts == null ? 0 : pendingCounts.ramBytesUsed())
+ docsWithField.ramBytesUsed() + docsWithField.ramBytesUsed()
+ RamUsageEstimator.sizeOf(currentValues); + RamUsageEstimator.sizeOf(currentValues);
iwBytesUsed.addAndGet(newBytesUsed - bytesUsed); iwBytesUsed.addAndGet(newBytesUsed - bytesUsed);
@@ -160,21 +167,32 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
finishCurrentDoc(); finishCurrentDoc();
int valueCount = hash.size(); int valueCount = hash.size();
finalOrds = pending.build(); finalOrds = pending.build();
finalOrdCounts = pendingCounts.build(); finalOrdCounts = pendingCounts == null ? null : pendingCounts.build();
finalSortedValues = hash.sort(); finalSortedValues = hash.sort();
finalOrdMap = new int[valueCount]; finalOrdMap = new int[valueCount];
} }
for (int ord = 0; ord < finalOrdMap.length; ord++) { for (int ord = 0; ord < finalOrdMap.length; ord++) {
finalOrdMap[finalSortedValues[ord]] = ord; finalOrdMap[finalSortedValues[ord]] = ord;
} }
return new BufferedSortedSetDocValues( return getValues(
finalSortedValues, finalSortedValues, finalOrdMap, hash, finalOrds, finalOrdCounts, maxCount, docsWithField);
finalOrdMap, }
hash,
finalOrds, private SortedSetDocValues getValues(
finalOrdCounts, int[] sortedValues,
maxCount, int[] ordMap,
docsWithField.iterator()); BytesRefHash hash,
PackedLongValues ords,
PackedLongValues ordCounts,
int maxCount,
DocsWithFieldSet docsWithField) {
if (ordCounts == null) {
return DocValues.singleton(
new BufferedSortedDocValues(hash, ords, sortedValues, ordMap, docsWithField.iterator()));
} else {
return new BufferedSortedSetDocValues(
sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField.iterator());
}
} }
@Override @Override
@@ -190,7 +208,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
assert finalOrdCounts == null && finalSortedValues == null && finalOrdMap == null; assert finalOrdCounts == null && finalSortedValues == null && finalOrdMap == null;
finishCurrentDoc(); finishCurrentDoc();
ords = pending.build(); ords = pending.build();
ordCounts = pendingCounts.build(); ordCounts = pendingCounts == null ? null : pendingCounts.build();
sortedValues = hash.sort(); sortedValues = hash.sort();
ordMap = new int[valueCount]; ordMap = new int[valueCount];
for (int ord = 0; ord < valueCount; ord++) { for (int ord = 0; ord < valueCount; ord++) {
@@ -209,8 +227,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
new DocOrds( new DocOrds(
state.segmentInfo.maxDoc(), state.segmentInfo.maxDoc(),
sortMap, sortMap,
new BufferedSortedSetDocValues( getValues(sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField),
sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField.iterator()),
PackedInts.FASTEST); PackedInts.FASTEST);
} else { } else {
docOrds = null; docOrds = null;
@@ -224,14 +241,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
throw new IllegalArgumentException("wrong fieldInfo"); throw new IllegalArgumentException("wrong fieldInfo");
} }
final SortedSetDocValues buf = final SortedSetDocValues buf =
new BufferedSortedSetDocValues( getValues(sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField);
sortedValues,
ordMap,
hash,
ords,
ordCounts,
maxCount,
docsWithField.iterator());
if (docOrds == null) { if (docOrds == null) {
return buf; return buf;
} else { } else {