LUCENE-9539: Remove caches from SortingCodecReader (#1909)

SortingCodecReader keeps all docvalues in memory that are loaded from this reader.
Yet, this reader should only be used for merging which happens sequentially. This makes
caching docvalues unnecessary.

Co-authored-by: Jim Ferenczi <jim.ferenczi@elastic.co>
This commit is contained in:
Simon Willnauer 2020-09-23 14:21:28 +02:00 committed by GitHub
parent fd0c08615d
commit 17c285d617
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 107 additions and 102 deletions

View File

@ -274,6 +274,13 @@ Documentation
* LUCENE-9424: Add a performance warning to AttributeSource.captureState javadocs (Haoyu Zhai) * LUCENE-9424: Add a performance warning to AttributeSource.captureState javadocs (Haoyu Zhai)
Changes in Runtime Behavior
---------------------
* LUCENE-9539: SortingCodecReader now doesn't cache doc values fields anymore. Previously, SortingCodecReader
used to cache all doc values fields after they were loaded into memory. This reader should only be used
to sort segments after the fact using IndexWriter#addIndices. (Simon Willnauer)
Other Other
--------------------- ---------------------

View File

@ -113,9 +113,9 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
if (finalLengths == null) { if (finalLengths == null) {
finalLengths = this.lengths.build(); finalLengths = this.lengths.build();
} }
final CachedBinaryDVs sorted; final BinaryDVs sorted;
if (sortMap != null) { if (sortMap != null) {
sorted = new CachedBinaryDVs(state.segmentInfo.maxDoc(), sortMap, sorted = new BinaryDVs(state.segmentInfo.maxDoc(), sortMap,
new BufferedBinaryDocValues(finalLengths, maxLength, bytes.getDataInput(), docsWithField.iterator())); new BufferedBinaryDocValues(finalLengths, maxLength, bytes.getDataInput(), docsWithField.iterator()));
} else { } else {
sorted = null; sorted = null;
@ -189,11 +189,11 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
} }
static class SortingBinaryDocValues extends BinaryDocValues { static class SortingBinaryDocValues extends BinaryDocValues {
private final CachedBinaryDVs dvs; private final BinaryDVs dvs;
private final BytesRefBuilder spare = new BytesRefBuilder(); private final BytesRefBuilder spare = new BytesRefBuilder();
private int docID = -1; private int docID = -1;
SortingBinaryDocValues(CachedBinaryDVs dvs) { SortingBinaryDocValues(BinaryDVs dvs) {
this.dvs = dvs; this.dvs = dvs;
} }
@ -235,10 +235,10 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
} }
} }
static final class CachedBinaryDVs { static final class BinaryDVs {
final int[] offsets; final int[] offsets;
final BytesRefArray values; final BytesRefArray values;
CachedBinaryDVs(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException { BinaryDVs(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException {
offsets = new int[maxDoc]; offsets = new int[maxDoc];
values = new BytesRefArray(Counter.newCounter()); values = new BytesRefArray(Counter.newCounter());
int offset = 1; // 0 means no values for this document int offset = 1; // 0 means no values for this document

View File

@ -70,7 +70,7 @@ class NormValuesWriter {
public void flush(SegmentWriteState state, Sorter.DocMap sortMap, NormsConsumer normsConsumer) throws IOException { public void flush(SegmentWriteState state, Sorter.DocMap sortMap, NormsConsumer normsConsumer) throws IOException {
final PackedLongValues values = pending.build(); final PackedLongValues values = pending.build();
final NumericDocValuesWriter.CachedNumericDVs sorted; final NumericDocValuesWriter.NumericDVs sorted;
if (sortMap != null) { if (sortMap != null) {
sorted = NumericDocValuesWriter.sortDocValues(state.segmentInfo.maxDoc(), sortMap, sorted = NumericDocValuesWriter.sortDocValues(state.segmentInfo.maxDoc(), sortMap,
new BufferedNorms(values, docsWithField.iterator())); new BufferedNorms(values, docsWithField.iterator()));

View File

@ -77,7 +77,7 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
return new BufferedNumericDocValues(finalValues, docsWithField.iterator()); return new BufferedNumericDocValues(finalValues, docsWithField.iterator());
} }
static CachedNumericDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, NumericDocValues oldDocValues) throws IOException { static NumericDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, NumericDocValues oldDocValues) throws IOException {
FixedBitSet docsWithField = new FixedBitSet(maxDoc); FixedBitSet docsWithField = new FixedBitSet(maxDoc);
long[] values = new long[maxDoc]; long[] values = new long[maxDoc];
while (true) { while (true) {
@ -89,7 +89,7 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
docsWithField.set(newDocID); docsWithField.set(newDocID);
values[newDocID] = oldDocValues.longValue(); values[newDocID] = oldDocValues.longValue();
} }
return new CachedNumericDVs(values, docsWithField); return new NumericDVs(values, docsWithField);
} }
@Override @Override
@ -97,7 +97,7 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
if (finalValues == null) { if (finalValues == null) {
finalValues = pending.build(); finalValues = pending.build();
} }
final CachedNumericDVs sorted; final NumericDVs sorted;
if (sortMap != null) { if (sortMap != null) {
NumericDocValues oldValues = new BufferedNumericDocValues(finalValues, docsWithField.iterator()); NumericDocValues oldValues = new BufferedNumericDocValues(finalValues, docsWithField.iterator());
sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap, oldValues); sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap, oldValues);
@ -169,11 +169,11 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
static class SortingNumericDocValues extends NumericDocValues { static class SortingNumericDocValues extends NumericDocValues {
private final CachedNumericDVs dvs; private final NumericDVs dvs;
private int docID = -1; private int docID = -1;
private long cost = -1; private long cost = -1;
SortingNumericDocValues(CachedNumericDVs dvs) { SortingNumericDocValues(NumericDVs dvs) {
this.dvs = dvs; this.dvs = dvs;
} }
@ -218,11 +218,11 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
} }
} }
static class CachedNumericDVs { static class NumericDVs {
private final long[] values; private final long[] values;
private final BitSet docsWithField; private final BitSet docsWithField;
CachedNumericDVs(long[] values, BitSet docsWithField) { NumericDVs(long[] values, BitSet docsWithField) {
this.values = values; this.values = values;
this.docsWithField = docsWithField; this.docsWithField = docsWithField;
} }

View File

@ -30,8 +30,10 @@ import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.TermVectorsReader; import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.search.Sort; import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOSupplier;
import org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PackedInts;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
@ -41,21 +43,13 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
* {@link Sort}. This can be used to re-sort and index after it's been created by wrapping all * {@link Sort}. This can be used to re-sort and index after it's been created by wrapping all
* readers of the index with this reader and adding it to a fresh IndexWriter via * readers of the index with this reader and adding it to a fresh IndexWriter via
* {@link IndexWriter#addIndexes(CodecReader...)}. * {@link IndexWriter#addIndexes(CodecReader...)}.
* NOTE: This reader should only be used for merging. Pulling fields from this reader might be very costly and memory
* intensive.
* *
* @lucene.experimental * @lucene.experimental
*/ */
public final class SortingCodecReader extends FilterCodecReader { public final class SortingCodecReader extends FilterCodecReader {
private final Map<String, NumericDocValuesWriter.CachedNumericDVs> cachedNumericDVs = new HashMap<>();
private final Map<String, BinaryDocValuesWriter.CachedBinaryDVs> cachedBinaryDVs = new HashMap<>();
private final Map<String, int[]> cachedSortedDVs = new HashMap<>();
private final Map<String, SortedSetDocValuesWriter.DocOrds> cachedSortedSetDVs = new HashMap<>();
private final Map<String, SortedNumericDocValuesWriter.LongValues> cachedSortedNumericDVs = new HashMap<>();
private static class SortingBits implements Bits { private static class SortingBits implements Bits {
private final Bits in; private final Bits in;
@ -148,10 +142,6 @@ public final class SortingCodecReader extends FilterCodecReader {
} }
} }
/** Return a sorted view of <code>reader</code> according to the order /** Return a sorted view of <code>reader</code> according to the order
* defined by <code>sort</code>. If the reader is already sorted, this * defined by <code>sort</code>. If the reader is already sorted, this
* method might return the reader as-is. */ * method might return the reader as-is. */
@ -313,15 +303,13 @@ public final class SortingCodecReader extends FilterCodecReader {
}; };
} }
private final Map<String, NumericDocValuesWriter.CachedNumericDVs> cachedNorms = new HashMap<>();
@Override @Override
public NormsProducer getNormsReader() { public NormsProducer getNormsReader() {
final NormsProducer delegate = in.getNormsReader(); final NormsProducer delegate = in.getNormsReader();
return new NormsProducer() { return new NormsProducer() {
@Override @Override
public NumericDocValues getNorms(FieldInfo field) throws IOException { public NumericDocValues getNorms(FieldInfo field) throws IOException {
return produceNumericDocValues(field, delegate.getNorms(field), cachedNorms); return new NumericDocValuesWriter.SortingNumericDocValues(getOrCreateNorms(field.name, () -> getNumericDocValues(delegate.getNorms(field))));
} }
@Override @Override
@ -341,101 +329,48 @@ public final class SortingCodecReader extends FilterCodecReader {
}; };
} }
private NumericDocValues produceNumericDocValues(FieldInfo field, NumericDocValues oldNorms,
Map<String, NumericDocValuesWriter.CachedNumericDVs> cachedNorms) throws IOException {
NumericDocValuesWriter.CachedNumericDVs norms;
synchronized (cachedNorms) {
norms = cachedNorms.get(field);
if (norms == null) {
FixedBitSet docsWithField = new FixedBitSet(maxDoc());
long[] values = new long[maxDoc()];
while (true) {
int docID = oldNorms.nextDoc();
if (docID == NO_MORE_DOCS) {
break;
}
int newDocID = docMap.oldToNew(docID);
docsWithField.set(newDocID);
values[newDocID] = oldNorms.longValue();
}
norms = new NumericDocValuesWriter.CachedNumericDVs(values, docsWithField);
cachedNorms.put(field.name, norms);
}
}
return new NumericDocValuesWriter.SortingNumericDocValues(norms);
}
@Override @Override
public DocValuesProducer getDocValuesReader() { public DocValuesProducer getDocValuesReader() {
final DocValuesProducer delegate = in.getDocValuesReader(); final DocValuesProducer delegate = in.getDocValuesReader();
return new DocValuesProducer() { return new DocValuesProducer() {
@Override @Override
public NumericDocValues getNumeric(FieldInfo field) throws IOException { public NumericDocValues getNumeric(FieldInfo field) throws IOException {
return produceNumericDocValues(field,delegate.getNumeric(field), cachedNumericDVs); return new NumericDocValuesWriter.SortingNumericDocValues(getOrCreateDV(field.name, () -> getNumericDocValues(delegate.getNumeric(field))));
} }
@Override @Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException { public BinaryDocValues getBinary(FieldInfo field) throws IOException {
final BinaryDocValues oldDocValues = delegate.getBinary(field); return new BinaryDocValuesWriter.SortingBinaryDocValues(getOrCreateDV(field.name,
BinaryDocValuesWriter.CachedBinaryDVs dvs; () -> new BinaryDocValuesWriter.BinaryDVs(maxDoc(), docMap, delegate.getBinary(field))));
synchronized (cachedBinaryDVs) {
dvs = cachedBinaryDVs.get(field);
if (dvs == null) {
dvs = new BinaryDocValuesWriter.CachedBinaryDVs(maxDoc(), docMap, oldDocValues);
cachedBinaryDVs.put(field.name, dvs);
}
}
return new BinaryDocValuesWriter.SortingBinaryDocValues(dvs);
} }
@Override @Override
public SortedDocValues getSorted(FieldInfo field) throws IOException { public SortedDocValues getSorted(FieldInfo field) throws IOException {
SortedDocValues oldDocValues = delegate.getSorted(field); SortedDocValues oldDocValues = delegate.getSorted(field);
int[] ords; return new SortedDocValuesWriter.SortingSortedDocValues(oldDocValues, getOrCreateDV(field.name, () -> {
synchronized (cachedSortedDVs) { int[] ords = new int[maxDoc()];
ords = cachedSortedDVs.get(field); Arrays.fill(ords, -1);
if (ords == null) { int docID;
ords = new int[maxDoc()]; while ((docID = oldDocValues.nextDoc()) != NO_MORE_DOCS) {
Arrays.fill(ords, -1); int newDocID = docMap.oldToNew(docID);
int docID; ords[newDocID] = oldDocValues.ordValue();
while ((docID = oldDocValues.nextDoc()) != NO_MORE_DOCS) {
int newDocID = docMap.oldToNew(docID);
ords[newDocID] = oldDocValues.ordValue();
}
cachedSortedDVs.put(field.name, ords);
} }
} return ords;
}));
return new SortedDocValuesWriter.SortingSortedDocValues(oldDocValues, ords);
} }
@Override @Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
final SortedNumericDocValues oldDocValues = delegate.getSortedNumeric(field); final SortedNumericDocValues oldDocValues = delegate.getSortedNumeric(field);
SortedNumericDocValuesWriter.LongValues values; return new SortedNumericDocValuesWriter.SortingSortedNumericDocValues(oldDocValues, getOrCreateDV(field.name, () ->
synchronized (cachedSortedNumericDVs) { new SortedNumericDocValuesWriter.LongValues(maxDoc(), docMap, oldDocValues, PackedInts.FAST)));
values = cachedSortedNumericDVs.get(field);
if (values == null) {
values = new SortedNumericDocValuesWriter.LongValues(maxDoc(), docMap, oldDocValues, PackedInts.FAST);
cachedSortedNumericDVs.put(field.name, values);
}
}
return new SortedNumericDocValuesWriter.SortingSortedNumericDocValues(oldDocValues, values);
} }
@Override @Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
SortedSetDocValues oldDocValues = delegate.getSortedSet(field); SortedSetDocValues oldDocValues = delegate.getSortedSet(field);
SortedSetDocValuesWriter.DocOrds ords; return new SortedSetDocValuesWriter.SortingSortedSetDocValues(oldDocValues, getOrCreateDV(field.name, () ->
synchronized (cachedSortedSetDVs) { new SortedSetDocValuesWriter.DocOrds(maxDoc(), docMap, oldDocValues, PackedInts.FAST)));
ords = cachedSortedSetDVs.get(field);
if (ords == null) {
ords = new SortedSetDocValuesWriter.DocOrds(maxDoc(), docMap, oldDocValues, PackedInts.FASTEST);
cachedSortedSetDVs.put(field.name, ords);
}
}
return new SortedSetDocValuesWriter.SortingSortedSetDocValues(oldDocValues, ords);
} }
@Override @Override
@ -455,6 +390,18 @@ public final class SortingCodecReader extends FilterCodecReader {
}; };
} }
private NumericDocValuesWriter.NumericDVs getNumericDocValues(NumericDocValues oldNumerics) throws IOException {
FixedBitSet docsWithField = new FixedBitSet(maxDoc());
long[] values = new long[maxDoc()];
int docID;
while ((docID = oldNumerics.nextDoc()) != NO_MORE_DOCS) {
int newDocID = docMap.oldToNew(docID);
docsWithField.set(newDocID);
values[newDocID] = oldNumerics.longValue();
}
return new NumericDocValuesWriter.NumericDVs(values, docsWithField);
}
@Override @Override
public TermVectorsReader getTermVectorsReader() { public TermVectorsReader getTermVectorsReader() {
return newTermVectorsReader(in.getTermVectorsReader()); return newTermVectorsReader(in.getTermVectorsReader());
@ -510,4 +457,51 @@ public final class SortingCodecReader extends FilterCodecReader {
return metaData; return metaData;
} }
// we try to cache the last used DV or Norms instance since during merge
// this instance is used more than once. We could in addition to this single instance
// also cache the fields that are used for sorting since we do the work twice for these fields
private String cachedField;
private Object cachedObject;
private boolean cacheIsNorms;
private <T> T getOrCreateNorms(String field, IOSupplier<T> supplier) throws IOException {
return getOrCreate(field, true, supplier);
}
@SuppressWarnings("unchecked")
private synchronized <T> T getOrCreate(String field, boolean norms, IOSupplier<T> supplier) throws IOException {
if ((field.equals(cachedField) && cacheIsNorms == norms) == false) {
assert assertCreatedOnlyOnce(field, norms);
cachedObject = supplier.get();
cachedField = field;
cacheIsNorms = norms;
}
assert cachedObject != null;
return (T) cachedObject;
}
private final Map<String, Integer> cacheStats = new HashMap<>(); // only with assertions enabled
private boolean assertCreatedOnlyOnce(String field, boolean norms) {
assert Thread.holdsLock(this);
// this is mainly there to make sure we change anything in the way we merge we realize it early
Integer timesCached = cacheStats.compute(field + "N:" + norms, (s, i) -> i == null ? 1 : i.intValue() + 1);
if (timesCached > 1) {
assert norms == false :"[" + field + "] norms must not be cached twice";
boolean isSortField = false;
for (SortField sf : metaData.getSort().getSort()) {
if (field.equals(sf.getField())) {
isSortField = true;
break;
}
}
assert timesCached == 2 : "[" + field + "] must not be cached more than twice but was cached: "
+ timesCached + " times isSortField: " + isSortField;
assert isSortField : "only sort fields should be cached twice but [" + field + "] is not a sort field";
}
return true;
}
private <T> T getOrCreateDV(String field, IOSupplier<T> supplier) throws IOException {
return getOrCreate(field, false, supplier);
}
} }

View File

@ -115,14 +115,18 @@ public class TestSortingCodecReader extends LuceneTestCase {
for (int i = 0; i < numDocs; i++) { for (int i = 0; i < numDocs; i++) {
int docId = docIds.get(i); int docId = docIds.get(i);
Document doc = new Document(); Document doc = new Document();
doc.add(new NumericDocValuesField("foo", random().nextInt(20)));
doc.add(new StringField("id", Integer.toString(docId), Field.Store.YES)); doc.add(new StringField("id", Integer.toString(docId), Field.Store.YES));
doc.add(new LongPoint("id", docId)); doc.add(new LongPoint("id", docId));
doc.add(new TextField("text_field", RandomStrings.randomRealisticUnicodeOfLength(random(), 25), Field.Store.YES)); String s = RandomStrings.randomRealisticUnicodeOfLength(random(), 25);
doc.add(new TextField("text_field", s, Field.Store.YES));
doc.add(new BinaryDocValuesField("text_field", new BytesRef(s)));
doc.add(new TextField("another_text_field", s, Field.Store.YES));
doc.add(new BinaryDocValuesField("another_text_field", new BytesRef(s)));
doc.add(new SortedNumericDocValuesField("sorted_numeric_dv", docId)); doc.add(new SortedNumericDocValuesField("sorted_numeric_dv", docId));
doc.add(new SortedDocValuesField("binary_sorted_dv", new BytesRef(Integer.toString(docId)))); doc.add(new SortedDocValuesField("binary_sorted_dv", new BytesRef(Integer.toString(docId))));
doc.add(new BinaryDocValuesField("binary_dv", new BytesRef(Integer.toString(docId)))); doc.add(new BinaryDocValuesField("binary_dv", new BytesRef(Integer.toString(docId))));
doc.add(new SortedSetDocValuesField("sorted_set_dv", new BytesRef(Integer.toString(docId)))); doc.add(new SortedSetDocValuesField("sorted_set_dv", new BytesRef(Integer.toString(docId))));
doc.add(new NumericDocValuesField("foo", random().nextInt(20)));
FieldType ft = new FieldType(StringField.TYPE_NOT_STORED); FieldType ft = new FieldType(StringField.TYPE_NOT_STORED);
ft.setStoreTermVectors(true); ft.setStoreTermVectors(true);