LUCENE-9539: Remove caches from SortingCodecReader (#1909)

SortingCodecReader keeps all docvalues in memory that are loaded from this reader.
Yet, this reader should only be used for merging which happens sequentially. This makes
caching docvalues unnecessary.

Co-authored-by: Jim Ferenczi <jim.ferenczi@elastic.co>
This commit is contained in:
Simon Willnauer 2020-09-23 14:21:28 +02:00 committed by GitHub
parent fd0c08615d
commit 17c285d617
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 107 additions and 102 deletions

View File

@ -274,6 +274,13 @@ Documentation
* LUCENE-9424: Add a performance warning to AttributeSource.captureState javadocs (Haoyu Zhai)
Changes in Runtime Behavior
---------------------
* LUCENE-9539: SortingCodecReader now doesn't cache doc values fields anymore. Previously, SortingCodecReader
used to cache all doc values fields after they were loaded into memory. This reader should only be used
to sort segments after the fact using IndexWriter#addIndices. (Simon Willnauer)
Other
---------------------

View File

@ -113,9 +113,9 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
if (finalLengths == null) {
finalLengths = this.lengths.build();
}
final CachedBinaryDVs sorted;
final BinaryDVs sorted;
if (sortMap != null) {
sorted = new CachedBinaryDVs(state.segmentInfo.maxDoc(), sortMap,
sorted = new BinaryDVs(state.segmentInfo.maxDoc(), sortMap,
new BufferedBinaryDocValues(finalLengths, maxLength, bytes.getDataInput(), docsWithField.iterator()));
} else {
sorted = null;
@ -189,11 +189,11 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
}
static class SortingBinaryDocValues extends BinaryDocValues {
private final CachedBinaryDVs dvs;
private final BinaryDVs dvs;
private final BytesRefBuilder spare = new BytesRefBuilder();
private int docID = -1;
SortingBinaryDocValues(CachedBinaryDVs dvs) {
SortingBinaryDocValues(BinaryDVs dvs) {
this.dvs = dvs;
}
@ -235,10 +235,10 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
}
}
static final class CachedBinaryDVs {
static final class BinaryDVs {
final int[] offsets;
final BytesRefArray values;
CachedBinaryDVs(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException {
BinaryDVs(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException {
offsets = new int[maxDoc];
values = new BytesRefArray(Counter.newCounter());
int offset = 1; // 0 means no values for this document

View File

@ -70,7 +70,7 @@ class NormValuesWriter {
public void flush(SegmentWriteState state, Sorter.DocMap sortMap, NormsConsumer normsConsumer) throws IOException {
final PackedLongValues values = pending.build();
final NumericDocValuesWriter.CachedNumericDVs sorted;
final NumericDocValuesWriter.NumericDVs sorted;
if (sortMap != null) {
sorted = NumericDocValuesWriter.sortDocValues(state.segmentInfo.maxDoc(), sortMap,
new BufferedNorms(values, docsWithField.iterator()));

View File

@ -77,7 +77,7 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
return new BufferedNumericDocValues(finalValues, docsWithField.iterator());
}
static CachedNumericDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, NumericDocValues oldDocValues) throws IOException {
static NumericDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, NumericDocValues oldDocValues) throws IOException {
FixedBitSet docsWithField = new FixedBitSet(maxDoc);
long[] values = new long[maxDoc];
while (true) {
@ -89,7 +89,7 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
docsWithField.set(newDocID);
values[newDocID] = oldDocValues.longValue();
}
return new CachedNumericDVs(values, docsWithField);
return new NumericDVs(values, docsWithField);
}
@Override
@ -97,7 +97,7 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
if (finalValues == null) {
finalValues = pending.build();
}
final CachedNumericDVs sorted;
final NumericDVs sorted;
if (sortMap != null) {
NumericDocValues oldValues = new BufferedNumericDocValues(finalValues, docsWithField.iterator());
sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap, oldValues);
@ -169,11 +169,11 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
static class SortingNumericDocValues extends NumericDocValues {
private final CachedNumericDVs dvs;
private final NumericDVs dvs;
private int docID = -1;
private long cost = -1;
SortingNumericDocValues(CachedNumericDVs dvs) {
SortingNumericDocValues(NumericDVs dvs) {
this.dvs = dvs;
}
@ -218,11 +218,11 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
}
}
static class CachedNumericDVs {
static class NumericDVs {
private final long[] values;
private final BitSet docsWithField;
CachedNumericDVs(long[] values, BitSet docsWithField) {
NumericDVs(long[] values, BitSet docsWithField) {
this.values = values;
this.docsWithField = docsWithField;
}

View File

@ -30,8 +30,10 @@ import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOSupplier;
import org.apache.lucene.util.packed.PackedInts;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
@ -41,21 +43,13 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
* {@link Sort}. This can be used to re-sort and index after it's been created by wrapping all
* readers of the index with this reader and adding it to a fresh IndexWriter via
* {@link IndexWriter#addIndexes(CodecReader...)}.
* NOTE: This reader should only be used for merging. Pulling fields from this reader might be very costly and memory
* intensive.
*
* @lucene.experimental
*/
public final class SortingCodecReader extends FilterCodecReader {
private final Map<String, NumericDocValuesWriter.CachedNumericDVs> cachedNumericDVs = new HashMap<>();
private final Map<String, BinaryDocValuesWriter.CachedBinaryDVs> cachedBinaryDVs = new HashMap<>();
private final Map<String, int[]> cachedSortedDVs = new HashMap<>();
private final Map<String, SortedSetDocValuesWriter.DocOrds> cachedSortedSetDVs = new HashMap<>();
private final Map<String, SortedNumericDocValuesWriter.LongValues> cachedSortedNumericDVs = new HashMap<>();
private static class SortingBits implements Bits {
private final Bits in;
@ -148,10 +142,6 @@ public final class SortingCodecReader extends FilterCodecReader {
}
}
/** Return a sorted view of <code>reader</code> according to the order
* defined by <code>sort</code>. If the reader is already sorted, this
* method might return the reader as-is. */
@ -313,15 +303,13 @@ public final class SortingCodecReader extends FilterCodecReader {
};
}
private final Map<String, NumericDocValuesWriter.CachedNumericDVs> cachedNorms = new HashMap<>();
@Override
public NormsProducer getNormsReader() {
final NormsProducer delegate = in.getNormsReader();
return new NormsProducer() {
@Override
public NumericDocValues getNorms(FieldInfo field) throws IOException {
return produceNumericDocValues(field, delegate.getNorms(field), cachedNorms);
return new NumericDocValuesWriter.SortingNumericDocValues(getOrCreateNorms(field.name, () -> getNumericDocValues(delegate.getNorms(field))));
}
@Override
@ -341,101 +329,48 @@ public final class SortingCodecReader extends FilterCodecReader {
};
}
private NumericDocValues produceNumericDocValues(FieldInfo field, NumericDocValues oldNorms,
Map<String, NumericDocValuesWriter.CachedNumericDVs> cachedNorms) throws IOException {
NumericDocValuesWriter.CachedNumericDVs norms;
synchronized (cachedNorms) {
norms = cachedNorms.get(field);
if (norms == null) {
FixedBitSet docsWithField = new FixedBitSet(maxDoc());
long[] values = new long[maxDoc()];
while (true) {
int docID = oldNorms.nextDoc();
if (docID == NO_MORE_DOCS) {
break;
}
int newDocID = docMap.oldToNew(docID);
docsWithField.set(newDocID);
values[newDocID] = oldNorms.longValue();
}
norms = new NumericDocValuesWriter.CachedNumericDVs(values, docsWithField);
cachedNorms.put(field.name, norms);
}
}
return new NumericDocValuesWriter.SortingNumericDocValues(norms);
}
@Override
public DocValuesProducer getDocValuesReader() {
final DocValuesProducer delegate = in.getDocValuesReader();
return new DocValuesProducer() {
@Override
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
return produceNumericDocValues(field,delegate.getNumeric(field), cachedNumericDVs);
return new NumericDocValuesWriter.SortingNumericDocValues(getOrCreateDV(field.name, () -> getNumericDocValues(delegate.getNumeric(field))));
}
@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
final BinaryDocValues oldDocValues = delegate.getBinary(field);
BinaryDocValuesWriter.CachedBinaryDVs dvs;
synchronized (cachedBinaryDVs) {
dvs = cachedBinaryDVs.get(field);
if (dvs == null) {
dvs = new BinaryDocValuesWriter.CachedBinaryDVs(maxDoc(), docMap, oldDocValues);
cachedBinaryDVs.put(field.name, dvs);
}
}
return new BinaryDocValuesWriter.SortingBinaryDocValues(dvs);
return new BinaryDocValuesWriter.SortingBinaryDocValues(getOrCreateDV(field.name,
() -> new BinaryDocValuesWriter.BinaryDVs(maxDoc(), docMap, delegate.getBinary(field))));
}
@Override
public SortedDocValues getSorted(FieldInfo field) throws IOException {
SortedDocValues oldDocValues = delegate.getSorted(field);
int[] ords;
synchronized (cachedSortedDVs) {
ords = cachedSortedDVs.get(field);
if (ords == null) {
ords = new int[maxDoc()];
Arrays.fill(ords, -1);
int docID;
while ((docID = oldDocValues.nextDoc()) != NO_MORE_DOCS) {
int newDocID = docMap.oldToNew(docID);
ords[newDocID] = oldDocValues.ordValue();
}
cachedSortedDVs.put(field.name, ords);
return new SortedDocValuesWriter.SortingSortedDocValues(oldDocValues, getOrCreateDV(field.name, () -> {
int[] ords = new int[maxDoc()];
Arrays.fill(ords, -1);
int docID;
while ((docID = oldDocValues.nextDoc()) != NO_MORE_DOCS) {
int newDocID = docMap.oldToNew(docID);
ords[newDocID] = oldDocValues.ordValue();
}
}
return new SortedDocValuesWriter.SortingSortedDocValues(oldDocValues, ords);
return ords;
}));
}
@Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
final SortedNumericDocValues oldDocValues = delegate.getSortedNumeric(field);
SortedNumericDocValuesWriter.LongValues values;
synchronized (cachedSortedNumericDVs) {
values = cachedSortedNumericDVs.get(field);
if (values == null) {
values = new SortedNumericDocValuesWriter.LongValues(maxDoc(), docMap, oldDocValues, PackedInts.FAST);
cachedSortedNumericDVs.put(field.name, values);
}
}
return new SortedNumericDocValuesWriter.SortingSortedNumericDocValues(oldDocValues, values);
return new SortedNumericDocValuesWriter.SortingSortedNumericDocValues(oldDocValues, getOrCreateDV(field.name, () ->
new SortedNumericDocValuesWriter.LongValues(maxDoc(), docMap, oldDocValues, PackedInts.FAST)));
}
@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
SortedSetDocValues oldDocValues = delegate.getSortedSet(field);
SortedSetDocValuesWriter.DocOrds ords;
synchronized (cachedSortedSetDVs) {
ords = cachedSortedSetDVs.get(field);
if (ords == null) {
ords = new SortedSetDocValuesWriter.DocOrds(maxDoc(), docMap, oldDocValues, PackedInts.FASTEST);
cachedSortedSetDVs.put(field.name, ords);
}
}
return new SortedSetDocValuesWriter.SortingSortedSetDocValues(oldDocValues, ords);
return new SortedSetDocValuesWriter.SortingSortedSetDocValues(oldDocValues, getOrCreateDV(field.name, () ->
new SortedSetDocValuesWriter.DocOrds(maxDoc(), docMap, oldDocValues, PackedInts.FAST)));
}
@Override
@ -455,6 +390,18 @@ public final class SortingCodecReader extends FilterCodecReader {
};
}
private NumericDocValuesWriter.NumericDVs getNumericDocValues(NumericDocValues oldNumerics) throws IOException {
FixedBitSet docsWithField = new FixedBitSet(maxDoc());
long[] values = new long[maxDoc()];
int docID;
while ((docID = oldNumerics.nextDoc()) != NO_MORE_DOCS) {
int newDocID = docMap.oldToNew(docID);
docsWithField.set(newDocID);
values[newDocID] = oldNumerics.longValue();
}
return new NumericDocValuesWriter.NumericDVs(values, docsWithField);
}
@Override
public TermVectorsReader getTermVectorsReader() {
return newTermVectorsReader(in.getTermVectorsReader());
@ -510,4 +457,51 @@ public final class SortingCodecReader extends FilterCodecReader {
return metaData;
}
// we try to cache the last used DV or Norms instance since during merge
// this instance is used more than once. We could in addition to this single instance
// also cache the fields that are used for sorting since we do the work twice for these fields
private String cachedField;
private Object cachedObject;
private boolean cacheIsNorms;
private <T> T getOrCreateNorms(String field, IOSupplier<T> supplier) throws IOException {
return getOrCreate(field, true, supplier);
}
@SuppressWarnings("unchecked")
private synchronized <T> T getOrCreate(String field, boolean norms, IOSupplier<T> supplier) throws IOException {
if ((field.equals(cachedField) && cacheIsNorms == norms) == false) {
assert assertCreatedOnlyOnce(field, norms);
cachedObject = supplier.get();
cachedField = field;
cacheIsNorms = norms;
}
assert cachedObject != null;
return (T) cachedObject;
}
private final Map<String, Integer> cacheStats = new HashMap<>(); // only with assertions enabled
private boolean assertCreatedOnlyOnce(String field, boolean norms) {
assert Thread.holdsLock(this);
// this is mainly there to make sure we change anything in the way we merge we realize it early
Integer timesCached = cacheStats.compute(field + "N:" + norms, (s, i) -> i == null ? 1 : i.intValue() + 1);
if (timesCached > 1) {
assert norms == false :"[" + field + "] norms must not be cached twice";
boolean isSortField = false;
for (SortField sf : metaData.getSort().getSort()) {
if (field.equals(sf.getField())) {
isSortField = true;
break;
}
}
assert timesCached == 2 : "[" + field + "] must not be cached more than twice but was cached: "
+ timesCached + " times isSortField: " + isSortField;
assert isSortField : "only sort fields should be cached twice but [" + field + "] is not a sort field";
}
return true;
}
private <T> T getOrCreateDV(String field, IOSupplier<T> supplier) throws IOException {
return getOrCreate(field, false, supplier);
}
}

View File

@ -115,14 +115,18 @@ public class TestSortingCodecReader extends LuceneTestCase {
for (int i = 0; i < numDocs; i++) {
int docId = docIds.get(i);
Document doc = new Document();
doc.add(new NumericDocValuesField("foo", random().nextInt(20)));
doc.add(new StringField("id", Integer.toString(docId), Field.Store.YES));
doc.add(new LongPoint("id", docId));
doc.add(new TextField("text_field", RandomStrings.randomRealisticUnicodeOfLength(random(), 25), Field.Store.YES));
String s = RandomStrings.randomRealisticUnicodeOfLength(random(), 25);
doc.add(new TextField("text_field", s, Field.Store.YES));
doc.add(new BinaryDocValuesField("text_field", new BytesRef(s)));
doc.add(new TextField("another_text_field", s, Field.Store.YES));
doc.add(new BinaryDocValuesField("another_text_field", new BytesRef(s)));
doc.add(new SortedNumericDocValuesField("sorted_numeric_dv", docId));
doc.add(new SortedDocValuesField("binary_sorted_dv", new BytesRef(Integer.toString(docId))));
doc.add(new BinaryDocValuesField("binary_dv", new BytesRef(Integer.toString(docId))));
doc.add(new SortedSetDocValuesField("sorted_set_dv", new BytesRef(Integer.toString(docId))));
doc.add(new NumericDocValuesField("foo", random().nextInt(20)));
FieldType ft = new FieldType(StringField.TYPE_NOT_STORED);
ft.setStoreTermVectors(true);